Source code for gmql.dataset.GDataframe

from .storers import FrameToGMQL
import pandas as pd
from .loaders import Loader
from .DataStructures import strand_aliases, stop_aliases, start_aliases, chr_aliases, id_sample_aliases
import numpy as np
from ..FileManagment import TempFileManager
# from ..ml.genometric_space import GenometricSpace

chr_types = [object, str]
start_types = [int, np.int8, np.int16, np.int32, np.int64]
stop_types = start_types
strand_types = [object, str]
id_sample_types = [object, str] + start_types

default_id_sample = "sample"


class GDataframe:
    """ Class holding the result of a materialization of a GMQLDataset.
    It is composed of two data structures:

    - A table with the *region* data
    - A table with the *metadata* corresponding to the regions
    """

    def __init__(self, regs=None, meta=None):
        if (regs is None) and (meta is None):
            raise ValueError("At least one of meta or regs must not be None")
        if (regs is not None) and (not isinstance(regs, pd.DataFrame)):
            raise TypeError("regs: expected pandas DataFrame, got {}".format(type(regs)))
        if (meta is not None) and (not isinstance(meta, pd.DataFrame)):
            raise TypeError("meta: expected pandas DataFrame, got {}".format(type(meta)))
        if regs is None:
            index = meta.index
            regs = pd.DataFrame(index=index)
        if meta is None:
            meta = empty_meta(regs)
        self.regs = regs
        self.meta = meta

    # def to_genomic_space(self):
    #     """ Translates the GDataframe to the Genomic Space data structure
    #
    #     :return: a GenometricSpace object
    #     """
    #     return GenometricSpace.from_memory(self.regs, self.meta)
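    # Construction sketch (illustrative, not part of the original module): a
    # GDataframe can be built directly from two pandas DataFrames that share
    # the same sample index, with metadata values stored as lists. For raw
    # pandas data the module-level helper `from_pandas` below is usually more
    # convenient. The column names and values here are made up.
    #
    #   regs = pd.DataFrame({"chr": ["chr1"], "start": [100], "stop": [200]},
    #                       index=["s1"])
    #   meta = pd.DataFrame({"cell": [["HeLa"]]}, index=["s1"])
    #   gdf = GDataframe(regs=regs, meta=meta)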
    def to_dataset_files(self, local_path=None, remote_path=None):
        """ Saves the GDataframe to a local or remote location

        :param local_path: a local path to the folder in which the data must be saved
        :param remote_path: a remote dataset name to be used for these data
        :return: None
        """
        return FrameToGMQL.to_dataset_files(self, path_local=local_path, path_remote=remote_path)
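    # Usage sketch (illustrative, not part of the original module): persisting
    # a GDataframe to disk in the GMQL dataset format. The variable `gdf` and
    # the folder name are hypothetical.
    #
    #   gdf.to_dataset_files(local_path="./my_dataset")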
    def to_GMQLDataset(self, local_path=None, remote_path=None):
        """ Converts the GDataframe into a GMQLDataset for later local or remote computation

        :return: a GMQLDataset
        """
        local = None
        remote = None
        if (local_path is None) and (remote_path is None):
            # get a temporary path
            local = TempFileManager.get_new_dataset_tmp_folder()
        if local_path is not None:
            local = local_path
        if remote_path is not None:
            remote = remote_path
        self.to_dataset_files(local, remote)
        if local is not None:
            return Loader.load_from_path(local_path=local)
        elif remote is not None:
            raise NotImplementedError("The remote loading is not implemented yet!")
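    # Usage sketch (illustrative, not part of the original module): turning a
    # GDataframe back into a queryable GMQLDataset. With no arguments the data
    # are written to a temporary folder and reloaded from there; `gdf` is
    # hypothetical.
    #
    #   dataset = gdf.to_GMQLDataset()
    #   # ...continue composing GMQL operations on `dataset`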
    def _normalize_metadata(self):
        meta = self.meta
        meta = meta.apply(_normalize_column)
        return GDataframe(regs=self.regs, meta=meta)
    def project_meta(self, attributes):
        """ Projects the specified metadata attributes to new region fields

        :param attributes: a list of metadata attributes
        :return: a new GDataframe with additional region fields
        """
        if not isinstance(attributes, list):
            raise TypeError('attributes must be a list')
        meta_to_project = self.meta[attributes].applymap(lambda l: ", ".join(l))
        new_regs = self.regs.merge(meta_to_project, left_index=True, right_index=True)
        return GDataframe(regs=new_regs, meta=self.meta)
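    # Usage sketch (illustrative, not part of the original module): copying a
    # metadata attribute into the region table. The attribute name "cell" is
    # hypothetical; each metadata value is a list, so multiple values are
    # joined with ", ".
    #
    #   enriched = gdf.project_meta(["cell"])
    #   enriched.regs.head()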
    def to_matrix(self, index_regs=None, index_meta=None, columns_regs=None, columns_meta=None,
                  values_regs=None, values_meta=None, **kwargs):
        """ Transforms the GDataframe to a pivot matrix having as index and columns the ones specified.
        This function is a wrapper around the pivot_table function of Pandas.

        :param index_regs: list of region fields to use as index
        :param index_meta: list of metadata attributes to use as index
        :param columns_regs: list of region fields to use as columns
        :param columns_meta: list of metadata attributes to use as columns
        :param values_regs: list of region fields to use as values
        :param values_meta: list of metadata attributes to use as values
        :param kwargs: other parameters to pass to the pivot_table function
        :return: a Pandas dataframe having as index the union of index_regs and index_meta,
                 as columns the union of columns_regs and columns_meta and as values
                 the union of values_regs and values_meta
        """
        index_regs = index_regs if index_regs is not None else []
        index_meta = index_meta if index_meta is not None else []
        columns_regs = columns_regs if columns_regs is not None else []
        columns_meta = columns_meta if columns_meta is not None else []
        values_regs = values_regs if values_regs is not None else []
        values_meta = values_meta if values_meta is not None else []

        index_meta_s = set(index_meta)
        columns_meta_s = set(columns_meta)
        values_meta_s = set(values_meta)
        meta_to_project = list(index_meta_s.union(columns_meta_s)
                               .union(values_meta_s)
                               .difference(set(self.regs.columns)))
        res = self.project_meta(meta_to_project)
        pivot_columns = columns_meta + columns_regs
        pivot_index = index_meta + index_regs
        pivot_values = values_regs + values_meta
        return res.regs.pivot_table(index=pivot_index, columns=pivot_columns,
                                    values=pivot_values, **kwargs)
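    # Usage sketch (illustrative, not part of the original module): building a
    # region-by-sample matrix. The region field "pvalue" and the metadata
    # attribute "cell" are hypothetical; extra keyword arguments such as
    # aggfunc or fill_value are forwarded to pandas.pivot_table.
    #
    #   matrix = gdf.to_matrix(index_regs=["chr", "start", "stop"],
    #                          columns_meta=["cell"],
    #                          values_regs=["pvalue"],
    #                          fill_value=0)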
def _normalize_column(column):
    lengths = column.map(len)
    if len(list(filter(lambda x: x > 1, lengths))) == 0:
        new_column = column.map(lambda x: x[0] if len(x) > 0 else np.nan)
    else:
        new_column = column
    return new_column


def from_pandas(regs, meta=None, chr_name=None, start_name=None, stop_name=None,
                strand_name=None, sample_name=None):
    """ Creates a GDataframe from a pandas dataframe of regions and a pandas dataframe of metadata

    :param regs: a pandas DataFrame of regions that is coherent with the GMQL data model
    :param meta: (optional) a pandas DataFrame of metadata that is coherent with the regions
    :param chr_name: (optional) which column of :attr:`~.regs` is the chromosome
    :param start_name: (optional) which column of :attr:`~.regs` is the start
    :param stop_name: (optional) which column of :attr:`~.regs` is the stop
    :param strand_name: (optional) which column of :attr:`~.regs` is the strand
    :param sample_name: (optional) which column of :attr:`~.regs` represents the sample name of
                        that region. If nothing is provided, all the regions are put in a single sample.
    :return: a GDataframe
    """
    regs = check_regs(regs, chr_name, start_name, stop_name, strand_name, sample_name)
    regs = to_gmql_regions(regs)
    if meta is not None:
        if not check_meta(meta, regs):
            raise ValueError("Error. Meta dataframe is not GMQL standard")
    else:
        meta = empty_meta(regs)
    return GDataframe(regs, meta)


def check_regs(region_df, chr_name=None, start_name=None, stop_name=None,
               strand_name=None, sample_name=None):
    """ Modifies a region dataframe to be coherent with the GMQL data model

    :param region_df: a pandas DataFrame of regions that is coherent with the GMQL data model
    :param chr_name: (optional) which column of :attr:`~.region_df` is the chromosome
    :param start_name: (optional) which column of :attr:`~.region_df` is the start
    :param stop_name: (optional) which column of :attr:`~.region_df` is the stop
    :param strand_name: (optional) which column of :attr:`~.region_df` is the strand
    :param sample_name: (optional) which column of :attr:`~.region_df` is the sample identifier
    :return: a modified pandas DataFrame
    """
    if sample_name is None:
        region_df.index = np.repeat(default_id_sample, len(region_df))
    else:
        region_df = search_column(region_df, id_sample_aliases, id_sample_types, 'id_sample', sample_name)
        region_df = region_df.set_index("id_sample", drop=True)
        region_df = region_df.sort_index()
    region_df = search_column(region_df, chr_aliases, chr_types, 'chr', chr_name)
    region_df = search_column(region_df, start_aliases, start_types, 'start', start_name)
    region_df = search_column(region_df, stop_aliases, stop_types, 'stop', stop_name)
    region_df = search_column(region_df, strand_aliases, strand_types, 'strand', strand_name)
    return region_df


def search_column(region_df, names, types, subs, name=None):
    columns = region_df.columns.map(str.lower)
    names = list(map(str.lower, names))
    if name is not None:
        if name.lower() not in columns:
            raise ValueError("{} is not a column of the region dataframe".format(name))
        if check_type(region_df[name], types):
            region_df = region_df.rename(columns={name: subs})
            return region_df
        else:
            raise TypeError("Column {} is not of type {}.".format(name, types))
    isok = False
    for e in columns:
        if e in names and check_type(region_df[e], types):
            region_df = region_df.rename(columns={e: subs})
            isok = True
            break
    if (not isok) and (subs != 'strand'):
        raise ValueError("{} column was not found".format(subs))
    return region_df


def check_type(column, types):
    return column.dtype in types


def check_meta(meta_df, regs_df):
    reg_index = regs_df.index
    reg_samples = set(reg_index.unique())
    meta_index = meta_df.index
    meta_samples = set(meta_index.unique())
    if reg_samples != meta_samples:
        return False
    else:
        return True


def empty_meta(regs):
    index = regs.index.unique()
    df = pd.DataFrame(index=index)
    df['file'] = pd.Series(index=index, data=list(map(lambda x: [str(x)], index)))
    return df


def to_gmql_regions(regs):
    cols = ['chr', 'start', 'stop']
    if 'strand' in regs.columns:
        cols.append('strand')
    cols.extend([c for c in regs.columns if c not in cols])
    return regs[cols]
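# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module): builds
# a small region DataFrame in memory and converts it into a GDataframe with
# `from_pandas`. The column names and values are made up for the example; run
# it with `python -m gmql.dataset.GDataframe` so the relative imports resolve.
if __name__ == "__main__":
    example_regs = pd.DataFrame({
        "chr": ["chr1", "chr1", "chr2"],
        "start": [100, 500, 300],
        "stop": [200, 600, 400],
        "strand": ["+", "-", "+"],
        "score": [1.0, 2.5, 3.0],
        "sample": ["s1", "s1", "s2"],   # which sample each region belongs to
    })
    gdf = from_pandas(example_regs, sample_name="sample")
    print(gdf.regs)   # regions indexed by sample id, in GMQL column order
    print(gdf.meta)   # auto-generated metadata (one 'file' attribute per sample)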