from .storers import FrameToGMQL
import pandas as pd
from .loaders import Loader
from .DataStructures import strand_aliases, stop_aliases, start_aliases, chr_aliases, id_sample_aliases
import numpy as np
from ..FileManagment import TempFileManager
# from ..ml.genometric_space import GenometricSpace
# dtypes accepted for each region column; check_type compares them against
# the ``dtype`` of the candidate column.
chr_types = [object, str]
# ``np.int`` used to be listed here as well: it was nothing more than an alias
# of the builtin ``int`` and it was removed in NumPy 1.24, where merely
# referencing it raises AttributeError and prevents this module from loading.
start_types = [int, np.int8, np.int16, np.int32, np.int64]
stop_types = start_types
strand_types = [object, str]
# a sample identifier can be either a string or an integer
id_sample_types = [object, str] + start_types
# index label given to every region when no sample column is provided
default_id_sample = "sample"
class GDataframe:
    """ Class holding the result of a materialization of a GMQLDataset.
    It is composed by two data structures:

        - A table with the *region* data
        - A table with the *metadata* corresponding to the regions
    """

    def __init__(self, regs=None, meta=None):
        """ Builds a GDataframe from a region table and/or a metadata table

        :param regs: (optional) a pandas Dataframe of regions. If None, an empty
                     dataframe having the same index of :attr:`~.meta` is used
        :param meta: (optional) a pandas Dataframe of metadata. If None, it is
                     generated from :attr:`~.regs` through empty_meta
        :raises ValueError: if both regs and meta are None
        :raises TypeError: if regs or meta is given but is not a pandas Dataframe
        """
        if (regs is None) and (meta is None):
            raise ValueError("At least one of meta or regs must not be None")
        if (regs is not None) and (not isinstance(regs, pd.DataFrame)):
            raise TypeError("regs: expected pandas Dataframe, got {}".format(type(regs)))
        if (meta is not None) and (not isinstance(meta, pd.DataFrame)):
            raise TypeError("meta: expected pandas Dataframe, got {}".format(type(meta)))
        if regs is None:
            # no regions: keep only the sample index coming from the metadata
            regs = pd.DataFrame(index=meta.index)
        if meta is None:
            meta = empty_meta(regs)
        self.regs = regs
        self.meta = meta

    # def to_genomic_space(self):
    #     """ Translates the GDataframe to the Genomic Space data structure
    #
    #     :return: a GenometricSpace object
    #     """
    #     return GenometricSpace.from_memory(self.regs, self.meta)

    def to_dataset_files(self, local_path=None, remote_path=None):
        """ Save the GDataframe to a local or remote location

        :param local_path: a local path to the folder in which the data must be saved
        :param remote_path: a remote dataset name that wants to be used for these data
        :return: None
        """
        return FrameToGMQL.to_dataset_files(self, path_local=local_path, path_remote=remote_path)

    def to_GMQLDataset(self, local_path=None, remote_path=None):
        """ Converts the GDataframe in a GMQLDataset for later local or remote computation

        :param local_path: (optional) local folder in which the data are saved
                           before being loaded
        :param remote_path: (optional) remote dataset name to which the data are saved
        :return: a GMQLDataset
        :raises NotImplementedError: if only remote_path is given; the data are
                                     saved before the exception is raised
        """
        local = local_path
        remote = remote_path
        if (local is None) and (remote is None):
            # nothing was specified: fall back to a temporary local folder
            local = TempFileManager.get_new_dataset_tmp_folder()
        self.to_dataset_files(local, remote)
        if local is not None:
            return Loader.load_from_path(local_path=local)
        elif remote is not None:
            raise NotImplementedError("The remote loading is not implemented yet!")

    def _normalize_metadata(self):
        """ Applies _normalize_column to every column of the metadata

        :return: a new GDataframe with the same regions and the normalized metadata
        """
        meta = self.meta.apply(_normalize_column)
        return GDataframe(regs=self.regs, meta=meta)

    def to_matrix(self, index_regs=None, index_meta=None,
                  columns_regs=None, columns_meta=None,
                  values_regs=None, values_meta=None, **kwargs):
        """ Transforms the GDataframe to a pivot matrix having as index and columns the
        ones specified. This function is a wrapper around the pivot_table function of Pandas.

        :param index_regs: list of region fields to use as index
        :param index_meta: list of metadata attributes to use as index
        :param columns_regs: list of region fields to use as columns
        :param columns_meta: list of metadata attributes to use as columns
        :param values_regs: list of region fields to use as values
        :param values_meta: list of metadata attributes to use as values
        :param kwargs: other parameters to pass to the pivot_table function
        :return: a Pandas dataframe having as index the union of index_regs and index_meta, as
                 columns the union of columns_regs and columns_meta and as values the union
                 of values_regs and values_meta
        """
        index_regs = index_regs if index_regs is not None else []
        index_meta = index_meta if index_meta is not None else []
        columns_regs = columns_regs if columns_regs is not None else []
        columns_meta = columns_meta if columns_meta is not None else []
        values_regs = values_regs if values_regs is not None else []
        values_meta = values_meta if values_meta is not None else []

        # Only the metadata attributes that are not already region fields
        # have to be projected on the regions.
        requested_meta = set(index_meta).union(columns_meta).union(values_meta)
        meta_to_project = list(requested_meta.difference(set(self.regs.columns)))
        # NOTE(review): project_meta is not defined in the part of the class
        # visible here -- confirm that it is provided elsewhere.
        res = self.project_meta(meta_to_project)

        pivot_columns = columns_meta + columns_regs
        pivot_index = index_meta + index_regs
        pivot_values = values_regs + values_meta
        return res.regs.pivot_table(index=pivot_index, columns=pivot_columns,
                                    values=pivot_values, **kwargs)
def _normalize_column(column):
    """ Unwraps the cells of a column when each of them holds at most one item

    :param column: a pandas Series whose cells support ``len`` and indexing
    :return: the very same Series if at least one cell holds more than one item;
             otherwise a new Series holding the first item of every cell
             (``np.nan`` for the empty cells)
    """
    sizes = column.map(len)
    if any(size > 1 for size in sizes):
        # at least one multi-valued cell: the column is left as it is
        return column

    def first_or_nan(cell):
        return cell[0] if len(cell) > 0 else np.nan

    return column.map(first_or_nan)
def from_pandas(regs, meta=None, chr_name=None, start_name=None, stop_name=None,
                strand_name=None, sample_name=None):
    """ Builds a GDataframe out of a pandas dataframe of regions and, optionally,
    a pandas dataframe of metadata

    :param regs: a pandas Dataframe of regions that is coherent with the GMQL data model
    :param meta: (optional) a pandas Dataframe of metadata that is coherent with the regions.
                 If it is not provided, it is generated from the regions
    :param chr_name: (optional) which column of :attr:`~.regs` is the chromosome
    :param start_name: (optional) which column of :attr:`~.regs` is the start
    :param stop_name: (optional) which column of :attr:`~.regs` is the stop
    :param strand_name: (optional) which column of :attr:`~.regs` is the strand
    :param sample_name: (optional) which column of :attr:`~.regs` represents the sample name
                        of that region. If nothing is provided, all the region will be put
                        in a single sample.
    :return: a GDataframe
    :raises ValueError: if the samples of meta are not the same of the regions
    """
    checked_regs = check_regs(regs, chr_name, start_name, stop_name,
                              strand_name, sample_name)
    gmql_regs = to_gmql_regions(checked_regs)

    if meta is None:
        meta = empty_meta(gmql_regs)
    elif not check_meta(meta, gmql_regs):
        raise ValueError("Error. Meta dataframe is not GMQL standard")

    return GDataframe(gmql_regs, meta)
def check_regs(region_df, chr_name=None, start_name=None, stop_name=None,
               strand_name=None, sample_name=None):
    """ Returns a region dataframe that is coherent with the GMQL data model.
    The dataframe given in input is not modified.

    :param region_df: a pandas Dataframe of regions
    :param chr_name: (optional) which column of :attr:`~.region_df` is the chromosome
    :param start_name: (optional) which column of :attr:`~.region_df` is the start
    :param stop_name: (optional) which column of :attr:`~.region_df` is the stop
    :param strand_name: (optional) which column of :attr:`~.region_df` is the strand
    :param sample_name: (optional) which column of :attr:`~.region_df` holds the sample
                        identifier. That column becomes the (sorted) index of the result.
                        If nothing is provided, every region gets the default sample
                        identifier as index
    :return: a modified pandas Dataframe
    :raises ValueError: if a mandatory column cannot be found (see search_column)
    :raises TypeError: if an explicitly named column has not an accepted type
    """
    if sample_name is None:
        # Re-index a shallow copy: assigning ``region_df.index`` directly would
        # silently overwrite the index of the dataframe owned by the caller.
        region_df = region_df.copy(deep=False)
        region_df.index = np.repeat(default_id_sample, len(region_df))
    else:
        region_df = search_column(region_df, id_sample_aliases,
                                  id_sample_types, 'id_sample', sample_name)
        region_df = region_df.set_index("id_sample", drop=True)
        region_df = region_df.sort_index()

    region_df = search_column(region_df, chr_aliases, chr_types, 'chr', chr_name)
    region_df = search_column(region_df, start_aliases, start_types, 'start', start_name)
    region_df = search_column(region_df, stop_aliases, stop_types, 'stop', stop_name)
    # the strand is optional: search_column does not raise when it is missing
    region_df = search_column(region_df, strand_aliases, strand_types, 'strand', strand_name)
    return region_df
def search_column(region_df, names, types, subs, name=None):
    """ Finds a column of the region dataframe and renames it to its standard name

    The look-up is case-insensitive, but the dataframe is always accessed
    through the real label of the column, so that columns such as ``Chr`` or
    ``START`` are handled correctly.

    :param region_df: a pandas Dataframe of regions
    :param names: list of aliases under which the column may appear
    :param types: list of types accepted for the column (see check_type)
    :param subs: the standard name to give to the column
    :param name: (optional) the column to use. If provided, the aliases are ignored
    :return: a pandas Dataframe with the column renamed to :attr:`~.subs`. The input
             dataframe itself is returned when an optional ('strand') column is missing
    :raises ValueError: if :attr:`~.name` is not a column of the dataframe, or if no
                        column matches the aliases and :attr:`~.subs` is not 'strand'
    :raises TypeError: if the column :attr:`~.name` has not one of the accepted types
    """
    if name is not None:
        if name in region_df.columns:
            # an exact match always wins over a case-insensitive one
            actual = name
        else:
            wanted = name.lower()
            actual = next((label for label in region_df.columns
                           if str(label).lower() == wanted), None)
            if actual is None:
                raise ValueError("{} is not a column of the region dataframe".format(name))
        if not check_type(region_df[actual], types):
            raise TypeError("Column {} is not of type {}.".format(name, types))
        return region_df.rename(columns={actual: subs})

    aliases = {str(alias).lower() for alias in names}
    for label in region_df.columns:
        # str() lets non-string labels (e.g. integers) be skipped instead of failing
        if str(label).lower() in aliases and check_type(region_df[label], types):
            return region_df.rename(columns={label: subs})

    # the strand is the only column that is allowed to be missing
    if subs != 'strand':
        raise ValueError("{} column was not found".format(subs))
    return region_df


def check_type(column, types):
    """ Tells whether the dtype of a column is one of the accepted types

    :param column: a pandas Series
    :param types: list of accepted types
    :return: True if ``column.dtype`` compares equal to one of the types
    """
    return column.dtype in types
def check_meta(meta_df, regs_df):
    """ Tells whether a metadata dataframe refers to the same samples of a region dataframe

    :param meta_df: a pandas Dataframe of metadata, indexed by sample
    :param regs_df: a pandas Dataframe of regions, indexed by sample
    :return: True if the distinct index values of the two dataframes coincide,
             False otherwise
    """
    samples_in_regs = set(regs_df.index.unique())
    samples_in_meta = set(meta_df.index.unique())
    return samples_in_regs == samples_in_meta
def empty_meta(regs):
    """ Builds a minimal metadata dataframe for a dataframe of regions

    :param regs: a pandas Dataframe of regions
    :return: a pandas Dataframe having one row for each distinct index value of
             :attr:`~.regs` and a single 'file' column, whose cells are
             one-element lists holding the index value as a string
    """
    samples = regs.index.unique()
    file_values = [[str(sample)] for sample in samples]
    result = pd.DataFrame(index=samples)
    result['file'] = pd.Series(data=file_values, index=samples)
    return result
def to_gmql_regions(regs):
    """ Reorders the columns of a region dataframe putting the coordinates first

    :param regs: a pandas Dataframe having at least the 'chr', 'start' and
                 'stop' columns
    :return: the dataframe with the columns ordered as 'chr', 'start', 'stop',
             'strand' (only if present), followed by all the other columns in
             their original order
    """
    leading = ['chr', 'start', 'stop']
    if 'strand' in regs.columns:
        leading = leading + ['strand']
    trailing = [label for label in regs.columns if label not in leading]
    return regs[leading + trailing]