Source code for gmql.dataset.GMQLDataset

# -*- coding: utf-8 -*-
from ..managers import get_python_manager, get_remote_manager, get_source_table
from ..settings import get_mode
from ..scala_wrapper import none, Some
from .loaders import MetaLoaderFile, Materializations
from .DataStructures.RegField import RegField
from .DataStructures.MetaField import MetaField
from .DataStructures.Aggregates import *
from .DataStructures.GenometricPredicates import GenometricCondition
from .DataStructures import reg_fixed_fileds
from . import GDataframe
from .loaders import MemoryLoader, MetadataProfiler
from ..FileManagment.TempFileManager import get_unique_identifier, get_new_dataset_tmp_folder
from .loaders.Sources import PARSER, LOCAL, REMOTE
from .storers.parserToXML import parserToXML
import os


class GMQLDataset(object):
    """
    The main abstraction of the library. A GMQLDataset represents a genomic dataset in the
    GMQL standard and is divided into region data and metadata. Each function that can be
    applied to a GMQLDataset affects one of these two parts, or both. For every operator
    function we provide the documentation, some examples, and the GMQL operator that the
    function wraps.
    """

    def __init__(self, parser=None, index=None, location="local", path_or_name=None,
                 local_sources=None, remote_sources=None, meta_profile=None):
        self.parser = parser
        self.__index = index
        self.location = location
        self.meta_profile = None
        if isinstance(meta_profile, MetadataProfiler.MetadataProfile):
            self.meta_profile = meta_profile
        elif meta_profile is not None:
            raise TypeError("meta_profile must be a MetadataProfile")
        self.path_or_name = path_or_name
        self.pmg = get_python_manager()
        self.opmng = self.pmg.getOperatorManager()

        # provenance
        if isinstance(local_sources, list):
            self._local_sources = local_sources
        else:
            self._local_sources = []
        if isinstance(remote_sources, list):
            self._remote_sources = remote_sources
        else:
            self._remote_sources = []

        # get the schema of the dataset
        schemaJava = self.pmg.getVariableSchemaNames(self.__index)
        self.schema = []
        for field in schemaJava:
            self.schema.append(field)
        self.schema.extend(reg_fixed_fileds)

        # set the schema fields as attributes of the dataset
        for field in self.schema:
            if field not in dir(self):
                self.__setattr__(field, self.RegField(field))
            else:
                self.__setattr__(field.upper(), self.RegField(field))
        # add also left and right
        self.left = self.RegField("left")
        self.right = self.RegField("right")

    def __getitem__(self, arg):
        if isinstance(arg, tuple) and len(arg) == 2:
            item, t = arg
            return self.MetaField(name=item, t=t)
        elif isinstance(arg, str):
            return self.MetaField(name=arg)
        elif isinstance(arg, MetaField):
            return self.meta_select(predicate=arg)
        else:
            raise ValueError("Input must be a string or a MetaField. "
                             "{} was found".format(type(arg)))

    def __show_info(self):
        print("GMQLDataset")
        print("\tParser:\t{}".format(self.parser.get_parser_name()))
        print("\tIndex:\t{}".format(self.__index))

    # def get_metadata(self):
    #     """ Returns the metadata related to the current GMQLDataset. This function can be
    #     used only when a local dataset is loaded using
    #     :meth:`~gmql.dataset.loaders.Loader.load_from_path` and no other operation has
    #     been called on the GMQLDataset.
    #
    #     The metadata are returned in the form of a Pandas Dataframe having the sample ids
    #     as index and the metadata attributes as columns.
    #
    #     :return: a Pandas Dataframe
    #     """
    #     if self.path_or_name is None:
    #         raise ValueError("You cannot explore the metadata of an intermediate query. "
    #                          "You can get metadata only after a load_from_local or "
    #                          "load_from_remote")
    #     if self.location == 'local':
    #         return self.__get_metadata_local()
    #     elif self.location == 'remote':
    #         return self.__get_metadata_remote()
    #
    # def __get_metadata_local(self):
    #     meta = MetaLoaderFile.load_meta_from_path(self.path_or_name)
    #     return meta

    def __get_metadata_remote(self):
        pass
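    # Usage sketch (not part of the library source): the indexing shortcuts defined in
    # __getitem__ above can be exercised as follows. The dataset name and the 'antibody'
    # attribute come from the examples used throughout this module.
    #
    #   import gmql as gl
    #   d = gl.get_example_dataset("Example_Dataset_1")
    #   ab = d['antibody']                    # a MetaField, same as d.MetaField('antibody')
    #   ab_typed = d['antibody', 'string']    # a typed MetaField, needed in reg_project
    #   ctcf = d[d['antibody'] == "CTCF"]     # shortcut for d.meta_select(...)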
    def get_reg_attributes(self):
        """
        Returns the region fields of the dataset

        :return: a list of field names
        """
        return self.schema
    def MetaField(self, name, t=None):
        """
        Creates an instance of a metadata field of the dataset. It can be used in building
        expressions or conditions for projection or selection. Notice that this function is
        equivalent to calling::

            dataset["name"]

        If the MetaField is used in a region projection (:meth:`~.reg_project`), the user
        must also specify the type of the selected metadata attribute::

            dataset.reg_project(new_field_dict={'new_field': dataset['name', 'string']})

        :param name: the name of the metadata attribute that is considered
        :param t: the type of the metadata attribute {string, int, boolean, double}
        :return: a MetaField instance
        """
        return MetaField(name=name, index=self.__index, t=t)
    def RegField(self, name):
        """
        Creates an instance of a region field of the dataset. It can be used in building
        expressions or conditions for projection or selection. Notice that this function is
        equivalent to::

            dataset.name

        :param name: the name of the region field that is considered
        :return: a RegField instance
        """
        return RegField(name=name, index=self.__index)
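    # Usage sketch (not part of the library source): RegField and MetaField instances are
    # the building blocks of selection predicates. The field and attribute names below are
    # the illustrative ones used in the docstring examples of this module.
    #
    #   region_pred = (d.chr == 'chr6') & (d.pValue < 0.9)   # predicate on region fields
    #   meta_pred = d['antibody'] == 'CTCF'                  # predicate on metadata
    #   result = d.select(meta_predicate=meta_pred, region_predicate=region_pred)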
    def select(self, meta_predicate=None, region_predicate=None,
               semiJoinDataset=None, semiJoinMeta=None):
        """ *Wrapper of* ``SELECT``

        Selection operation. Enables filtering datasets on the basis of region features or
        metadata attributes. In addition, it is possible to perform a selection based on the
        existence of certain metadata :attr:`~.semiJoinMeta` attributes and the matching of
        their values with those associated with at least one sample in an external dataset
        :attr:`~.semiJoinDataset`.

        Therefore, the selection can be based on:

        * *Metadata predicates*: selection based on the existence and values of certain
          metadata attributes in each sample. Attribute-value conditions can be composed
          using the logical predicates & (and), | (or) and ~ (not)
        * *Region predicates*: selection based on region attributes. Conditions can be
          composed using the logical predicates & (and), | (or) and ~ (not)
        * *SemiJoin clauses*: selection based on the existence of certain metadata
          :attr:`~.semiJoinMeta` attributes and the matching of their values with those
          associated with at least one sample in an external dataset
          :attr:`~.semiJoinDataset`

        In the following example we select all the samples of Example_Dataset_1 regarding
        antibody CTCF. From these samples we keep only the regions on chromosome 6.
        Finally, we keep only the samples which have a matching antibody_targetClass in
        Example_Dataset_2::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")
            d2 = gl.get_example_dataset("Example_Dataset_2")

            d_select = d1.select(meta_predicate=d1['antibody'] == "CTCF",
                                 region_predicate=d1.chr == "chr6",
                                 semiJoinDataset=d2,
                                 semiJoinMeta=["antibody_targetClass"])

        :param meta_predicate: logical predicate on the metadata <attribute, value> pairs
        :param region_predicate: logical predicate on the region feature values
        :param semiJoinDataset: another GMQLDataset
        :param semiJoinMeta: a list of metadata attributes (strings)
        :return: a new GMQLDataset
        """
        semiJoinDataset_exists = False
        if isinstance(meta_predicate, MetaField):
            meta_condition = Some(meta_predicate.getMetaCondition())
        elif meta_predicate is None:
            meta_condition = none()
        else:
            raise TypeError("meta_predicate must be a MetaField or None. "
                            "{} was provided".format(type(meta_predicate)))
        if isinstance(region_predicate, RegField):
            region_condition = Some(region_predicate.getRegionCondition())
        elif region_predicate is None:
            region_condition = none()
        else:
            raise TypeError("region_predicate must be a RegField or None. "
                            "{} was provided".format(type(region_predicate)))

        new_location = self.location
        new_local_sources, new_remote_sources = self._local_sources, self._remote_sources
        if isinstance(semiJoinDataset, GMQLDataset):
            other_dataset = Some(semiJoinDataset.__index)
            semiJoinDataset_exists = True
        elif semiJoinDataset is None:
            other_dataset = none()
        else:
            raise TypeError("semiJoinDataset must be a GMQLDataset or None. "
                            "{} was provided".format(type(semiJoinDataset)))

        if isinstance(semiJoinMeta, list) and \
                all([isinstance(x, str) for x in semiJoinMeta]):
            if semiJoinDataset_exists:
                semi_join = Some(self.opmng.getMetaJoinCondition(semiJoinMeta))
                new_local_sources, new_remote_sources = \
                    self.__combine_sources(self, semiJoinDataset)
                new_location = self.__combine_locations(self, semiJoinDataset)
            else:
                raise ValueError("semiJoinDataset and semiJoinMeta must be either both "
                                 "present or both absent")
        elif semiJoinMeta is None:
            semi_join = none()
        else:
            raise TypeError("semiJoinMeta must be a list of strings or None. "
                            "{} was provided".format(type(semiJoinMeta)))

        new_index = self.opmng.select(self.__index, other_dataset, semi_join,
                                      meta_condition, region_condition)
        return GMQLDataset(index=new_index, location=new_location,
                           local_sources=new_local_sources,
                           remote_sources=new_remote_sources,
                           meta_profile=self.meta_profile)
    def meta_select(self, predicate=None, semiJoinDataset=None, semiJoinMeta=None):
        """ *Wrapper of* ``SELECT``

        Wrapper of the :meth:`~.select` function filtering samples only on the basis of
        metadata.

        :param predicate: logical predicate on the values of the metadata
        :param semiJoinDataset: another GMQLDataset
        :param semiJoinMeta: a list of metadata attributes
        :return: a new GMQLDataset

        Example 1::

            output_dataset = dataset.meta_select(dataset['patient_age'] < 70)
            # This statement can also be written as
            output_dataset = dataset[dataset['patient_age'] < 70]

        Example 2::

            output_dataset = dataset.meta_select(
                (dataset['tissue_status'] == 'tumoral') &
                (dataset['tumor_tag'] != 'gbm') | (dataset['tumor_tag'] == 'brca'))
            # This statement can also be written as
            output_dataset = dataset[(dataset['tissue_status'] == 'tumoral') &
                                     (dataset['tumor_tag'] != 'gbm') |
                                     (dataset['tumor_tag'] == 'brca')]

        Example 3::

            JUN_POLR2A_TF = HG19_ENCODE_NARROW.meta_select(
                HG19_ENCODE_NARROW['antibody_target'] == 'JUN',
                semiJoinDataset=POLR2A_TF, semiJoinMeta=['cell'])

        The meta selection predicate can use all the classical equalities and disequalities
        {>, <, >=, <=, ==, !=}, and predicates can be connected by the classical logical
        symbols {& (AND), | (OR), ~ (NOT)} plus the *isin* function.
        """
        return self.select(meta_predicate=predicate,
                           semiJoinDataset=semiJoinDataset,
                           semiJoinMeta=semiJoinMeta)
    def reg_select(self, predicate=None, semiJoinDataset=None, semiJoinMeta=None):
        """ *Wrapper of* ``SELECT``

        Wrapper of the :meth:`~.select` function filtering regions only on the basis of
        region attributes.

        :param predicate: logical predicate on the values of the regions
        :param semiJoinDataset: another GMQLDataset
        :param semiJoinMeta: a list of metadata attributes
        :return: a new GMQLDataset

        An example of usage::

            new_dataset = dataset.reg_select((dataset.chr == 'chr1') |
                                             (dataset.pValue < 0.9))

        You can also use metadata attributes in the selection::

            new_dataset = dataset.reg_select(dataset.score > dataset['size'])

        This statement selects all the regions whose field score is strictly higher than
        the sample metadata attribute size.

        The region selection predicate can use all the classical equalities and
        disequalities {>, <, >=, <=, ==, !=}, and predicates can be connected by the
        classical logical symbols {& (AND), | (OR), ~ (NOT)} plus the *isin* function.
        To be sure about the correctness of the expression, please use parentheses to
        delimit the various predicates.
        """
        return self.select(region_predicate=predicate,
                           semiJoinMeta=semiJoinMeta,
                           semiJoinDataset=semiJoinDataset)
    def project(self, projected_meta=None, new_attr_dict=None, all_but_meta=None,
                projected_regs=None, new_field_dict=None, all_but_regs=None):
        """ *Wrapper of* ``PROJECT``

        The PROJECT operator creates, from an existing dataset, a new dataset with all the
        samples (with their regions and region values) of the input one, but keeping for
        each sample only those metadata and/or region attributes listed in the operator
        parameters. Region coordinates and the values of the remaining metadata and region
        attributes stay equal to those in the input dataset. Differently from the SELECT
        operator, PROJECT allows to:

        * Remove existing metadata and/or region attributes from a dataset;
        * Create new metadata and/or region attributes to be added to the result.

        :param projected_meta: list of metadata attributes to project on
        :param new_attr_dict: an optional dictionary of the form {'new_meta_1': function1,
               'new_meta_2': function2, ...} in which every function computes the new
               metadata attribute based on the values of the others
        :param all_but_meta: list of metadata attributes that must be excluded from the
               projection
        :param projected_regs: list of the region fields to select
        :param new_field_dict: an optional dictionary of the form {'new_field_1': function1,
               'new_field_2': function2, ...} in which every function computes the new
               region field based on the values of the others
        :param all_but_regs: list of region fields that must be excluded from the projection
        :return: a new GMQLDataset
        """
        projected_meta_exists = False
        if isinstance(projected_meta, list) and \
                all([isinstance(x, str) for x in projected_meta]):
            projected_meta = Some(projected_meta)
            projected_meta_exists = True
        elif projected_meta is None:
            projected_meta = none()
        else:
            raise TypeError("projected_meta must be a list of strings or None. "
                            "{} was provided".format(type(projected_meta)))

        if isinstance(new_attr_dict, dict):
            meta_ext = []
            expBuild = self.pmg.getNewExpressionBuilder(self.__index)
            for k in new_attr_dict.keys():
                item = new_attr_dict[k]
                if isinstance(k, str):
                    if isinstance(item, MetaField):
                        me = expBuild.createMetaExtension(k, item.getMetaExpression())
                    elif isinstance(item, int):
                        me = expBuild.createMetaExtension(k, expBuild.getMEType("int", str(item)))
                    elif isinstance(item, str):
                        me = expBuild.createMetaExtension(k, expBuild.getMEType("string", item))
                    elif isinstance(item, float):
                        me = expBuild.createMetaExtension(k, expBuild.getMEType("float", str(item)))
                    else:
                        raise TypeError("Type {} of item of new_attr_dict is not valid"
                                        .format(type(item)))
                    meta_ext.append(me)
                else:
                    raise TypeError("The key of new_attr_dict must be a string. "
                                    "{} was provided".format(type(k)))
            meta_ext = Some(meta_ext)
        elif new_attr_dict is None:
            meta_ext = none()
        else:
            raise TypeError("new_attr_dict must be a dictionary. "
                            "{} was provided".format(type(new_attr_dict)))

        if isinstance(all_but_meta, list) and \
                all([isinstance(x, str) for x in all_but_meta]):
            if not projected_meta_exists:
                all_but_meta = Some(all_but_meta)
                all_but_value = True
            else:
                raise ValueError("all_but_meta and projected_meta are mutually exclusive")
        elif all_but_meta is None:
            all_but_meta = none()
            all_but_value = False
        else:
            raise TypeError("all_but_meta must be a list of strings. "
                            "{} was provided".format(type(all_but_meta)))
        projected_meta = all_but_meta if all_but_value else projected_meta

        projected_regs_exists = False
        if isinstance(projected_regs, list) and \
                all([isinstance(x, str) for x in projected_regs]):
            projected_regs = Some(projected_regs)
            projected_regs_exists = True
        elif projected_regs is None:
            projected_regs = none()
        else:
            raise TypeError("projected_regs must be a list of strings or None. "
                            "{} was provided".format(type(projected_regs)))

        if isinstance(new_field_dict, dict):
            regs_ext = []
            expBuild = self.pmg.getNewExpressionBuilder(self.__index)
            for k in new_field_dict.keys():
                item = new_field_dict[k]
                if isinstance(k, str):
                    if isinstance(item, RegField):
                        re = expBuild.createRegionExtension(k, item.getRegionExpression())
                    elif isinstance(item, MetaField):
                        re = expBuild.createRegionExtension(k, item.reMetaNode)
                    elif isinstance(item, int):
                        # integer constants are propagated as floats to the region schema
                        re = expBuild.createRegionExtension(k, expBuild.getREType("float", str(item)))
                    elif isinstance(item, str):
                        re = expBuild.createRegionExtension(k, expBuild.getREType("string", item))
                    elif isinstance(item, float):
                        re = expBuild.createRegionExtension(k, expBuild.getREType("float", str(item)))
                    else:
                        raise TypeError("Type {} of item of new_field_dict is not valid"
                                        .format(type(item)))
                    regs_ext.append(re)
                else:
                    raise TypeError("The key of new_field_dict must be a string. "
                                    "{} was provided".format(type(k)))
            regs_ext = Some(regs_ext)
        elif new_field_dict is None:
            regs_ext = none()
        else:
            raise TypeError("new_field_dict must be a dictionary. "
                            "{} was provided".format(type(new_field_dict)))

        if isinstance(all_but_regs, list) and \
                all([isinstance(x, str) for x in all_but_regs]):
            if not projected_regs_exists:
                all_but_regs = Some(all_but_regs)
            else:
                raise ValueError("all_but_regs and projected_regs are mutually exclusive")
        elif all_but_regs is None:
            all_but_regs = none()
        else:
            raise TypeError("all_but_regs must be a list of strings. "
                            "{} was provided".format(type(all_but_regs)))

        new_index = self.opmng.project(self.__index, projected_meta, meta_ext,
                                       all_but_value, projected_regs, all_but_regs,
                                       regs_ext)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def meta_project(self, attr_list=None, all_but=None, new_attr_dict=None):
        """ *Wrapper of* ``PROJECT``

        Projects the metadata based on a list of attribute names

        :param attr_list: list of the metadata attributes to select
        :param all_but: list of metadata attributes that must be excluded from the
               projection
        :param new_attr_dict: an optional dictionary of the form {'new_field_1': function1,
               'new_field_2': function2, ...} in which every function computes the new
               field based on the values of the others
        :return: a new GMQLDataset

        Notice that if attr_list is specified, all_but cannot be specified, and vice versa.

        Example::

            new_dataset = dataset.meta_project(attr_list=['antibody', 'ID'],
                                               new_attr_dict={'new_meta': dataset['ID'] + 100})
        """
        return self.project(projected_meta=attr_list, new_attr_dict=new_attr_dict,
                            all_but_meta=all_but)
    def reg_project(self, field_list=None, all_but=None, new_field_dict=None):
        """ *Wrapper of* ``PROJECT``

        Projects the region data based on a list of field names

        :param field_list: list of the region fields to select
        :param all_but: keep only the region fields different from the ones specified
        :param new_field_dict: an optional dictionary of the form {'new_field_1': function1,
               'new_field_2': function2, ...} in which every function computes the new
               field based on the values of the others
        :return: a new GMQLDataset

        An example of usage::

            new_dataset = dataset.reg_project(['pValue', 'name'],
                                              {'new_field': dataset.pValue / 2})

            new_dataset = dataset.reg_project(
                field_list=['peak', 'pvalue'],
                new_field_dict={'new_field': dataset.pvalue *
                                dataset['cell_age', 'float']})

        Notice that you can use metadata attributes for building new region fields. The
        only thing to remember when doing so is to also define the type of the resulting
        region field in the metadata attribute definition (for example,
        :code:`dataset['cell_age', 'float']` is required for defining the new field
        :code:`new_field` as float). In particular, the following type names are accepted:
        'string', 'char', 'long', 'integer', 'boolean', 'float', 'double'.
        """
        return self.project(projected_regs=field_list, all_but_regs=all_but,
                            new_field_dict=new_field_dict)
    def extend(self, new_attr_dict):
        """ *Wrapper of* ``EXTEND``

        For each sample in an input dataset, the EXTEND operator builds new metadata
        attributes, assigns their values as the result of arithmetic and/or aggregate
        functions calculated on sample region attributes, and adds them to the existing
        metadata attribute-value pairs of the sample. Sample number and their genomic
        regions, with their attributes and values, remain unchanged in the output dataset.

        :param new_attr_dict: a dictionary of the type
               {'new_metadata': AGGREGATE_FUNCTION('field'), ...}
        :return: a new GMQLDataset

        An example of usage, in which we count the number of regions in each sample and
        compute the minimum value of the pValue field, exporting them respectively as the
        metadata attributes regionCount and minPValue::

            import gmql as gl

            dataset = gl.get_example_dataset("Example_Dataset_1")
            new_dataset = dataset.extend({'regionCount': gl.COUNT(),
                                          'minPValue': gl.MIN('pValue')})
        """
        if isinstance(new_attr_dict, dict):
            expBuild = self.pmg.getNewExpressionBuilder(self.__index)
            aggregates = []
            for k in new_attr_dict.keys():
                if isinstance(k, str):
                    item = new_attr_dict[k]
                    if isinstance(item, Aggregate):
                        op_name = item.get_aggregate_name()
                        op_argument = Some(item.get_argument()) if item.is_unary() else none()
                        regsToMeta = expBuild.getRegionsToMeta(op_name, k, op_argument)
                        aggregates.append(regsToMeta)
                    else:
                        raise TypeError("The items in new_attr_dict must be Aggregates. "
                                        "{} was provided".format(type(item)))
                else:
                    raise TypeError("The key of new_attr_dict must be a string. "
                                    "{} was provided".format(type(k)))
        else:
            raise TypeError("new_attr_dict must be a dictionary. "
                            "{} was provided".format(type(new_attr_dict)))

        new_index = self.opmng.extend(self.__index, aggregates)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def cover(self, minAcc, maxAcc, groupBy=None, new_reg_fields=None, cover_type="normal"):
        """ *Wrapper of* ``COVER``

        COVER is a GMQL operator that takes as input a dataset (of usually, but not
        necessarily, multiple samples) and returns another dataset (with a single sample,
        if no groupBy option is specified) by "collapsing" the input samples and their
        regions according to certain rules specified by the COVER parameters. The
        attributes of the output regions are only the region coordinates, plus, when
        aggregate functions are specified, new attributes with aggregate values over the
        attribute values of the contributing input regions; output metadata are the union
        of the input ones, plus the metadata attributes JaccardIntersect and JaccardResult,
        representing global Jaccard Indexes for the considered dataset, computed as the
        corresponding region Jaccard Indexes but on the whole sample regions.

        :param cover_type: the kind of cover variant you want
               ['normal', 'flat', 'summit', 'histogram']
        :param minAcc: minimum accumulation value, i.e. the minimum number of overlapping
               regions to be considered during COVER execution. It can be any positive
               number or the strings {'ALL', 'ANY'}.
        :param maxAcc: maximum accumulation value, i.e. the maximum number of overlapping
               regions to be considered during COVER execution. It can be any positive
               number or the strings {'ALL', 'ANY'}.
        :param groupBy: optional list of metadata attributes
        :param new_reg_fields: dictionary of the type
               {'new_region_attribute': AGGREGATE_FUNCTION('field'), ...}
        :return: a new GMQLDataset

        An example of usage::

            cell_tf = narrow_peak.cover(minAcc=1, maxAcc="Any",
                                        groupBy=['cell', 'antibody_target'],
                                        cover_type="normal")
        """
        if isinstance(cover_type, str):
            coverFlag = self.opmng.getCoverTypes(cover_type)
        else:
            raise TypeError("cover_type must be a string. "
                            "{} was provided".format(type(cover_type)))
        if isinstance(minAcc, str):
            minAccParam = self.opmng.getCoverParam(minAcc.lower())
        elif isinstance(minAcc, int):
            minAccParam = self.opmng.getCoverParam(str(minAcc).lower())
        else:
            raise TypeError("minAcc must be a string or an integer. "
                            "{} was provided".format(type(minAcc)))
        if isinstance(maxAcc, str):
            maxAccParam = self.opmng.getCoverParam(maxAcc.lower())
        elif isinstance(maxAcc, int):
            maxAccParam = self.opmng.getCoverParam(str(maxAcc).lower())
        else:
            raise TypeError("maxAcc must be a string or an integer. "
                            "{} was provided".format(type(maxAcc)))
        if isinstance(groupBy, list) and \
                all([isinstance(x, str) for x in groupBy]):
            groupBy_result = Some(groupBy)
        elif groupBy is None:
            groupBy_result = none()
        else:
            raise TypeError("groupBy must be a list of strings. "
                            "{} was provided".format(type(groupBy)))

        aggregates = []
        if isinstance(new_reg_fields, dict):
            expBuild = self.pmg.getNewExpressionBuilder(self.__index)
            for k in new_reg_fields.keys():
                if isinstance(k, str):
                    item = new_reg_fields[k]
                    if isinstance(item, (SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, COUNT)):
                        op_name = item.get_aggregate_name()
                        op_argument = item.get_argument()
                        if op_argument is None:
                            op_argument = none()
                        else:
                            op_argument = Some(op_argument)
                        regsToReg = expBuild.getRegionsToRegion(op_name, k, op_argument)
                        aggregates.append(regsToReg)
                    else:
                        raise TypeError("The items in new_reg_fields must be Aggregates "
                                        "(SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, COUNT). "
                                        "{} was provided".format(type(item)))
                else:
                    raise TypeError("The key of new_reg_fields must be a string. "
                                    "{} was provided".format(type(k)))
        elif new_reg_fields is None:
            pass
        else:
            raise TypeError("new_reg_fields must be a dictionary. "
                            "{} was provided".format(type(new_reg_fields)))

        new_index = self.opmng.cover(self.__index, coverFlag, minAccParam, maxAccParam,
                                     groupBy_result, aggregates)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def normal_cover(self, minAcc, maxAcc, groupBy=None, new_reg_fields=None):
        """ *Wrapper of* ``COVER``

        The normal cover operation as described in :meth:`~.cover`. Equivalent to calling::

            dataset.cover(..., cover_type="normal")
        """
        return self.cover(minAcc, maxAcc, groupBy, new_reg_fields, cover_type="normal")
    def flat_cover(self, minAcc, maxAcc, groupBy=None, new_reg_fields=None):
        """ *Wrapper of* ``COVER``

        Variant of the function :meth:`~.cover` that returns the union of all the regions
        which contribute to the COVER. More precisely, it returns the contiguous regions
        that start from the first end and stop at the last end of the regions which would
        contribute to each region of a COVER. Equivalent to calling::

            dataset.cover(..., cover_type="flat")
        """
        return self.cover(minAcc, maxAcc, groupBy, new_reg_fields, cover_type="flat")
    def summit_cover(self, minAcc, maxAcc, groupBy=None, new_reg_fields=None):
        """ *Wrapper of* ``COVER``

        Variant of the function :meth:`~.cover` that returns only those portions of the
        COVER result where the maximum number of regions overlap (this is done by
        returning only regions that start from a position after which the number of
        overlaps does not increase, and stop at a position where either the number of
        overlapping regions decreases or violates the maximum accumulation index).
        Equivalent to calling::

            dataset.cover(..., cover_type="summit")
        """
        return self.cover(minAcc, maxAcc, groupBy, new_reg_fields, cover_type="summit")
    def histogram_cover(self, minAcc, maxAcc, groupBy=None, new_reg_fields=None):
        """ *Wrapper of* ``COVER``

        Variant of the function :meth:`~.cover` that returns all regions contributing to
        the COVER, divided into different (contiguous) parts according to their
        accumulation index value (one part for each different accumulation value), which is
        assigned to the AccIndex region attribute. Equivalent to calling::

            dataset.cover(..., cover_type="histogram")
        """
        return self.cover(minAcc, maxAcc, groupBy, new_reg_fields, cover_type="histogram")
    def join(self, experiment, genometric_predicate, output="LEFT", joinBy=None,
             refName="REF", expName="EXP", left_on=None, right_on=None):
        """ *Wrapper of* ``JOIN``

        The JOIN operator takes in input two datasets, respectively known as anchor (the
        first/left one) and experiment (the second/right one), and returns a dataset of
        samples consisting of regions extracted from the operands according to the
        specified condition (known as genometric predicate). The number of generated output
        samples is the Cartesian product of the number of samples in the anchor and in the
        experiment dataset (if no joinBy clause is specified). The attributes (and their
        values) of the regions in the output dataset are the union of the region attributes
        (with their values) in the input datasets; homonymous attributes are disambiguated
        by prefixing their name with their dataset name. The output metadata are the union
        of the input metadata, with their attribute names prefixed with their input dataset
        name.

        :param experiment: another GMQLDataset
        :param genometric_predicate: a list of genometric atomic conditions. For an
               explanation of each of them go to the respective page.
        :param output: one of four different values that declare which region is given in
               output for each input pair of anchor and experiment regions satisfying the
               genometric predicate:

               * 'LEFT': outputs the anchor regions from the anchor dataset that satisfy
                 the genometric predicate
               * 'RIGHT': outputs the experiment regions from the experiment dataset that
                 satisfy the genometric predicate
               * 'INT': outputs the overlapping part (intersection) of the anchor and
                 experiment regions that satisfy the genometric predicate; if the
                 intersection is empty, no output is produced
               * 'CONTIG': outputs the concatenation between the anchor and experiment
                 regions that satisfy the genometric predicate, i.e. the output region is
                 defined as having left (right) coordinates equal to the minimum (maximum)
                 of the corresponding coordinate values in the anchor and experiment
                 regions satisfying the genometric predicate

        :param joinBy: list of metadata attributes
        :param refName: name that you want to assign to the reference dataset
        :param expName: name that you want to assign to the experiment dataset
        :param left_on: list of region fields of the reference on which the join must be
               performed
        :param right_on: list of region fields of the experiment on which the join must be
               performed
        :return: a new GMQLDataset

        An example of usage, in which we perform the join operation between
        Example_Dataset_1 and Example_Dataset_2, specifying that we want to join each
        region of the former with the first region of the latter at a minimum distance of
        120Kb, and finally output the regions of Example_Dataset_2 matching the criteria::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")
            d2 = gl.get_example_dataset("Example_Dataset_2")

            result_dataset = d1.join(experiment=d2,
                                     genometric_predicate=[gl.MD(1), gl.DGE(120000)],
                                     output="right")
        """
        if isinstance(experiment, GMQLDataset):
            other_idx = experiment.__index
        else:
            raise TypeError("experiment must be a GMQLDataset. "
                            "{} was provided".format(type(experiment)))
        if isinstance(genometric_predicate, list) and \
                all([isinstance(x, GenometricCondition) for x in genometric_predicate]):
            regionJoinCondition = self.opmng.getRegionJoinCondition(
                list(map(lambda x: x.get_gen_condition(), genometric_predicate)))
        else:
            raise TypeError("genometric_predicate must be a list of GenometricCondition. "
                            "{} was found".format(type(genometric_predicate)))
        if isinstance(output, str):
            regionBuilder = self.opmng.getRegionBuilderJoin(output)
        else:
            raise TypeError("output must be a string. "
                            "{} was provided".format(type(output)))
        if not isinstance(expName, str):
            raise TypeError("expName must be a string. {} was provided".format(type(expName)))
        if not isinstance(refName, str):
            raise TypeError("refName must be a string. {} was provided".format(type(refName)))

        if isinstance(joinBy, list) and \
                all([isinstance(x, str) for x in joinBy]):
            metaJoinCondition = Some(self.opmng.getMetaJoinCondition(joinBy))
        elif joinBy is None:
            metaJoinCondition = none()
        else:
            raise TypeError("joinBy must be a list of strings. "
                            "{} was found".format(type(joinBy)))

        left_on_exists = False
        left_on_len = 0
        if isinstance(left_on, list) and \
                all([isinstance(x, str) for x in left_on]):
            left_on_len = len(left_on)
            left_on = Some(left_on)
            left_on_exists = True
        elif left_on is None:
            left_on = none()
        else:
            raise TypeError("left_on must be a list of strings. "
                            "{} was provided".format(type(left_on)))
        if isinstance(right_on, list) and \
                all([isinstance(x, str) for x in right_on]) and \
                left_on_exists and len(right_on) == left_on_len:
            right_on = Some(right_on)
        elif right_on is None and not left_on_exists:
            right_on = none()
        else:
            raise TypeError("right_on must be a list of strings having the same length as "
                            "left_on. {} was provided".format(type(right_on)))

        new_index = self.opmng.join(self.__index, other_idx,
                                    metaJoinCondition, regionJoinCondition, regionBuilder,
                                    refName, expName, left_on, right_on)
        new_local_sources, new_remote_sources = self.__combine_sources(self, experiment)
        new_location = self.__combine_locations(self, experiment)
        return GMQLDataset(index=new_index, location=new_location,
                           local_sources=new_local_sources,
                           remote_sources=new_remote_sources,
                           meta_profile=self.meta_profile)
    def map(self, experiment, new_reg_fields=None, joinBy=None, refName="REF",
            expName="EXP"):
        """ *Wrapper of* ``MAP``

        MAP is a non-symmetric operator over two datasets, respectively called reference
        and experiment. The operation computes, for each sample in the experiment dataset,
        aggregates over the values of the experiment regions that intersect with a region
        in a reference sample, for each region of each sample in the reference dataset; we
        say that experiment regions are mapped to the reference regions. The number of
        generated output samples is the Cartesian product of the samples in the two input
        datasets; each output sample has the same regions as the related input reference
        sample, with their attributes and values, plus the attributes computed as
        aggregates over the experiment region values. Output sample metadata are the union
        of the related input sample metadata, whose attribute names are prefixed with their
        input dataset name. For each reference sample, the MAP operation produces a
        matrix-like structure, called genomic space, where each experiment sample is
        associated with a row, each reference region with a column, and each matrix row is
        a vector of numbers - the aggregates computed during MAP execution. When the
        features of the reference regions are unknown, MAP helps in extracting the most
        interesting regions out of many candidates.

        :param experiment: a GMQLDataset
        :param new_reg_fields: an optional dictionary of the form
               {'new_field_1': AGGREGATE_FUNCTION(field), ...}
        :param joinBy: optional list of metadata attributes
        :param refName: name that you want to assign to the reference dataset
        :param expName: name that you want to assign to the experiment dataset
        :return: a new GMQLDataset

        In the following example, we map the regions of Example_Dataset_2 on the ones of
        Example_Dataset_1 and, for each region of Example_Dataset_1, we output the average
        pvalue and the number of mapped regions of Example_Dataset_2. In addition, we
        specify that the output region fields and metadata attributes will carry the D1 and
        D2 names for the fields and attributes belonging to Example_Dataset_1 and
        Example_Dataset_2, respectively::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")
            d2 = gl.get_example_dataset("Example_Dataset_2")

            result = d1.map(experiment=d2, refName="D1", expName="D2",
                            new_reg_fields={"avg_pValue": gl.AVG("pvalue")})
        """
        if isinstance(experiment, GMQLDataset):
            other_idx = experiment.__index
        else:
            raise TypeError("experiment must be a GMQLDataset. "
                            "{} was provided".format(type(experiment)))
        aggregates = []
        if isinstance(new_reg_fields, dict):
            expBuild = self.pmg.getNewExpressionBuilder(experiment.__index)
            for k in new_reg_fields.keys():
                if isinstance(k, str):
                    item = new_reg_fields[k]
                    if isinstance(item, (SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, COUNT)):
                        op_name = item.get_aggregate_name()
                        op_argument = item.get_argument()
                        if op_argument is None:
                            op_argument = none()
                        else:
                            op_argument = Some(op_argument)
                        regsToReg = expBuild.getRegionsToRegion(op_name, k, op_argument)
                        aggregates.append(regsToReg)
                    else:
                        raise TypeError("The items in new_reg_fields must be Aggregates "
                                        "(SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, COUNT). "
                                        "{} was provided".format(type(item)))
                else:
                    raise TypeError("The key of new_reg_fields must be a string. "
                                    "{} was provided".format(type(k)))
        elif new_reg_fields is None:
            pass
        else:
            raise TypeError("new_reg_fields must be a dictionary. "
                            "{} was provided".format(type(new_reg_fields)))

        if isinstance(joinBy, list) and \
                all([isinstance(x, str) for x in joinBy]):
            metaJoinCondition = Some(self.opmng.getMetaJoinCondition(joinBy))
        elif joinBy is None:
            metaJoinCondition = none()
        else:
            raise TypeError("joinBy must be a list of strings. "
                            "{} was found".format(type(joinBy)))

        if not isinstance(expName, str):
            raise TypeError("expName must be a string. {} was provided".format(type(expName)))
        if not isinstance(refName, str):
            raise TypeError("refName must be a string. {} was provided".format(type(refName)))

        new_index = self.opmng.map(self.__index, other_idx, metaJoinCondition,
                                   aggregates, refName, expName)
        new_local_sources, new_remote_sources = self.__combine_sources(self, experiment)
        new_location = self.__combine_locations(self, experiment)
        return GMQLDataset(index=new_index, location=new_location,
                           local_sources=new_local_sources,
                           remote_sources=new_remote_sources,
                           meta_profile=self.meta_profile)
    def order(self, meta=None, meta_ascending=None, meta_top=None, meta_k=None,
              regs=None, regs_ascending=None, region_top=None, region_k=None):
        """ *Wrapper of* ``ORDER``

        The ORDER operator is used to order either samples, sample regions, or both, in a
        dataset according to a set of metadata and/or region attributes, and/or region
        coordinates. The number of samples and their regions in the output dataset is as in
        the input dataset, as well as their metadata and region attributes and values, but
        a new ordering metadata and/or region attribute is added with the sample or region
        ordering value, respectively.

        :param meta: list of metadata attributes
        :param meta_ascending: list of boolean values (True = ascending, False = descending)
        :param meta_top: "top", "topq" or "topp" or None
        :param meta_k: a number specifying how many results to be retained
        :param regs: list of region attributes
        :param regs_ascending: list of boolean values (True = ascending, False = descending)
        :param region_top: "top", "topq" or "topp" or None
        :param region_k: a number specifying how many results to be retained
        :return: a new GMQLDataset

        Example of usage. We order the samples of Example_Dataset_1 by ascending antibody
        and descending antibody_targetClass, keeping only the first sample. We also order
        the resulting regions based on the score field in descending order, keeping only
        the first one in this case as well::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")

            result = d1.order(meta=["antibody", "antibody_targetClass"],
                              meta_ascending=[True, False], meta_top="top", meta_k=1,
                              regs=['score'], regs_ascending=[False],
                              region_top="top", region_k=1)
        """
        meta_exists = False
        meta_len = 0
        if isinstance(meta, list) and \
                all([isinstance(x, str) for x in meta]):
            meta_exists = True
            meta_len = len(meta)
            meta = Some(meta)
        elif meta is None:
            meta = none()
        else:
            raise TypeError("meta must be a list of strings. "
                            "{} was provided".format(type(meta)))
        if isinstance(meta_ascending, list) and \
                all([isinstance(x, bool) for x in meta_ascending]) and \
                meta_exists and meta_len == len(meta_ascending):
            meta_ascending = Some(meta_ascending)
        elif meta_ascending is None:
            if meta_exists:
                # by default meta_ascending is all True
                meta_ascending = Some([True for _ in range(meta_len)])
            else:
                meta_ascending = none()
        else:
            raise TypeError("meta_ascending must be a list of booleans having the same "
                            "size as meta. {} was provided".format(type(meta_ascending)))

        regs_exists = False
        regs_len = 0
        if isinstance(regs, list) and \
                all([isinstance(x, str) for x in regs]):
            regs_exists = True
            regs_len = len(regs)
            regs = Some(regs)
        elif regs is None:
            regs = none()
        else:
            raise TypeError("regs must be a list of strings. "
                            "{} was provided".format(type(regs)))
        if isinstance(regs_ascending, list) and \
                all([isinstance(x, bool) for x in regs_ascending]) and \
                regs_exists and regs_len == len(regs_ascending):
            regs_ascending = Some(regs_ascending)
        elif regs_ascending is None:
            if regs_exists:
                # by default regs_ascending is all True
                regs_ascending = Some([True for _ in range(regs_len)])
            else:
                regs_ascending = none()
        else:
            raise TypeError("regs_ascending must be a list of booleans having the same "
                            "size as regs. {} was provided".format(type(regs_ascending)))

        meta_top_exists = False
        if isinstance(meta_top, str):
            if meta_exists:
                meta_top = Some(meta_top)
                meta_top_exists = True
            else:
                raise ValueError("meta_top can be defined only when meta is defined")
        elif meta_top is None:
            meta_top = none()
        else:
            raise TypeError("meta_top must be a string. "
                            "{} was provided".format(type(meta_top)))
        if isinstance(meta_k, int) and meta_top_exists:
            meta_k = Some(meta_k)
        elif meta_k is None and not meta_top_exists:
            meta_k = none()
        else:
            raise TypeError("meta_k must be an integer and must be provided together with "
                            "a value of meta_top. {} was provided".format(type(meta_k)))

        region_top_exists = False
        if isinstance(region_top, str):
            if regs_exists:
                region_top = Some(region_top)
                region_top_exists = True
            else:
                raise ValueError("region_top can be defined only when regs is defined")
        elif region_top is None:
            region_top = none()
        else:
            raise TypeError("region_top must be a string. "
                            "{} was provided".format(type(region_top)))
        if isinstance(region_k, int) and region_top_exists:
            region_k = Some(region_k)
        elif region_k is None and not region_top_exists:
            region_k = none()
        else:
            raise TypeError("region_k must be an integer and must be provided together "
                            "with a value of region_top. "
                            "{} was provided".format(type(region_k)))

        new_index = self.opmng.order(self.__index, meta, meta_ascending, meta_top, meta_k,
                                     regs, regs_ascending, region_top, region_k)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def difference(self, other, joinBy=None, exact=False):
        """ *Wrapper of* ``DIFFERENCE``

        DIFFERENCE is a binary, non-symmetric operator that produces one sample in the
        result for each sample of the first operand, by keeping the same metadata of the
        first operand sample and only those regions (with their schema and values) of the
        first operand sample which do not intersect with any region in the second operand
        sample (also known as negative regions).

        :param other: a GMQLDataset
        :param joinBy: (optional) list of metadata attributes. It is used to extract
               subsets of samples on which to apply the operator: only those samples in the
               current and other dataset that have the same value for each specified
               attribute are considered when performing the operation
        :param exact: boolean. If true, the regions are considered as intersecting only if
               their coordinates are exactly the same
        :return: a new GMQLDataset

        Example of usage. We compute the exact difference between Example_Dataset_1 and
        Example_Dataset_2, considering only the samples with the same antibody::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")
            d2 = gl.get_example_dataset("Example_Dataset_2")

            result = d1.difference(other=d2, exact=True, joinBy=['antibody'])
        """
        if isinstance(other, GMQLDataset):
            other_idx = other.__index
        else:
            raise TypeError("other must be a GMQLDataset. "
                            "{} was provided".format(type(other)))
        if isinstance(joinBy, list) and \
                all([isinstance(x, str) for x in joinBy]):
            metaJoinCondition = Some(self.opmng.getMetaJoinCondition(joinBy))
        elif joinBy is None:
            metaJoinCondition = none()
        else:
            raise TypeError("joinBy must be a list of strings. "
                            "{} was provided".format(type(joinBy)))
        if not isinstance(exact, bool):
            raise TypeError("exact must be a boolean. "
                            "{} was provided".format(type(exact)))

        new_index = self.opmng.difference(self.__index, other_idx, metaJoinCondition, exact)
        new_local_sources, new_remote_sources = self.__combine_sources(self, other)
        new_location = self.__combine_locations(self, other)
        return GMQLDataset(index=new_index, location=new_location,
                           local_sources=new_local_sources,
                           remote_sources=new_remote_sources,
                           meta_profile=self.meta_profile)
    def union(self, other, left_name="LEFT", right_name="RIGHT"):
        """ *Wrapper of* ``UNION``

        The UNION operation is used to integrate homogeneous or heterogeneous samples of
        two datasets within a single dataset; for each sample of either one of the input
        datasets, a sample is created in the result as follows:

        * its metadata are the same as in the original sample;
        * its schema is the schema of the first (left) input dataset; new identifiers are
          assigned to each output sample;
        * its regions are the same (in coordinates and attribute values) as in the original
          sample. Region attributes which are missing in an input dataset sample (w.r.t.
          the merged schema) are set to null.

        :param other: a GMQLDataset
        :param left_name: name that you want to assign to the left dataset
        :param right_name: name that you want to assign to the right dataset
        :return: a new GMQLDataset

        Example of usage::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")
            d2 = gl.get_example_dataset("Example_Dataset_2")

            result = d1.union(other=d2, left_name="D1", right_name="D2")
        """
        if not isinstance(left_name, str) or \
                not isinstance(right_name, str):
            raise TypeError("left_name and right_name must be strings. "
                            "{} - {} was provided".format(type(left_name), type(right_name)))
        if isinstance(other, GMQLDataset):
            other_idx = other.__index
        else:
            raise TypeError("other must be a GMQLDataset. "
                            "{} was provided".format(type(other)))
        if len(left_name) == 0 or len(right_name) == 0:
            raise ValueError("left_name and right_name must not be empty")

        new_index = self.opmng.union(self.__index, other_idx, left_name, right_name)
        new_local_sources, new_remote_sources = self.__combine_sources(self, other)
        new_location = self.__combine_locations(self, other)
        return GMQLDataset(index=new_index, location=new_location,
                           local_sources=new_local_sources,
                           remote_sources=new_remote_sources,
                           meta_profile=self.meta_profile)
    def merge(self, groupBy=None):
        """ *Wrapper of* ``MERGE``

        The MERGE operator builds a new dataset consisting of a single sample having

        * as regions, all the regions of the input samples, with the same attributes and
          values
        * as metadata, the union of all the metadata attribute-value pairs of the input
          samples.

        A groupBy clause can be specified on metadata: the samples are then partitioned in
        groups, each with a distinct value of the grouping metadata attributes, and the
        MERGE operation is applied to each group separately, yielding one sample in the
        result dataset for each group. Samples without the grouping metadata attributes are
        disregarded.

        :param groupBy: list of metadata attributes
        :return: a new GMQLDataset

        Example of usage::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")

            result = d1.merge(['antibody'])
        """
        if isinstance(groupBy, list) and \
                all([isinstance(x, str) for x in groupBy]):
            groupBy = Some(groupBy)
        elif groupBy is None:
            groupBy = none()
        else:
            raise TypeError("groupBy must be a list of strings. "
                            "{} was provided".format(type(groupBy)))

        new_index = self.opmng.merge(self.__index, groupBy)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def group(self, meta=None, meta_aggregates=None, regs=None, regs_aggregates=None,
              meta_group_name="_group"):
        """ *Wrapper of* ``GROUP``

        The GROUP operator is used for grouping both regions and/or metadata of input
        dataset samples according to distinct values of certain attributes (known as
        grouping attributes); new grouping attributes are added to samples in the output
        dataset, storing the results of aggregate function evaluations over metadata and/or
        regions in each group of samples. Samples having missing values for any of the
        grouping attributes are discarded.

        :param meta: (optional) a list of metadata attributes
        :param meta_aggregates: (optional) {'new_attr': fun}
        :param regs: (optional) a list of region fields
        :param regs_aggregates: (optional) {'new_attr': fun}
        :param meta_group_name: (optional) the name to give to the group attribute in the
               metadata
        :return: a new GMQLDataset

        Example of usage. We group the samples by antibody and aggregate the region pvalues
        taking the maximum value, calling the new region field maxPvalue::

            import gmql as gl

            d1 = gl.get_example_dataset("Example_Dataset_1")

            result = d1.group(meta=['antibody'],
                              regs_aggregates={'maxPvalue': gl.MAX("pvalue")})
        """
        if isinstance(meta, list) and \
                all([isinstance(x, str) for x in meta]):
            meta = Some(meta)
        elif meta is None:
            meta = none()
        else:
            raise TypeError("meta must be a list of strings. "
                            "{} was provided".format(type(meta)))

        expBuild = self.pmg.getNewExpressionBuilder(self.__index)
        if isinstance(meta_aggregates, dict):
            metaAggregates = []
            for k in meta_aggregates:
                if isinstance(k, str):
                    item = meta_aggregates[k]
                    if isinstance(item, (SUM, MIN, MAX, AVG, BAG, BAGD, STD, MEDIAN,
                                         COUNTSAMP)):
                        functionName = item.get_aggregate_name()
                        argument = item.get_argument()
                        if argument is None:
                            argument = none()
                        else:
                            argument = Some(argument)
                        metaAggregates.append(expBuild.createMetaAggregateFunction(
                            functionName, k, argument))
                    else:
                        raise TypeError("The item of the dictionary must be an Aggregate "
                                        "among SUM, MIN, MAX, AVG, BAG, BAGD, STD, MEDIAN, "
                                        "COUNTSAMP. {} was provided".format(type(item)))
                else:
                    raise TypeError("keys of meta_aggregates must be strings. "
                                    "{} was provided".format(type(k)))
            metaAggregates = Some(metaAggregates)
        elif meta_aggregates is None:
            metaAggregates = none()
        else:
            raise TypeError("meta_aggregates must be a dictionary of Aggregate functions. "
                            "{} was provided".format(type(meta_aggregates)))

        if isinstance(regs, list) and \
                all([isinstance(x, str) for x in regs]):
            regs = Some(regs)
        elif regs is None:
            regs = none()
        else:
            raise TypeError("regs must be a list of strings. "
                            "{} was provided".format(type(regs)))

        if isinstance(regs_aggregates, dict):
            regionAggregates = []
            for k in regs_aggregates.keys():
                if isinstance(k, str):
                    item = regs_aggregates[k]
                    if isinstance(item, (SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, COUNT)):
                        op_name = item.get_aggregate_name()
                        op_argument = item.get_argument()
                        if op_argument is None:
                            op_argument = none()
                        else:
                            op_argument = Some(op_argument)
                        regsToReg = expBuild.getRegionsToRegion(op_name, k, op_argument)
                        regionAggregates.append(regsToReg)
                    else:
                        raise TypeError("The item of the dictionary must be an Aggregate "
                                        "among SUM, MIN, MAX, AVG, BAG, BAGD, MEDIAN, "
                                        "COUNT. {} was provided".format(type(item)))
                else:
                    raise TypeError("The key of regs_aggregates must be a string. "
                                    "{} was provided".format(type(k)))
            regionAggregates = Some(regionAggregates)
        elif regs_aggregates is None:
            regionAggregates = none()
        else:
            raise TypeError("regs_aggregates must be a dictionary of Aggregate functions. "
                            "{} was provided".format(type(regs_aggregates)))

        if not isinstance(meta_group_name, str):
            raise TypeError("meta_group_name must be a string. "
                            "{} was provided".format(type(meta_group_name)))

        new_index = self.opmng.group(self.__index, meta, metaAggregates, meta_group_name,
                                     regs, regionAggregates)
        return GMQLDataset(index=new_index, location=self.location,
                           local_sources=self._local_sources,
                           remote_sources=self._remote_sources,
                           meta_profile=self.meta_profile)
    def meta_group(self, meta, meta_aggregates=None):
        """ *Wrapper of* ``GROUP``

        Group operation on metadata only. For further information check :meth:`~.group`.
        """
        return self.group(meta=meta, meta_aggregates=meta_aggregates)
    def regs_group(self, regs, regs_aggregates=None):
        """ *Wrapper of* ``GROUP``

        Group operation on region data only. For further information check :meth:`~.group`.
        """
        return self.group(regs=regs, regs_aggregates=regs_aggregates)
""" Materialization utilities """
    def materialize(self, output_path=None, output_name=None, all_load=True, mode=None):
        """ *Wrapper of* ``MATERIALIZE``

        Starts the execution of the operations for the GMQLDataset. PyGMQL implements lazy
        execution, and no operation is performed until the materialization of the results
        is requested. The materialization can happen both locally and remotely, depending
        on the current mode (or on the mode parameter, when provided).

        :param output_path: (optional) where to locally save the results of the computation
        :param output_name: (optional) can be used only if the dataset is remote. It
               represents the name that the user wants to give to the resulting dataset on
               the server
        :param all_load: (optional) specifies if the result dataset should be directly
               converted to a GDataframe (True) or to a GMQLDataset (False) for future
               local queries
        :param mode: (optional) overrides the current mode ('local' or 'remote') for this
               materialization
        :return: a GDataframe or a GMQLDataset
        """
        current_mode = get_mode() if mode is None else mode
        new_index = self.__modify_dag(current_mode)
        if current_mode == 'local':
            return Materializations.materialize_local(new_index, output_path, all_load)
        elif current_mode == 'remote':
            return Materializations.materialize_remote(new_index, output_name,
                                                       output_path, all_load)
        else:
            raise ValueError("Current mode is not defined. {} given".format(current_mode))
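    # Usage sketch (not part of the library source): in local mode, the result of a query
    # can be saved to a folder and loaded into a GDataframe in one call. The output path
    # below is illustrative.
    #
    #   result = query.materialize(output_path="./query_result", all_load=True)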
    def head(self, n=5):
        """
        Returns a small set of regions and metadata from a query. It is supposed to be used
        for debugging purposes or for data exploration.

        :param n: how many samples to retrieve
        :return: a GDataframe
        """
        if n <= 0:
            raise ValueError("n must be a positive number. {} was given".format(n))

        current_mode = get_mode()
        new_index = self.__modify_dag(current_mode)
        collected = self.pmg.take(new_index, n)
        regs = MemoryLoader.load_regions(collected)
        meta = MemoryLoader.load_metadata(collected)
        result = GDataframe.GDataframe(regs, meta)
        return result
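    # Usage sketch (not part of the library source): head is handy for a quick look at a
    # query result without a full materialization. The .regs/.meta attributes below are an
    # assumption based on how the GDataframe is constructed from region and metadata
    # tables in this module.
    #
    #   preview = dataset.head(n=2)
    #   print(preview.regs)
    #   print(preview.meta)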
    def _get_serialized_dag(self):
        serialized_dag = self.pmg.serializeVariable(self.__index)
        return serialized_dag

    @staticmethod
    def __combine_sources(d1, d2):
        if (not isinstance(d1, GMQLDataset)) or (not isinstance(d2, GMQLDataset)):
            raise TypeError("The function takes only GMQLDataset")
        local_sources_1 = d1._local_sources
        remote_sources_1 = d1._remote_sources
        local_sources_2 = d2._local_sources
        remote_sources_2 = d2._remote_sources
        new_local_sources = list(set(local_sources_1 + local_sources_2))
        new_remote_sources = list(set(remote_sources_1 + remote_sources_2))
        return new_local_sources, new_remote_sources

    @staticmethod
    def __combine_locations(d1, d2):
        if (not isinstance(d1, GMQLDataset)) or (not isinstance(d2, GMQLDataset)):
            raise TypeError("The function takes only GMQLDataset")
        location_1 = d1.location
        location_2 = d2.location
        if location_1 == location_2:
            return location_1
        else:
            return "mixed"

    def __modify_dag(self, mode):
        remote_manager = get_remote_manager()
        index = self.__index
        pmg = get_python_manager()
        sources = get_source_table()
        # create a new id having the exact same DAG inside, for modification
        new_index = pmg.cloneVariable(index)
        if mode == "local":
            for d in self._remote_sources:
                # each remote source has to be downloaded locally in a temporary folder
                d_sources = sources.get_source(id=d)
                local = d_sources[LOCAL]
                remote = d_sources[REMOTE]
                if local is None:
                    new_name = get_new_dataset_tmp_folder()
                    remote_manager.download_dataset(dataset_name=remote,
                                                    local_path=new_name, how="stream")
                    sources.modify_source(id=d, local=new_name, delete_local=True)
                else:
                    new_name = local
                pmg.modify_dag_source(new_index, str(d), new_name)
            for d in self._local_sources:
                # for each local source, just take its path
                d_sources = sources.get_source(id=d)
                local = d_sources[LOCAL]
                if local is None:
                    raise ValueError("Impossible state. Local source must have a local path")
                else:
                    pmg.modify_dag_source(new_index, str(d), local)
        elif mode == "remote":
            for d in self._local_sources:
                # each local source has to be uploaded remotely
                d_sources = sources.get_source(id=d)
                local = d_sources[LOCAL]
                remote = d_sources[REMOTE]
                parser = d_sources[PARSER]
                if remote is None:
                    new_name = "LOCAL_" + get_unique_identifier()
                    schema_dir = get_new_dataset_tmp_folder()
                    os.makedirs(schema_dir)
                    schema_tmp_path = os.path.join(schema_dir, new_name + ".schema")
                    parserToXML(parser, new_name, schema_tmp_path)
                    remote_manager.upload_dataset(dataset=local, dataset_name=new_name,
                                                  schema_path=schema_tmp_path)
                    sources.modify_source(id=d, remote=new_name, delete_remote=True)
                else:
                    new_name = remote
                pmg.modify_dag_source(new_index, str(d), new_name)
            for d in self._remote_sources:
                d_sources = sources.get_source(id=d)
                remote = d_sources[REMOTE]
                if remote is None:
                    raise ValueError("Impossible state. Remote source must have a remote name")
                else:
                    pmg.modify_dag_source(new_index, str(d), remote)
        else:
            raise ValueError("Unknown mode {}".format(mode))
        return new_index
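# Usage sketch (not part of the library source): a minimal end-to-end pipeline combining
# the operators defined above. Dataset names and attributes come from the docstring
# examples; due to lazy execution, nothing runs until materialize is called.
#
#   import gmql as gl
#   d1 = gl.get_example_dataset("Example_Dataset_1")
#   ctcf = d1.meta_select(d1['antibody'] == "CTCF")
#   counted = ctcf.extend({'regionCount': gl.COUNT()})
#   result = counted.materialize()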