Source code for gmql.dataset.loaders.Loader

from ...managers import get_python_manager, get_remote_manager, get_source_table
from ...settings import get_mode
from ...FileManagment import TempFileManager
from ..parsers.RegionParser import RegionParser
import os
from . import FILES_FOLDER, SCHEMA_FILE, WEB_PROFILE_FILE, PROFILE_FILE


def get_schema_path(path):
    schema_path = os.path.join(path, SCHEMA_FILE)
    return schema_path


def get_file_paths(path):
    real_path = preprocess_path(path)
    all_files = os.listdir(real_path)

    def filter_files(x):
        # keep only sample files: exclude the schema/profile files and
        # anything starting with "_"
        return not (x in [SCHEMA_FILE, WEB_PROFILE_FILE, PROFILE_FILE] or x.startswith("_"))

    files_paths = list(map(lambda x: os.path.join(real_path, x), filter(filter_files, all_files)))
    schema_path = get_schema_path(real_path)
    return files_paths, schema_path
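
A sketch of what this discovery step returns, built on a throwaway dataset
(the sample names are illustrative)::

    import os
    import tempfile
    from gmql.dataset.loaders import FILES_FOLDER, SCHEMA_FILE
    from gmql.dataset.loaders.Loader import get_file_paths

    root = tempfile.mkdtemp()
    files_dir = os.path.join(root, FILES_FOLDER)
    os.mkdir(files_dir)
    for name in ("S_00000.gdm", "S_00000.gdm.meta", SCHEMA_FILE):
        open(os.path.join(files_dir, name), "w").close()

    files_paths, schema_path = get_file_paths(root)
    # files_paths holds the two sample files; schema/profile files are excluded
    # schema_path points at <root>/files/schema.xml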


def preprocess_path(path):
    """ Given a dataset path, the following structure is to be expected:
        - path/
            - files/
                - S_00000.gdm
                - S_00000.gdm.meta
                - S_00001.gdm
                - S_00001.gdm.meta
                - ...
                - schema.xml
                - [profile.xml]
                - [web_profile.xml]
            - [info.txt]
            - [query.txt]
            - [vocabulary.txt]

    :param path: root path of the dataset
    :return: the path where the GDM data are located
    """
    for sub_f in os.listdir(path):
        sub_f_tot = os.path.join(path, sub_f)
        if os.path.isdir(sub_f_tot) and sub_f == FILES_FOLDER:
            if check_for_dataset(sub_f_tot):
                return sub_f_tot
            else:
                raise ValueError("Dataset in {} was not in GMQL format".format(sub_f_tot))
    # if we are here it means that there is no files folder...so we need to check the root
    if check_for_dataset(path):
        return path
    else:
        raise ValueError("Dataset in {} was not in GMQL format".format(path))
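
Both layouts are accepted: samples inside a files/ subfolder or directly in
the dataset root. A minimal sketch of the fallback case, using throwaway
empty files::

    import os
    import tempfile
    from gmql.dataset.loaders.Loader import preprocess_path

    root = tempfile.mkdtemp()
    for name in ("S_00000.gdm", "S_00000.gdm.meta", "schema.xml"):
        open(os.path.join(root, name), "w").close()

    print(preprocess_path(root) == root)   # True: no files/ folder, root is used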


def check_for_dataset(files):
    """ A GDM dataset has the form:
        - S_00000.gdm
        - S_00000.gdm.meta
        - S_00001.gdm
        - S_00001.gdm.meta
        - ...
        - schema.xml
        - [profile.xml]
        - [web_profile.xml]
    :param files: path of the dataset folder
    :return: True if the folder contains a GDM dataset, i.e. every .gdm
             region file is paired with a .gdm.meta metadata file
    """
    all_files = os.listdir(files)
    # sample names from metadata files (strip the ".gdm.meta" suffix)
    meta_files = set(map(lambda y: y[:-9], filter(lambda x: x.endswith(".gdm.meta"), all_files)))
    # sample names from region files (strip the ".gdm" suffix)
    regs_files = set(map(lambda y: y[:-4], filter(lambda x: x.endswith(".gdm"), all_files)))
    return meta_files == regs_files
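
The check only verifies the pairing of region and metadata files; an orphan
.gdm file invalidates the folder. A small self-contained sketch::

    import os
    import tempfile
    from gmql.dataset.loaders.Loader import check_for_dataset

    folder = tempfile.mkdtemp()
    for name in ("S_00000.gdm", "S_00000.gdm.meta"):
        open(os.path.join(folder, name), "w").close()
    print(check_for_dataset(folder))   # True: every .gdm has its .gdm.meta

    open(os.path.join(folder, "S_00001.gdm"), "w").close()
    print(check_for_dataset(folder))   # False: S_00001 lacks a metadata file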


def load_from_file(path, parser: RegionParser):
    """ Loads a GDM dataset from a single BED-like file.

    :param path: location of the file
    :param parser: RegionParser object specifying the parser of the file
    :return: a GMQLDataset
    """
    from .. import GMQLDataset
    pmg = get_python_manager()
    id = add_to_sources(local_path=path, parser=parser)
    local_sources = [id]
    index = pmg.readFile(str(id), parser.get_gmql_parser())
    return GMQLDataset.GMQLDataset(index=index, parser=parser, location="local",
                                   path_or_name=path, local_sources=local_sources)
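
A minimal usage sketch; the file name is illustrative, and the RegionParser
keyword arguments (chrPos, startPos, stopPos) are assumed from the parser's
interface::

    from gmql.dataset.parsers.RegionParser import RegionParser
    from gmql.dataset.loaders.Loader import load_from_file

    # hypothetical tab-separated file with chrom, start, stop as first columns
    parser = RegionParser(chrPos=0, startPos=1, stopPos=2)
    dataset = load_from_file("./regions.bed", parser=parser)
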
def load_from_path(local_path, parser=None):
    """ Loads the data from a local path into a GMQLDataset.
    The loading of the files is "lazy": the files are read only when the
    user performs a materialization
    (see :func:`~gmql.dataset.GMQLDataset.GMQLDataset.materialize`).

    :param local_path: local path of the dataset
    :param parser: the parser to be used for reading the data. If None,
                   it is inferred from the schema file of the dataset
    :return: a new GMQLDataset
    """
    from .. import GMQLDataset
    pmg = get_python_manager()
    local_path = pmg.preProcessPath(local_path)
    # metadata profiling is currently disabled
    meta_profile = None
    if parser is None:
        # infer the parser from the dataset schema file
        parser = RegionParser.from_schema_file(local_path)
    elif not isinstance(parser, RegionParser):
        raise ValueError("parser must be RegionParser. {} was provided".format(type(parser)))
    id = add_to_sources(local_path=local_path, parser=parser)
    local_sources = [id]
    index = pmg.read_dataset(str(id), parser.get_gmql_parser())
    return GMQLDataset.GMQLDataset(index=index, parser=parser, location="local",
                                   path_or_name=local_path,
                                   local_sources=local_sources,
                                   meta_profile=meta_profile)
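
Typical usage goes through the package-level entry point; "./my_dataset" is
an illustrative path, and the parser is inferred from the dataset schema::

    import gmql

    dataset = gmql.load_from_path(local_path="./my_dataset")
    gdf = dataset.materialize()   # only now are the files actually read
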
def add_to_sources(local_path=None, remote_path=None, parser=None):
    """ Registers a local or remote dataset in the global source table and
    returns its identifier, reusing an existing entry when present. """
    source_table = get_source_table()
    id = source_table.search_source(local=local_path, remote=remote_path)
    if id is None:
        id = source_table.add_source(local=local_path, remote=remote_path, parser=parser)
    return id

def load_from_remote(remote_name, owner=None):
    """ Loads the data from a remote repository.

    :param remote_name: the name of the dataset in the remote repository
    :param owner: (optional) the owner of the dataset. If nothing is provided,
                  the current user is used. For public datasets use 'public'
    :return: a new GMQLDataset
    """
    from .. import GMQLDataset
    pmg = get_python_manager()
    remote_manager = get_remote_manager()
    parser = remote_manager.get_dataset_schema(remote_name, owner)
    id = add_to_sources(remote_path=remote_name, parser=parser)
    index = pmg.read_dataset(str(id), parser.get_gmql_parser())
    remote_sources = [id]
    return GMQLDataset.GMQLDataset(index=index, location="remote",
                                   path_or_name=remote_name,
                                   remote_sources=remote_sources)
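
A sketch of loading a public dataset, assuming a remote GMQL service has
already been configured; the dataset name is illustrative::

    import gmql

    gmql.set_mode("remote")
    gmql.login()   # guest or registered login to the remote service
    dataset = gmql.load_from_remote("Example_Dataset_1", owner="public")
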
def load(path=None, name=None, owner=None, parser=None):
    # TODO: think if this method is useful or not...
    mode = get_mode()
    remote_manager = get_remote_manager()
    if mode == 'local':
        if isinstance(path, str) and (name is None):
            # we are given a local path
            return load_from_path(local_path=path, parser=parser)
        elif isinstance(name, str) and (path is None):
            # download the remote dataset to a temporary folder, then load it
            local_path = TempFileManager.get_new_dataset_tmp_folder()
            remote_manager.download_dataset(dataset_name=name, local_path=local_path)
            return load_from_path(local_path=local_path)
        else:
            raise ValueError("You have to define path or name (mutually exclusive)")
    elif mode == 'remote':
        if isinstance(path, str) and (name is None):
            # upload the local dataset under a generated name, then load it remotely
            name = TempFileManager.get_unique_identifier()
            remote_manager.upload_dataset(dataset=path, dataset_name=name)
            return load_from_remote(remote_name=name)
        elif isinstance(name, str) and (path is None):
            return load_from_remote(remote_name=name, owner=owner)
        else:
            raise ValueError("You have to define path or name (mutually exclusive)")
    else:
        raise ValueError("Mode: {} unknown".format(mode))
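
A sketch of the mode-dependent dispatch; the dataset name is illustrative,
and load is imported from this module since it may not be re-exported at
package level::

    import gmql
    from gmql.dataset.loaders.Loader import load

    gmql.set_mode("local")
    gmql.login()   # the named dataset is fetched from the remote service
    dataset = load(name="Example_Dataset_1")   # downloaded, then loaded locally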