from .FileManagment import TempFileManager, initialize_user_folder, get_resources_dir
from .configuration import Configuration
import os
import sys
# ---------------------------------------------------------------------------
# Module-level state for the PyGMQL instance. All of these are read and
# written only through the accessor functions below; the double leading
# underscore keeps them out of `from settings import *`.
# ---------------------------------------------------------------------------
__version__ = None
# Whether progress bars are shown during loading/writing/downloading datasets.
__progress_bar = True
# Whether metadata is profiled when a GMQLDataset is loaded.
__metadata_profiling = False
# URL of the GMQL remote service (set via set_remote_address()).
__remote_address = None
# Execution mode: "local" or "remote" (validated in set_mode()).
__mode = "local"
# Spark master configuration string (any value Spark accepts).
__master = "local[*]"
# NOTE(review): never written in this chunk — presumably set elsewhere; confirm.
__gcloud_token = None
# Map of temporary folders, populated by init_settings().
__folders = None
# Default Spark configuration applied when running with a local master.
__init_configs_local = {
"spark.serializer": 'org.apache.spark.serializer.KryoSerializer',
'spark.executor.memory': '6g',
'spark.driver.memory': '8g',
'spark.kryoserializer.buffer.max': '1g',
'spark.driver.maxResultSize': '5g',
'spark.driver.host': 'localhost',
'spark.local.dir': '/tmp'
}
# Number of regions fetched per batch when materializing results into Python.
__regions_batch_size = 20000
# Extra Spark configs for spark-submit (used when master is not local).
__init_configs_spark = {}
# Current Configuration object (see set_configuration()/initialize_configuration()).
__configuration = None
# JVM options applied in local mode before starting the backend.
__java_options = [] # ['-Xmx8192m']
def get_configuration():
    """ Returns the configurations of the current PyGMQL instance

    :return: a Configuration object
    """
    # Stripped the "[docs]" Sphinx-HTML artifact that was fused to the def
    # line. `global` is not needed for a read-only access.
    return __configuration
def set_spark_configs(d):
    """ Set Spark configurations to be used during the spark-submit.
    Works only when the master is different from local.

    :param d: a dictionary of {key: values}
    :return: None
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    # The dict is mutated in place; `global` is only required for rebinding,
    # but it is kept to make the module-level dependency explicit.
    global __init_configs_spark
    __init_configs_spark.update(d)
def get_spark_configs():
    """ Return the dictionary of extra Spark configurations set via
    set_spark_configs().

    :return: dict of {config_key: value}
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __init_configs_spark
def set_regions_batch_size(batch: int):
    """ Set the number of regions to be loaded for each batch at the end of the
    materialize operation. This setting is very low level and does not affect
    the result of the query, but changing this parameter could speed up the
    loading of the results in Python in certain contexts.

    :param batch: Number of regions.
    :return: None
    """
    global __regions_batch_size
    __regions_batch_size = batch
def get_regions_batch_size():
    """ Return the number of regions loaded per batch during materialize.

    :return: int
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __regions_batch_size
def set_configuration(conf):
    """ Replace the module-level Configuration object.

    :param conf: a Configuration instance
    :return: None
    :raises TypeError: if conf is not a Configuration
    """
    global __configuration
    if not isinstance(conf, Configuration):
        raise TypeError("Configuration expected. {} was found".format(type(conf)))
    __configuration = conf
def set_master(master: str):
    """ Set the master of the PyGMQL instance. It accepts any master
    configuration available in Spark.

    :param master: master configuration
    :return: None
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    global __master
    __master = master
def get_master():
    """ Return the current Spark master configuration string.

    :return: str
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __master
def set_local_java_options(options: list):
    """ When the mode is set to local, this function can be used to add JVM
    specific options before starting the backend. It accepts any Java option.

    :param options: list of string, one for each Java option
    :return: None
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    global __java_options
    __java_options = options
def get_local_java_options():
    """ Return the list of JVM options used when starting the local backend.

    :return: list of str
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __java_options
def set_mode(how):
    """ Sets the behavior of the API

    :param how: if 'remote' all the execution is performed on the remote
        server; if 'local' all it is executed locally. Default = 'local'
    :return: None
    :raises ValueError: if how is neither 'local' nor 'remote'
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    # The original if/elif branches both performed the same assignment,
    # so they are collapsed into a single membership test.
    global __mode
    if how not in ("local", "remote"):
        raise ValueError("how must be 'local' or 'remote'")
    __mode = how
def get_mode():
    """ Return the current execution mode ('local' or 'remote').

    :return: str
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __mode
def get_version():
    """ Read the PyGMQL version string from the bundled "version" resource file.

    :return: str, the version with surrounding whitespace stripped
    """
    version_file_name = os.path.join(get_resources_dir(), "version")
    # Context manager guarantees the file handle is closed even on error.
    with open(version_file_name, "r") as f_ver:
        version = f_ver.read().strip()
    return version
def set_progress(how):
    """ Enables or disables the progress bars for the loading, writing and
    downloading of datasets

    :param how: True if you want the progress bar, False otherwise
    :return: None
    :raises ValueError: if how is not a boolean

    Example::

        import gmql as gl

        gl.set_progress(True)   # enables progress bars
        # ....do something...
        gl.set_progress(False)  # removes progress bars
        # ....do something...
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    # NOTE(review): raising ValueError for a wrong *type* is inconsistent with
    # set_meta_profiling (which raises TypeError), but it is kept because
    # callers may already catch ValueError.
    global __progress_bar
    if isinstance(how, bool):
        __progress_bar = how
    else:
        raise ValueError(
            "how must be a boolean. {} was found".format(type(how)))
def is_progress_enabled():
    """ Return whether progress bars are currently enabled.

    :return: bool
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __progress_bar
def set_meta_profiling(how):
    """ Enables or disables the profiling of metadata at the loading of a
    GMQLDataset

    :param how: True if you want to analyze the metadata when a GMQLDataset is
        created by a load_from_*. False otherwise. (Default=True)
    :return: None
    :raises TypeError: if how is not a boolean
    """
    global __metadata_profiling
    if not isinstance(how, bool):
        raise TypeError("how must be boolean. {} was provided".format(type(how)))
    __metadata_profiling = how
def is_metaprofiling_enabled():
    """ Return whether metadata profiling is currently enabled.

    :return: bool
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __metadata_profiling
def set_remote_address(address):
    """ Enables the user to set the address of the GMQL remote service

    :param address: a string representing the URL of GMQL remote service
    :return: None
    """
    # Stripped the "[docs]" Sphinx-HTML artifact from the def line.
    global __remote_address
    __remote_address = address
def get_remote_address():
    """ Return the URL of the GMQL remote service, or None if unset.

    :return: str or None
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __remote_address
def get_folders():
    """ Return the temporary-folder map created by init_settings(), or None
    if init_settings() has not run yet.

    :return: dict or None
    """
    # Read-only access; `global` is unnecessary for reads and was dropped.
    return __folders
def initialize_configuration():
    """ Build a fresh Configuration seeded with the default local Spark
    settings and install it as the module-level configuration.

    :return: None
    """
    global __configuration, __init_configs_local
    configs = Configuration()
    configs.set_spark_conf(d=__init_configs_local)
    __configuration = configs
def init_settings():
    """ Initialize the module state: resolve the package version, create the
    user and temporary folders, and build the default Configuration.

    Call order matters: the Configuration must exist before its
    spark.local.dir can be pointed at the temp folder.

    :return: None
    """
    global __version__, __folders, __configuration
    __version__ = get_version()
    initialize_user_folder()
    __folders = TempFileManager.initialize_tmp_folders()
    initialize_configuration()
    # Redirect Spark's scratch space to the freshly created temp folder.
    __configuration.set_spark_conf("spark.local.dir", __folders['spark'])
    if sys.platform.startswith("win32"):
        # On Windows, Hadoop needs hadoop.home.dir pointing at the bundled
        # resources folder (winutils.exe).
        hadoop_folder_fn = os.path.join(get_resources_dir(), "hadoop")
        __configuration.set_system_conf("hadoop.home.dir", hadoop_folder_fn)