diff --git a/lakehouse_engine/core/exec_env.html b/lakehouse_engine/core/exec_env.html
index 0298aef..a415453 100644
--- a/lakehouse_engine/core/exec_env.html
+++ b/lakehouse_engine/core/exec_env.html
@@ -65,88 +65,95 @@
1"""Module to take care of creating a singleton of the execution environment class.""" 2import os 3 - 4from pyspark import SparkConf - 5from pyspark.sql import SparkSession - 6 - 7from lakehouse_engine.utils.logging_handler import LoggingHandler + 4from pyspark.sql import SparkSession + 5 + 6from lakehouse_engine.utils.logging_handler import LoggingHandler + 7 8 - 9 -10class ExecEnv(object): -11 """Represents the basic resources regarding the engine execution environment. -12 -13 Currently, it is solely used to encapsulate the logic to get a Spark session. -14 """ -15 -16 SESSION: SparkSession -17 _LOGGER = LoggingHandler(__name__).get_logger() -18 DEFAULT_AWS_REGION = "eu-west-1" -19 -20 @classmethod -21 def get_or_create( -22 cls, -23 session: SparkSession = None, -24 enable_hive_support: bool = True, -25 app_name: str = None, -26 config: dict = None, -27 ) -> None: -28 """Get or create an execution environment session (currently Spark). -29 -30 It instantiates a singleton session that can be accessed anywhere from the -31 lakehouse engine. -32 -33 Args: -34 session: spark session. -35 enable_hive_support: whether to enable hive support or not. -36 app_name: application name. -37 config: extra spark configs to supply to the spark session. -38 """ -39 default_config = { -40 "spark.databricks.delta.optimizeWrite.enabled": True, -41 "spark.sql.adaptive.enabled": True, -42 "spark.databricks.delta.merge.enableLowShuffle": True, -43 } -44 cls._LOGGER.info( -45 f"Using the following default configs you may want to override them for " -46 f"your job: {default_config}" -47 ) -48 final_config: dict = {**default_config, **(config if config else {})} -49 cls._LOGGER.info(f"Final config is: {final_config}") -50 -51 if session: -52 cls.SESSION = session -53 else: -54 session_builder = SparkSession.builder.appName(app_name) -55 if config: -56 session_builder = session_builder.config( -57 conf=SparkConf().setAll(final_config.items()) # type: ignore -58 ) -59 if enable_hive_support: -60 session_builder = session_builder.enableHiveSupport() -61 cls.SESSION = session_builder.getOrCreate() -62 -63 cls._set_environment_variables(final_config.get("os_env_vars")) -64 -65 @classmethod -66 def _set_environment_variables(cls, os_env_vars: dict = None) -> None: -67 """Set environment variables at OS level. -68 -69 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is -70 beneficial to avoid getBucketLocation permission problems. + 9class ExecEnv(object): +10 """Represents the basic resources regarding the engine execution environment. +11 +12 Currently, it is solely used to encapsulate the logic to get a Spark session. +13 """ +14 +15 SESSION: SparkSession +16 _LOGGER = LoggingHandler(__name__).get_logger() +17 DEFAULT_AWS_REGION = "eu-west-1" +18 +19 @classmethod +20 def get_or_create( +21 cls, +22 session: SparkSession = None, +23 enable_hive_support: bool = True, +24 app_name: str = None, +25 config: dict = None, +26 ) -> None: +27 """Get or create an execution environment session (currently Spark). +28 +29 It instantiates a singleton session that can be accessed anywhere from the +30 lakehouse engine. +31 +32 Args: +33 session: spark session. +34 enable_hive_support: whether to enable hive support or not. +35 app_name: application name. +36 config: extra spark configs to supply to the spark session. 
+37 """ +38 default_config = { +39 "spark.databricks.delta.optimizeWrite.enabled": True, +40 "spark.sql.adaptive.enabled": True, +41 "spark.databricks.delta.merge.enableLowShuffle": True, +42 } +43 cls._LOGGER.info( +44 f"Using the following default configs you may want to override them for " +45 f"your job: {default_config}" +46 ) +47 final_config: dict = {**default_config, **(config if config else {})} +48 cls._LOGGER.info(f"Final config is: {final_config}") +49 +50 if session: +51 cls.SESSION = session +52 else: +53 # with active session we do not need app name +54 if SparkSession.getActiveSession(): +55 app_name = SparkSession.getActiveSession().sparkContext.appName +56 cls._LOGGER.info(f"Detected active session: {app_name}") +57 elif not SparkSession.getActiveSession() and not app_name: +58 cls._LOGGER.info("No active session or appname detected") +59 app_name = "lakehouse_engine" +60 # we will still add this part to set configs +61 session_builder = SparkSession.builder.appName(app_name) +62 if config: +63 for k, v in final_config.items(): +64 session_builder.config(k, v) +65 +66 if enable_hive_support: +67 session_builder = session_builder.enableHiveSupport() +68 cls.SESSION = session_builder.getOrCreate() +69 +70 cls._set_environment_variables(final_config.get("os_env_vars")) 71 -72 Args: -73 os_env_vars: this parameter can be used to pass the environment variables to -74 be defined. -75 """ -76 if os_env_vars is None: -77 os_env_vars = {} +72 @classmethod +73 def _set_environment_variables(cls, os_env_vars: dict = None) -> None: +74 """Set environment variables at OS level. +75 +76 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is +77 beneficial to avoid getBucketLocation permission problems. 78 -79 for env_var in os_env_vars.items(): -80 os.environ[env_var[0]] = env_var[1] -81 -82 if "AWS_DEFAULT_REGION" not in os_env_vars: -83 os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get( -84 "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION -85 ) +79 Args: +80 os_env_vars: this parameter can be used to pass the environment variables to +81 be defined. +82 """ +83 if os_env_vars is None: +84 os_env_vars = {} +85 +86 for env_var in os_env_vars.items(): +87 os.environ[env_var[0]] = env_var[1] +88 +89 if "AWS_DEFAULT_REGION" not in os_env_vars: +90 os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get( +91 "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION +92 )
11class ExecEnv(object): -12 """Represents the basic resources regarding the engine execution environment. -13 -14 Currently, it is solely used to encapsulate the logic to get a Spark session. -15 """ -16 -17 SESSION: SparkSession -18 _LOGGER = LoggingHandler(__name__).get_logger() -19 DEFAULT_AWS_REGION = "eu-west-1" -20 -21 @classmethod -22 def get_or_create( -23 cls, -24 session: SparkSession = None, -25 enable_hive_support: bool = True, -26 app_name: str = None, -27 config: dict = None, -28 ) -> None: -29 """Get or create an execution environment session (currently Spark). -30 -31 It instantiates a singleton session that can be accessed anywhere from the -32 lakehouse engine. -33 -34 Args: -35 session: spark session. -36 enable_hive_support: whether to enable hive support or not. -37 app_name: application name. -38 config: extra spark configs to supply to the spark session. -39 """ -40 default_config = { -41 "spark.databricks.delta.optimizeWrite.enabled": True, -42 "spark.sql.adaptive.enabled": True, -43 "spark.databricks.delta.merge.enableLowShuffle": True, -44 } -45 cls._LOGGER.info( -46 f"Using the following default configs you may want to override them for " -47 f"your job: {default_config}" -48 ) -49 final_config: dict = {**default_config, **(config if config else {})} -50 cls._LOGGER.info(f"Final config is: {final_config}") -51 -52 if session: -53 cls.SESSION = session -54 else: -55 session_builder = SparkSession.builder.appName(app_name) -56 if config: -57 session_builder = session_builder.config( -58 conf=SparkConf().setAll(final_config.items()) # type: ignore -59 ) -60 if enable_hive_support: -61 session_builder = session_builder.enableHiveSupport() -62 cls.SESSION = session_builder.getOrCreate() -63 -64 cls._set_environment_variables(final_config.get("os_env_vars")) -65 -66 @classmethod -67 def _set_environment_variables(cls, os_env_vars: dict = None) -> None: -68 """Set environment variables at OS level. -69 -70 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is -71 beneficial to avoid getBucketLocation permission problems. +@@ -259,50 +274,58 @@10class ExecEnv(object): +11 """Represents the basic resources regarding the engine execution environment. +12 +13 Currently, it is solely used to encapsulate the logic to get a Spark session. +14 """ +15 +16 SESSION: SparkSession +17 _LOGGER = LoggingHandler(__name__).get_logger() +18 DEFAULT_AWS_REGION = "eu-west-1" +19 +20 @classmethod +21 def get_or_create( +22 cls, +23 session: SparkSession = None, +24 enable_hive_support: bool = True, +25 app_name: str = None, +26 config: dict = None, +27 ) -> None: +28 """Get or create an execution environment session (currently Spark). +29 +30 It instantiates a singleton session that can be accessed anywhere from the +31 lakehouse engine. +32 +33 Args: +34 session: spark session. +35 enable_hive_support: whether to enable hive support or not. +36 app_name: application name. +37 config: extra spark configs to supply to the spark session. 
+38 """ +39 default_config = { +40 "spark.databricks.delta.optimizeWrite.enabled": True, +41 "spark.sql.adaptive.enabled": True, +42 "spark.databricks.delta.merge.enableLowShuffle": True, +43 } +44 cls._LOGGER.info( +45 f"Using the following default configs you may want to override them for " +46 f"your job: {default_config}" +47 ) +48 final_config: dict = {**default_config, **(config if config else {})} +49 cls._LOGGER.info(f"Final config is: {final_config}") +50 +51 if session: +52 cls.SESSION = session +53 else: +54 # with active session we do not need app name +55 if SparkSession.getActiveSession(): +56 app_name = SparkSession.getActiveSession().sparkContext.appName +57 cls._LOGGER.info(f"Detected active session: {app_name}") +58 elif not SparkSession.getActiveSession() and not app_name: +59 cls._LOGGER.info("No active session or appname detected") +60 app_name = "lakehouse_engine" +61 # we will still add this part to set configs +62 session_builder = SparkSession.builder.appName(app_name) +63 if config: +64 for k, v in final_config.items(): +65 session_builder.config(k, v) +66 +67 if enable_hive_support: +68 session_builder = session_builder.enableHiveSupport() +69 cls.SESSION = session_builder.getOrCreate() +70 +71 cls._set_environment_variables(final_config.get("os_env_vars")) 72 -73 Args: -74 os_env_vars: this parameter can be used to pass the environment variables to -75 be defined. -76 """ -77 if os_env_vars is None: -78 os_env_vars = {} +73 @classmethod +74 def _set_environment_variables(cls, os_env_vars: dict = None) -> None: +75 """Set environment variables at OS level. +76 +77 By default, we are setting the AWS_DEFAULT_REGION as we have identified this is +78 beneficial to avoid getBucketLocation permission problems. 79 -80 for env_var in os_env_vars.items(): -81 os.environ[env_var[0]] = env_var[1] -82 -83 if "AWS_DEFAULT_REGION" not in os_env_vars: -84 os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get( -85 "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION -86 ) +80 Args: +81 os_env_vars: this parameter can be used to pass the environment variables to +82 be defined. +83 """ +84 if os_env_vars is None: +85 os_env_vars = {} +86 +87 for env_var in os_env_vars.items(): +88 os.environ[env_var[0]] = env_var[1] +89 +90 if "AWS_DEFAULT_REGION" not in os_env_vars: +91 os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get( +92 "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION +93 )
21 @classmethod -22 def get_or_create( -23 cls, -24 session: SparkSession = None, -25 enable_hive_support: bool = True, -26 app_name: str = None, -27 config: dict = None, -28 ) -> None: -29 """Get or create an execution environment session (currently Spark). -30 -31 It instantiates a singleton session that can be accessed anywhere from the -32 lakehouse engine. -33 -34 Args: -35 session: spark session. -36 enable_hive_support: whether to enable hive support or not. -37 app_name: application name. -38 config: extra spark configs to supply to the spark session. -39 """ -40 default_config = { -41 "spark.databricks.delta.optimizeWrite.enabled": True, -42 "spark.sql.adaptive.enabled": True, -43 "spark.databricks.delta.merge.enableLowShuffle": True, -44 } -45 cls._LOGGER.info( -46 f"Using the following default configs you may want to override them for " -47 f"your job: {default_config}" -48 ) -49 final_config: dict = {**default_config, **(config if config else {})} -50 cls._LOGGER.info(f"Final config is: {final_config}") -51 -52 if session: -53 cls.SESSION = session -54 else: -55 session_builder = SparkSession.builder.appName(app_name) -56 if config: -57 session_builder = session_builder.config( -58 conf=SparkConf().setAll(final_config.items()) # type: ignore -59 ) -60 if enable_hive_support: -61 session_builder = session_builder.enableHiveSupport() -62 cls.SESSION = session_builder.getOrCreate() -63 -64 cls._set_environment_variables(final_config.get("os_env_vars")) +diff --git a/lakehouse_engine/core/file_manager.html b/lakehouse_engine/core/file_manager.html index 34af7db..8ae2cd2 100644 --- a/lakehouse_engine/core/file_manager.html +++ b/lakehouse_engine/core/file_manager.html @@ -97,7 +97,7 @@20 @classmethod +21 def get_or_create( +22 cls, +23 session: SparkSession = None, +24 enable_hive_support: bool = True, +25 app_name: str = None, +26 config: dict = None, +27 ) -> None: +28 """Get or create an execution environment session (currently Spark). +29 +30 It instantiates a singleton session that can be accessed anywhere from the +31 lakehouse engine. +32 +33 Args: +34 session: spark session. +35 enable_hive_support: whether to enable hive support or not. +36 app_name: application name. +37 config: extra spark configs to supply to the spark session. 
+38 """ +39 default_config = { +40 "spark.databricks.delta.optimizeWrite.enabled": True, +41 "spark.sql.adaptive.enabled": True, +42 "spark.databricks.delta.merge.enableLowShuffle": True, +43 } +44 cls._LOGGER.info( +45 f"Using the following default configs you may want to override them for " +46 f"your job: {default_config}" +47 ) +48 final_config: dict = {**default_config, **(config if config else {})} +49 cls._LOGGER.info(f"Final config is: {final_config}") +50 +51 if session: +52 cls.SESSION = session +53 else: +54 # with active session we do not need app name +55 if SparkSession.getActiveSession(): +56 app_name = SparkSession.getActiveSession().sparkContext.appName +57 cls._LOGGER.info(f"Detected active session: {app_name}") +58 elif not SparkSession.getActiveSession() and not app_name: +59 cls._LOGGER.info("No active session or appname detected") +60 app_name = "lakehouse_engine" +61 # we will still add this part to set configs +62 session_builder = SparkSession.builder.appName(app_name) +63 if config: +64 for k, v in final_config.items(): +65 session_builder.config(k, v) +66 +67 if enable_hive_support: +68 session_builder = session_builder.enableHiveSupport() +69 cls.SESSION = session_builder.getOrCreate() +70 +71 cls._set_environment_variables(final_config.get("os_env_vars"))
@@ -636,218 +729,247 @@1"""File manager module.""" 2import time - 3from typing import Any, Optional + 3from typing import Any, Optional, Tuple 4 5import boto3 6 @@ -111,516 +111,609 @@14from lakehouse_engine.utils.logging_handler import LoggingHandler 15 16 - 17def _dry_run(bucket: str, object_paths: list) -> dict: - 18 """Build the dry run request return format. + 17def _process_directory_path(path: str) -> str: + 18 """Add '/' to the end of the path of a directory. 19 20 Args: - 21 bucket: name of bucket to perform operation. - 22 object_paths: paths of object to list. - 23 - 24 Returns: - 25 A dict with a list of objects that would be copied/deleted. - 26 """ - 27 response = {} + 21 path: directory to be processed + 22 + 23 Returns: + 24 Directory path stripped and with '/' at the end. + 25 """ + 26 path = path.strip() + 27 return path if path[-1] == "/" else path + "/" 28 - 29 for path in object_paths: - 30 path = path.strip() - 31 res = _list_objects_recursively(bucket=bucket, path=path) + 29 + 30def _dry_run(bucket: str, object_paths: list) -> dict: + 31 """Build the dry run request return format. 32 - 33 if res: - 34 response[path] = res - 35 else: - 36 response[path] = ["No such key"] - 37 - 38 return response - 39 - 40 - 41def _list_objects_recursively(bucket: str, path: str) -> list: - 42 """Recursively list all objects given a prefix in s3. - 43 - 44 Args: - 45 bucket: name of bucket to perform the list. - 46 path: path to be used as a prefix. + 33 Args: + 34 bucket: name of bucket to perform operation. + 35 object_paths: paths of object to list. + 36 + 37 Returns: + 38 A dict with a list of objects that would be copied/deleted. + 39 """ + 40 response = {} + 41 + 42 for path in object_paths: + 43 if _check_directory(bucket, path): + 44 path = _process_directory_path(path) + 45 + 46 res = _list_objects_recursively(bucket=bucket, path=path) 47 - 48 Returns: - 49 A list of object names fetched recursively. - 50 """ - 51 object_list = [] - 52 more_objects = True - 53 pagination = "" + 48 if res: + 49 response[path] = res + 50 else: + 51 response[path] = ["No such key"] + 52 + 53 return response 54 - 55 s3 = boto3.client("s3") - 56 - 57 while more_objects: - 58 if not pagination: - 59 list_response = s3.list_objects_v2(Bucket=bucket, Prefix=path) - 60 else: - 61 list_response = s3.list_objects_v2( - 62 Bucket=bucket, - 63 Prefix=path, - 64 ContinuationToken=pagination, - 65 ) - 66 - 67 if FileManagerAPIKeys.CONTENTS.value in list_response: - 68 for obj in list_response[FileManagerAPIKeys.CONTENTS.value]: - 69 object_list.append(obj[FileManagerAPIKeys.KEY.value]) + 55 + 56def _list_objects( + 57 s3_client: Any, bucket: str, path: str, paginator: str = "" + 58) -> Tuple[list, str]: + 59 """List 1000 objects in a bucket given a prefix and paginator in s3. + 60 + 61 Args: + 62 bucket: name of bucket to perform the list. + 63 path: path to be used as a prefix. + 64 paginator: paginator token to be used. + 65 + 66 Returns: + 67 A list of object names. 
+ 68 """ + 69 object_list = [] 70 - 71 if FileManagerAPIKeys.CONTINUATION.value in list_response: - 72 pagination = list_response[FileManagerAPIKeys.CONTINUATION.value] - 73 else: - 74 more_objects = False - 75 - 76 return object_list - 77 - 78 - 79class FileManager(object): - 80 """Set of actions to manipulate files in several ways.""" - 81 - 82 _logger = LoggingHandler(__name__).get_logger() + 71 if not paginator: + 72 list_response = s3_client.list_objects_v2(Bucket=bucket, Prefix=path) + 73 else: + 74 list_response = s3_client.list_objects_v2( + 75 Bucket=bucket, + 76 Prefix=path, + 77 ContinuationToken=paginator, + 78 ) + 79 + 80 if FileManagerAPIKeys.CONTENTS.value in list_response: + 81 for obj in list_response[FileManagerAPIKeys.CONTENTS.value]: + 82 object_list.append(obj[FileManagerAPIKeys.KEY.value]) 83 - 84 def __init__(self, configs: dict): - 85 """Construct FileManager algorithm instances. - 86 - 87 Args: - 88 configs: configurations for the FileManager algorithm. - 89 """ - 90 self.configs = configs - 91 self.function = self.configs["function"] - 92 - 93 def get_function(self) -> None: - 94 """Get a specific function to execute.""" - 95 available_functions = { - 96 "delete_objects": self.delete_objects, - 97 "copy_objects": self.copy_objects, - 98 "request_restore": self.request_restore, - 99 "check_restore_status": self.check_restore_status, -100 "request_restore_to_destination_and_wait": ( -101 self.request_restore_to_destination_and_wait -102 ), -103 } -104 -105 self._logger.info("Function being executed: {}".format(self.function)) -106 if self.function in available_functions.keys(): -107 func = available_functions[self.function] -108 func() -109 else: -110 raise NotImplementedError( -111 f"The requested function {self.function} is not implemented." -112 ) -113 -114 def delete_objects(self) -> None: -115 """Delete objects and 'directories' in s3. -116 -117 If dry_run is set to True the function will print a dict with all the -118 paths that would be deleted based on the given keys. -119 """ -120 bucket = self.configs["bucket"] -121 objects_paths = self.configs["object_paths"] -122 dry_run = self.configs["dry_run"] -123 -124 s3 = boto3.client("s3") + 84 if FileManagerAPIKeys.CONTINUATION.value in list_response: + 85 pagination = list_response[FileManagerAPIKeys.CONTINUATION.value] + 86 else: + 87 pagination = "" + 88 + 89 return object_list, pagination + 90 + 91 + 92def _list_objects_recursively(bucket: str, path: str) -> list: + 93 """Recursively list all objects given a prefix in s3. + 94 + 95 Args: + 96 bucket: name of bucket to perform the list. + 97 path: path to be used as a prefix. + 98 + 99 Returns: +100 A list of object names fetched recursively. +101 """ +102 object_list = [] +103 more_objects = True +104 paginator = "" +105 +106 s3 = boto3.client("s3") +107 +108 while more_objects: +109 temp_list, paginator = _list_objects(s3, bucket, path, paginator) +110 +111 object_list.extend(temp_list) +112 +113 if not paginator: +114 more_objects = False +115 +116 return object_list +117 +118 +119def _check_directory(bucket: str, path: str) -> bool: +120 """Checks if the object is a 'directory' in s3. +121 +122 Args: +123 bucket: name of bucket to perform the check. +124 path: path to be used as a prefix. 
125 -126 if dry_run: -127 response = _dry_run(bucket=bucket, object_paths=objects_paths) -128 -129 self._logger.info("Paths that would be deleted:") -130 else: -131 objects_to_delete = [] -132 for path in objects_paths: -133 for obj in _list_objects_recursively(bucket=bucket, path=path): -134 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) -135 -136 response = s3.delete_objects( -137 Bucket=bucket, -138 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, -139 ) -140 -141 self._logger.info(response) -142 -143 def copy_objects(self) -> None: -144 """Copies objects and 'directories' in s3.""" -145 source_bucket = self.configs["bucket"] -146 source_object = self.configs["source_object"] -147 destination_bucket = self.configs["destination_bucket"] -148 destination_object = self.configs["destination_object"] -149 dry_run = self.configs["dry_run"] -150 -151 FileManager._copy_objects( -152 source_bucket=source_bucket, -153 source_object=source_object, -154 destination_bucket=destination_bucket, -155 destination_object=destination_object, -156 dry_run=dry_run, -157 ) -158 -159 def request_restore(self) -> None: -160 """Request the restore of archived data.""" -161 source_bucket = self.configs["bucket"] -162 source_object = self.configs["source_object"] -163 restore_expiration = self.configs["restore_expiration"] -164 retrieval_tier = self.configs["retrieval_tier"] -165 dry_run = self.configs["dry_run"] -166 -167 ArchiveFileManager.request_restore( -168 source_bucket, -169 source_object, -170 restore_expiration, -171 retrieval_tier, -172 dry_run, -173 ) -174 -175 def check_restore_status(self) -> None: -176 """Check the restore status of archived data.""" -177 source_bucket = self.configs["bucket"] -178 source_object = self.configs["source_object"] -179 -180 restore_status = ArchiveFileManager.check_restore_status( -181 source_bucket, source_object -182 ) +126 Returns: +127 If path represents a 'directory'. +128 """ +129 s3 = boto3.client("s3") +130 objects, _ = _list_objects(s3, bucket, path) +131 return len(objects) > 1 +132 +133 +134class FileManager(object): +135 """Set of actions to manipulate files in several ways.""" +136 +137 _logger = LoggingHandler(__name__).get_logger() +138 +139 def __init__(self, configs: dict): +140 """Construct FileManager algorithm instances. +141 +142 Args: +143 configs: configurations for the FileManager algorithm. +144 """ +145 self.configs = configs +146 self.function = self.configs["function"] +147 +148 def get_function(self) -> None: +149 """Get a specific function to execute.""" +150 available_functions = { +151 "delete_objects": self.delete_objects, +152 "copy_objects": self.copy_objects, +153 "request_restore": self.request_restore, +154 "check_restore_status": self.check_restore_status, +155 "request_restore_to_destination_and_wait": ( +156 self.request_restore_to_destination_and_wait +157 ), +158 } +159 +160 self._logger.info("Function being executed: {}".format(self.function)) +161 if self.function in available_functions.keys(): +162 func = available_functions[self.function] +163 func() +164 else: +165 raise NotImplementedError( +166 f"The requested function {self.function} is not implemented." +167 ) +168 +169 def _delete_objects(self, bucket: str, objects_paths: list) -> None: +170 """Delete objects recursively in s3. +171 +172 Params: +173 bucket: name of bucket to perform the delete operation. +174 objects_paths: objects to be deleted. 
+175 """ +176 s3 = boto3.client("s3") +177 +178 for path in objects_paths: +179 if _check_directory(bucket, path): +180 path = _process_directory_path(path) +181 else: +182 path = path.strip() 183 -184 self._logger.info( -185 f""" -186 Restore status: -187 - Not Started: {restore_status.get('not_started_objects')} -188 - Ongoing: {restore_status.get('ongoing_objects')} -189 - Restored: {restore_status.get('restored_objects')} -190 Total objects in this restore process: {restore_status.get('total_objects')} -191 """ -192 ) -193 -194 def request_restore_to_destination_and_wait(self) -> None: -195 """Request and wait for the restore to complete, polling the restore status. -196 -197 After the restore is done, copy the restored files to destination -198 """ -199 source_bucket = self.configs["bucket"] -200 source_object = self.configs["source_object"] -201 destination_bucket = self.configs["destination_bucket"] -202 destination_object = self.configs["destination_object"] -203 restore_expiration = self.configs["restore_expiration"] -204 retrieval_tier = self.configs["retrieval_tier"] -205 dry_run = self.configs["dry_run"] -206 -207 ArchiveFileManager.request_restore_and_wait( -208 source_bucket=source_bucket, -209 source_object=source_object, -210 restore_expiration=restore_expiration, -211 retrieval_tier=retrieval_tier, -212 dry_run=dry_run, -213 ) +184 more_objects = True +185 paginator = "" +186 objects_to_delete = [] +187 +188 while more_objects: +189 objects_found, paginator = _list_objects( +190 s3_client=s3, bucket=bucket, path=path, paginator=paginator +191 ) +192 for obj in objects_found: +193 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) +194 +195 if not paginator: +196 more_objects = False +197 +198 response = s3.delete_objects( +199 Bucket=bucket, +200 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, +201 ) +202 self._logger.info(response) +203 objects_to_delete = [] +204 +205 def delete_objects(self) -> None: +206 """Delete objects and 'directories' in s3. +207 +208 If dry_run is set to True the function will print a dict with all the +209 paths that would be deleted based on the given keys. +210 """ +211 bucket = self.configs["bucket"] +212 objects_paths = self.configs["object_paths"] +213 dry_run = self.configs["dry_run"] 214 -215 FileManager._logger.info( -216 f"Restoration complete for {source_bucket} and {source_object}" -217 ) -218 FileManager._logger.info( -219 f"Starting to copy data from {source_bucket}/{source_object} to " -220 f"{destination_bucket}/{destination_object}" -221 ) -222 FileManager._copy_objects( -223 source_bucket=source_bucket, -224 source_object=source_object, -225 destination_bucket=destination_bucket, -226 destination_object=destination_object, -227 dry_run=dry_run, -228 ) -229 FileManager._logger.info( -230 f"Finished copying data, data should be available on {destination_bucket}/" -231 f"{destination_object}" -232 ) -233 -234 @staticmethod -235 def _copy_objects( -236 source_bucket: str, -237 source_object: str, -238 destination_bucket: str, -239 destination_object: str, -240 dry_run: bool, -241 ) -> None: -242 """Copies objects and 'directories' in s3. -243 -244 Args: -245 source_bucket: name of bucket to perform the copy. -246 source_object: object/folder to be copied. -247 destination_bucket: name of the target bucket to copy. -248 destination_object: target object/folder to copy. -249 dry_run: if dry_run is set to True the function will print a dict with -250 all the paths that would be deleted based on the given keys. 
-251 """ -252 s3 = boto3.client("s3") -253 -254 if dry_run: -255 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -256 -257 FileManager._logger.info("Paths that would be copied:") -258 FileManager._logger.info(response) -259 else: -260 copy_object = _list_objects_recursively( -261 bucket=source_bucket, path=source_object -262 ) +215 if dry_run: +216 response = _dry_run(bucket=bucket, object_paths=objects_paths) +217 +218 self._logger.info("Paths that would be deleted:") +219 self._logger.info(response) +220 else: +221 self._delete_objects(bucket, objects_paths) +222 +223 def copy_objects(self) -> None: +224 """Copies objects and 'directories' in s3.""" +225 source_bucket = self.configs["bucket"] +226 source_object = self.configs["source_object"] +227 destination_bucket = self.configs["destination_bucket"] +228 destination_object = self.configs["destination_object"] +229 dry_run = self.configs["dry_run"] +230 +231 FileManager._copy_objects( +232 source_bucket=source_bucket, +233 source_object=source_object, +234 destination_bucket=destination_bucket, +235 destination_object=destination_object, +236 dry_run=dry_run, +237 ) +238 +239 def request_restore(self) -> None: +240 """Request the restore of archived data.""" +241 source_bucket = self.configs["bucket"] +242 source_object = self.configs["source_object"] +243 restore_expiration = self.configs["restore_expiration"] +244 retrieval_tier = self.configs["retrieval_tier"] +245 dry_run = self.configs["dry_run"] +246 +247 ArchiveFileManager.request_restore( +248 source_bucket, +249 source_object, +250 restore_expiration, +251 retrieval_tier, +252 dry_run, +253 ) +254 +255 def check_restore_status(self) -> None: +256 """Check the restore status of archived data.""" +257 source_bucket = self.configs["bucket"] +258 source_object = self.configs["source_object"] +259 +260 restore_status = ArchiveFileManager.check_restore_status( +261 source_bucket, source_object +262 ) 263 -264 if len(copy_object) == 1: -265 FileManager._logger.info(f"Copying obj: {source_object}") -266 -267 response = s3.copy_object( -268 Bucket=destination_bucket, -269 CopySource={ -270 FileManagerAPIKeys.BUCKET.value: source_bucket, -271 FileManagerAPIKeys.KEY.value: source_object, -272 }, -273 Key=f"""{destination_object}/{copy_object[0].split("/")[-1]}""", -274 ) -275 FileManager._logger.info(response) -276 else: -277 for obj in copy_object: -278 FileManager._logger.info(f"Copying obj: {obj}") -279 -280 final_path = obj.replace(source_object, "") -281 -282 response = s3.copy_object( -283 Bucket=destination_bucket, -284 CopySource={ -285 FileManagerAPIKeys.BUCKET.value: source_bucket, -286 FileManagerAPIKeys.KEY.value: obj, -287 }, -288 Key=f"{destination_object}{final_path}", -289 ) -290 FileManager._logger.info(response) -291 -292 -293class ArchiveFileManager(object): -294 """Set of actions to restore archives.""" -295 -296 _logger = LoggingHandler(__name__).get_logger() -297 -298 @staticmethod -299 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: -300 """Get the archived object if it's an object. -301 -302 Args: -303 bucket: name of bucket to check get the object. -304 object_key: object to get. -305 -306 Returns: -307 S3 Object if it's an archived object, otherwise None. 
-308 """ -309 s3 = boto3.resource("s3") -310 object_to_restore = s3.Object(bucket, object_key) -311 -312 if ( -313 object_to_restore.storage_class is not None -314 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS -315 ): -316 return object_to_restore -317 else: -318 return None -319 -320 @staticmethod -321 def _check_object_restore_status( -322 bucket: str, object_key: str -323 ) -> Optional[RestoreStatus]: -324 """Check the restore status of the archive. -325 -326 Args: -327 bucket: name of bucket to check the restore status. -328 object_key: object to check the restore status. -329 -330 Returns: -331 The restore status represented by an enum, possible values are: -332 NOT_STARTED, ONGOING or RESTORED -333 """ -334 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -335 -336 if archived_object is None: -337 status = None -338 elif archived_object.restore is None: -339 status = RestoreStatus.NOT_STARTED -340 elif 'ongoing-request="true"' in archived_object.restore: -341 status = RestoreStatus.ONGOING -342 else: -343 status = RestoreStatus.RESTORED +264 self._logger.info( +265 f""" +266 Restore status: +267 - Not Started: {restore_status.get('not_started_objects')} +268 - Ongoing: {restore_status.get('ongoing_objects')} +269 - Restored: {restore_status.get('restored_objects')} +270 Total objects in this restore process: {restore_status.get('total_objects')} +271 """ +272 ) +273 +274 def request_restore_to_destination_and_wait(self) -> None: +275 """Request and wait for the restore to complete, polling the restore status. +276 +277 After the restore is done, copy the restored files to destination +278 """ +279 source_bucket = self.configs["bucket"] +280 source_object = self.configs["source_object"] +281 destination_bucket = self.configs["destination_bucket"] +282 destination_object = self.configs["destination_object"] +283 restore_expiration = self.configs["restore_expiration"] +284 retrieval_tier = self.configs["retrieval_tier"] +285 dry_run = self.configs["dry_run"] +286 +287 ArchiveFileManager.request_restore_and_wait( +288 source_bucket=source_bucket, +289 source_object=source_object, +290 restore_expiration=restore_expiration, +291 retrieval_tier=retrieval_tier, +292 dry_run=dry_run, +293 ) +294 +295 FileManager._logger.info( +296 f"Restoration complete for {source_bucket} and {source_object}" +297 ) +298 FileManager._logger.info( +299 f"Starting to copy data from {source_bucket}/{source_object} to " +300 f"{destination_bucket}/{destination_object}" +301 ) +302 FileManager._copy_objects( +303 source_bucket=source_bucket, +304 source_object=source_object, +305 destination_bucket=destination_bucket, +306 destination_object=destination_object, +307 dry_run=dry_run, +308 ) +309 FileManager._logger.info( +310 f"Finished copying data, data should be available on {destination_bucket}/" +311 f"{destination_object}" +312 ) +313 +314 @staticmethod +315 def _copy_objects( +316 source_bucket: str, +317 source_object: str, +318 destination_bucket: str, +319 destination_object: str, +320 dry_run: bool, +321 ) -> None: +322 """Copies objects and 'directories' in s3. +323 +324 Args: +325 source_bucket: name of bucket to perform the copy. +326 source_object: object/folder to be copied. +327 destination_bucket: name of the target bucket to copy. +328 destination_object: target object/folder to copy. +329 dry_run: if dry_run is set to True the function will print a dict with +330 all the paths that would be deleted based on the given keys. 
+331 """ +332 s3 = boto3.client("s3") +333 +334 if dry_run: +335 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +336 +337 FileManager._logger.info("Paths that would be copied:") +338 FileManager._logger.info(response) +339 else: +340 original_object_name = source_object.split("/")[-1] +341 +342 if _check_directory(source_bucket, source_object): +343 source_object = _process_directory_path(source_object) 344 -345 return status -346 -347 @staticmethod -348 def check_restore_status(source_bucket: str, source_object: str) -> dict: -349 """Check the restore status of archived data. -350 -351 Args: -352 source_bucket: name of bucket to check the restore status. -353 source_object: object to check the restore status. -354 -355 Returns: -356 A dict containing the amount of objects in each status. -357 """ -358 not_started_objects = 0 -359 ongoing_objects = 0 -360 restored_objects = 0 -361 total_objects = 0 -362 -363 objects_to_restore = _list_objects_recursively( -364 bucket=source_bucket, path=source_object -365 ) -366 -367 for obj in objects_to_restore: -368 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") -369 -370 restore_status = ArchiveFileManager._check_object_restore_status( -371 source_bucket, obj -372 ) -373 if not restore_status: -374 ArchiveFileManager._logger.warning( -375 f"Restore status not found for {source_bucket}/{obj}" -376 ) -377 else: -378 total_objects += 1 +345 copy_object = _list_objects_recursively( +346 bucket=source_bucket, path=source_object +347 ) +348 +349 for obj in copy_object: +350 FileManager._logger.info(f"Copying obj: {obj}") +351 +352 final_path = obj.replace(source_object, "") +353 +354 response = s3.copy_object( +355 Bucket=destination_bucket, +356 CopySource={ +357 FileManagerAPIKeys.BUCKET.value: source_bucket, +358 FileManagerAPIKeys.KEY.value: obj, +359 }, +360 Key=f"{destination_object}/{original_object_name}/{final_path}", +361 ) +362 FileManager._logger.info(response) +363 else: +364 FileManager._logger.info(f"Copying obj: {source_object}") +365 +366 response = s3.copy_object( +367 Bucket=destination_bucket, +368 CopySource={ +369 FileManagerAPIKeys.BUCKET.value: source_bucket, +370 FileManagerAPIKeys.KEY.value: source_object, +371 }, +372 Key=f"""{destination_object}/{original_object_name}""", +373 ) +374 FileManager._logger.info(response) +375 +376 +377class ArchiveFileManager(object): +378 """Set of actions to restore archives.""" 379 -380 if RestoreStatus.NOT_STARTED == restore_status: -381 not_started_objects += 1 -382 elif RestoreStatus.ONGOING == restore_status: -383 ongoing_objects += 1 -384 else: -385 restored_objects += 1 -386 -387 ArchiveFileManager._logger.info( -388 f"{obj} restore status is {restore_status.value}" -389 ) -390 -391 return { -392 "total_objects": total_objects, -393 "not_started_objects": not_started_objects, -394 "ongoing_objects": ongoing_objects, -395 "restored_objects": restored_objects, -396 } -397 -398 @staticmethod -399 def _request_restore_object( -400 bucket: str, object_key: str, expiration: int, retrieval_tier: str -401 ) -> None: -402 """Request a restore of the archive. +380 _logger = LoggingHandler(__name__).get_logger() +381 +382 @staticmethod +383 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: +384 """Get the archived object if it's an object. +385 +386 Args: +387 bucket: name of bucket to check get the object. +388 object_key: object to get. +389 +390 Returns: +391 S3 Object if it's an archived object, otherwise None. 
+392 """ +393 s3 = boto3.resource("s3") +394 object_to_restore = s3.Object(bucket, object_key) +395 +396 if ( +397 object_to_restore.storage_class is not None +398 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS +399 ): +400 return object_to_restore +401 else: +402 return None 403 -404 Args: -405 bucket: name of bucket to perform the restore. -406 object_key: object to be restored. -407 expiration: restore expiration. -408 retrieval_tier: type of restore, possible values are: -409 Bulk, Standard or Expedited. -410 """ -411 if not RestoreType.exists(retrieval_tier): -412 raise RestoreTypeNotFoundException( -413 f"Restore type {retrieval_tier} not supported." -414 ) -415 -416 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -417 -418 if archived_object and archived_object.restore is None: -419 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") -420 archived_object.restore_object( -421 RestoreRequest={ -422 "Days": expiration, -423 "GlacierJobParameters": {"Tier": retrieval_tier}, -424 } -425 ) +404 @staticmethod +405 def _check_object_restore_status( +406 bucket: str, object_key: str +407 ) -> Optional[RestoreStatus]: +408 """Check the restore status of the archive. +409 +410 Args: +411 bucket: name of bucket to check the restore status. +412 object_key: object to check the restore status. +413 +414 Returns: +415 The restore status represented by an enum, possible values are: +416 NOT_STARTED, ONGOING or RESTORED +417 """ +418 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +419 +420 if archived_object is None: +421 status = None +422 elif archived_object.restore is None: +423 status = RestoreStatus.NOT_STARTED +424 elif 'ongoing-request="true"' in archived_object.restore: +425 status = RestoreStatus.ONGOING 426 else: -427 ArchiveFileManager._logger.info( -428 f"Restore request for {bucket}/{object_key} not performed." -429 ) +427 status = RestoreStatus.RESTORED +428 +429 return status 430 431 @staticmethod -432 def request_restore( -433 source_bucket: str, -434 source_object: str, -435 restore_expiration: int, -436 retrieval_tier: str, -437 dry_run: bool, -438 ) -> None: -439 """Request the restore of archived data. -440 -441 Args: -442 source_bucket: name of bucket to perform the restore. -443 source_object: object to be restored. -444 restore_expiration: restore expiration in days. -445 retrieval_tier: type of restore, possible values are: -446 Bulk, Standard or Expedited. -447 dry_run: if dry_run is set to True the function will print a dict with -448 all the paths that would be deleted based on the given keys. -449 """ -450 if dry_run: -451 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -452 -453 ArchiveFileManager._logger.info("Paths that would be restored:") -454 ArchiveFileManager._logger.info(response) -455 else: -456 objects_to_restore = _list_objects_recursively( -457 bucket=source_bucket, path=source_object -458 ) -459 -460 for obj in objects_to_restore: -461 ArchiveFileManager._request_restore_object( -462 source_bucket, -463 obj, -464 restore_expiration, -465 retrieval_tier, -466 ) -467 -468 @staticmethod -469 def request_restore_and_wait( -470 source_bucket: str, -471 source_object: str, -472 restore_expiration: int, -473 retrieval_tier: str, -474 dry_run: bool, -475 ) -> None: -476 """Request and wait for the restore to complete, polling the restore status. 
+432 def check_restore_status(source_bucket: str, source_object: str) -> dict: +433 """Check the restore status of archived data. +434 +435 Args: +436 source_bucket: name of bucket to check the restore status. +437 source_object: object to check the restore status. +438 +439 Returns: +440 A dict containing the amount of objects in each status. +441 """ +442 not_started_objects = 0 +443 ongoing_objects = 0 +444 restored_objects = 0 +445 total_objects = 0 +446 +447 if _check_directory(source_bucket, source_object): +448 source_object = _process_directory_path(source_object) +449 +450 objects_to_restore = _list_objects_recursively( +451 bucket=source_bucket, path=source_object +452 ) +453 +454 for obj in objects_to_restore: +455 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") +456 +457 restore_status = ArchiveFileManager._check_object_restore_status( +458 source_bucket, obj +459 ) +460 if not restore_status: +461 ArchiveFileManager._logger.warning( +462 f"Restore status not found for {source_bucket}/{obj}" +463 ) +464 else: +465 total_objects += 1 +466 +467 if RestoreStatus.NOT_STARTED == restore_status: +468 not_started_objects += 1 +469 elif RestoreStatus.ONGOING == restore_status: +470 ongoing_objects += 1 +471 else: +472 restored_objects += 1 +473 +474 ArchiveFileManager._logger.info( +475 f"{obj} restore status is {restore_status.value}" +476 ) 477 -478 Args: -479 source_bucket: name of bucket to perform the restore. -480 source_object: object to be restored. -481 restore_expiration: restore expiration in days. -482 retrieval_tier: type of restore, possible values are: -483 Bulk, Standard or Expedited. -484 dry_run: if dry_run is set to True the function will print a dict with -485 all the paths that would be deleted based on the given keys. -486 """ -487 if retrieval_tier != RestoreType.EXPEDITED.value: -488 ArchiveFileManager._logger.error( -489 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -490 "kind of restore should be used just with `Expedited` retrieval tier " -491 "to save cluster costs." -492 ) -493 raise ValueError( -494 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -495 "kind of restore should be used just with `Expedited` retrieval tier " -496 "to save cluster costs." -497 ) -498 -499 ArchiveFileManager.request_restore( -500 source_bucket=source_bucket, -501 source_object=source_object, -502 restore_expiration=restore_expiration, -503 retrieval_tier=retrieval_tier, -504 dry_run=dry_run, -505 ) -506 restore_status = ArchiveFileManager.check_restore_status( -507 source_bucket, source_object -508 ) -509 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") -510 -511 if not dry_run: -512 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") -513 wait_time = 300 -514 while restore_status.get("total_objects") > restore_status.get( -515 "restored_objects" -516 ): -517 ArchiveFileManager._logger.info( -518 "Not all objects have been restored yet, checking the status again " -519 f"in {wait_time} seconds." 
-520 ) -521 time.sleep(wait_time) -522 wait_time = 30 -523 restore_status = ArchiveFileManager.check_restore_status( -524 source_bucket, source_object -525 ) -526 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +478 return { +479 "total_objects": total_objects, +480 "not_started_objects": not_started_objects, +481 "ongoing_objects": ongoing_objects, +482 "restored_objects": restored_objects, +483 } +484 +485 @staticmethod +486 def _request_restore_object( +487 bucket: str, object_key: str, expiration: int, retrieval_tier: str +488 ) -> None: +489 """Request a restore of the archive. +490 +491 Args: +492 bucket: name of bucket to perform the restore. +493 object_key: object to be restored. +494 expiration: restore expiration. +495 retrieval_tier: type of restore, possible values are: +496 Bulk, Standard or Expedited. +497 """ +498 if not RestoreType.exists(retrieval_tier): +499 raise RestoreTypeNotFoundException( +500 f"Restore type {retrieval_tier} not supported." +501 ) +502 +503 if _check_directory(bucket, object_key): +504 object_key = _process_directory_path(object_key) +505 +506 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +507 +508 if archived_object and archived_object.restore is None: +509 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") +510 archived_object.restore_object( +511 RestoreRequest={ +512 "Days": expiration, +513 "GlacierJobParameters": {"Tier": retrieval_tier}, +514 } +515 ) +516 else: +517 ArchiveFileManager._logger.info( +518 f"Restore request for {bucket}/{object_key} not performed." +519 ) +520 +521 @staticmethod +522 def request_restore( +523 source_bucket: str, +524 source_object: str, +525 restore_expiration: int, +526 retrieval_tier: str, +527 dry_run: bool, +528 ) -> None: +529 """Request the restore of archived data. +530 +531 Args: +532 source_bucket: name of bucket to perform the restore. +533 source_object: object to be restored. +534 restore_expiration: restore expiration in days. +535 retrieval_tier: type of restore, possible values are: +536 Bulk, Standard or Expedited. +537 dry_run: if dry_run is set to True the function will print a dict with +538 all the paths that would be deleted based on the given keys. +539 """ +540 if _check_directory(source_bucket, source_object): +541 source_object = _process_directory_path(source_object) +542 +543 if dry_run: +544 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +545 +546 ArchiveFileManager._logger.info("Paths that would be restored:") +547 ArchiveFileManager._logger.info(response) +548 else: +549 objects_to_restore = _list_objects_recursively( +550 bucket=source_bucket, path=source_object +551 ) +552 +553 for obj in objects_to_restore: +554 ArchiveFileManager._request_restore_object( +555 source_bucket, +556 obj, +557 restore_expiration, +558 retrieval_tier, +559 ) +560 +561 @staticmethod +562 def request_restore_and_wait( +563 source_bucket: str, +564 source_object: str, +565 restore_expiration: int, +566 retrieval_tier: str, +567 dry_run: bool, +568 ) -> None: +569 """Request and wait for the restore to complete, polling the restore status. +570 +571 Args: +572 source_bucket: name of bucket to perform the restore. +573 source_object: object to be restored. +574 restore_expiration: restore expiration in days. +575 retrieval_tier: type of restore, possible values are: +576 Bulk, Standard or Expedited. 
+577 dry_run: if dry_run is set to True the function will print a dict with +578 all the paths that would be deleted based on the given keys. +579 """ +580 if retrieval_tier != RestoreType.EXPEDITED.value: +581 ArchiveFileManager._logger.error( +582 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +583 "kind of restore should be used just with `Expedited` retrieval tier " +584 "to save cluster costs." +585 ) +586 raise ValueError( +587 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +588 "kind of restore should be used just with `Expedited` retrieval tier " +589 "to save cluster costs." +590 ) +591 +592 ArchiveFileManager.request_restore( +593 source_bucket=source_bucket, +594 source_object=source_object, +595 restore_expiration=restore_expiration, +596 retrieval_tier=retrieval_tier, +597 dry_run=dry_run, +598 ) +599 restore_status = ArchiveFileManager.check_restore_status( +600 source_bucket, source_object +601 ) +602 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +603 +604 if not dry_run: +605 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") +606 wait_time = 300 +607 while restore_status.get("total_objects") > restore_status.get( +608 "restored_objects" +609 ): +610 ArchiveFileManager._logger.info( +611 "Not all objects have been restored yet, checking the status again " +612 f"in {wait_time} seconds." +613 ) +614 time.sleep(wait_time) +615 wait_time = 30 +616 restore_status = ArchiveFileManager.check_restore_status( +617 source_bucket, source_object +618 ) +619 ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
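For orientation, a sketch of driving these operations through the FileManager config dict, assuming AWS credentials are available to boto3; the bucket and prefixes are illustrative, and dry_run=True keeps the call read-only:

    from lakehouse_engine.core.file_manager import FileManager

    configs = {
        "function": "delete_objects",        # one of the keys in available_functions
        "bucket": "my-lakehouse-bucket",     # illustrative bucket
        "object_paths": [
            "bronze/sales",                  # prefixes resolving to several keys get "/" appended
            "bronze/returns/part-0000.parquet",
        ],
        "dry_run": True,                     # only logs the paths that would be deleted
    }

    FileManager(configs).get_function()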
80class FileManager(object): - 81 """Set of actions to manipulate files in several ways.""" - 82 - 83 _logger = LoggingHandler(__name__).get_logger() - 84 - 85 def __init__(self, configs: dict): - 86 """Construct FileManager algorithm instances. - 87 - 88 Args: - 89 configs: configurations for the FileManager algorithm. - 90 """ - 91 self.configs = configs - 92 self.function = self.configs["function"] - 93 - 94 def get_function(self) -> None: - 95 """Get a specific function to execute.""" - 96 available_functions = { - 97 "delete_objects": self.delete_objects, - 98 "copy_objects": self.copy_objects, - 99 "request_restore": self.request_restore, -100 "check_restore_status": self.check_restore_status, -101 "request_restore_to_destination_and_wait": ( -102 self.request_restore_to_destination_and_wait -103 ), -104 } -105 -106 self._logger.info("Function being executed: {}".format(self.function)) -107 if self.function in available_functions.keys(): -108 func = available_functions[self.function] -109 func() -110 else: -111 raise NotImplementedError( -112 f"The requested function {self.function} is not implemented." -113 ) -114 -115 def delete_objects(self) -> None: -116 """Delete objects and 'directories' in s3. -117 -118 If dry_run is set to True the function will print a dict with all the -119 paths that would be deleted based on the given keys. -120 """ -121 bucket = self.configs["bucket"] -122 objects_paths = self.configs["object_paths"] -123 dry_run = self.configs["dry_run"] -124 -125 s3 = boto3.client("s3") -126 -127 if dry_run: -128 response = _dry_run(bucket=bucket, object_paths=objects_paths) -129 -130 self._logger.info("Paths that would be deleted:") -131 else: -132 objects_to_delete = [] -133 for path in objects_paths: -134 for obj in _list_objects_recursively(bucket=bucket, path=path): -135 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) -136 -137 response = s3.delete_objects( -138 Bucket=bucket, -139 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, -140 ) -141 -142 self._logger.info(response) -143 -144 def copy_objects(self) -> None: -145 """Copies objects and 'directories' in s3.""" -146 source_bucket = self.configs["bucket"] -147 source_object = self.configs["source_object"] -148 destination_bucket = self.configs["destination_bucket"] -149 destination_object = self.configs["destination_object"] -150 dry_run = self.configs["dry_run"] -151 -152 FileManager._copy_objects( -153 source_bucket=source_bucket, -154 source_object=source_object, -155 destination_bucket=destination_bucket, -156 destination_object=destination_object, -157 dry_run=dry_run, -158 ) -159 -160 def request_restore(self) -> None: -161 """Request the restore of archived data.""" -162 source_bucket = self.configs["bucket"] -163 source_object = self.configs["source_object"] -164 restore_expiration = self.configs["restore_expiration"] -165 retrieval_tier = self.configs["retrieval_tier"] -166 dry_run = self.configs["dry_run"] -167 -168 ArchiveFileManager.request_restore( -169 source_bucket, -170 source_object, -171 restore_expiration, -172 retrieval_tier, -173 dry_run, -174 ) -175 -176 def check_restore_status(self) -> None: -177 """Check the restore status of archived data.""" -178 source_bucket = self.configs["bucket"] -179 source_object = self.configs["source_object"] -180 -181 restore_status = ArchiveFileManager.check_restore_status( -182 source_bucket, source_object -183 ) +@@ -865,14 +987,14 @@135class FileManager(object): +136 """Set of actions to manipulate files in several ways.""" 
+137 +138 _logger = LoggingHandler(__name__).get_logger() +139 +140 def __init__(self, configs: dict): +141 """Construct FileManager algorithm instances. +142 +143 Args: +144 configs: configurations for the FileManager algorithm. +145 """ +146 self.configs = configs +147 self.function = self.configs["function"] +148 +149 def get_function(self) -> None: +150 """Get a specific function to execute.""" +151 available_functions = { +152 "delete_objects": self.delete_objects, +153 "copy_objects": self.copy_objects, +154 "request_restore": self.request_restore, +155 "check_restore_status": self.check_restore_status, +156 "request_restore_to_destination_and_wait": ( +157 self.request_restore_to_destination_and_wait +158 ), +159 } +160 +161 self._logger.info("Function being executed: {}".format(self.function)) +162 if self.function in available_functions.keys(): +163 func = available_functions[self.function] +164 func() +165 else: +166 raise NotImplementedError( +167 f"The requested function {self.function} is not implemented." +168 ) +169 +170 def _delete_objects(self, bucket: str, objects_paths: list) -> None: +171 """Delete objects recursively in s3. +172 +173 Params: +174 bucket: name of bucket to perform the delete operation. +175 objects_paths: objects to be deleted. +176 """ +177 s3 = boto3.client("s3") +178 +179 for path in objects_paths: +180 if _check_directory(bucket, path): +181 path = _process_directory_path(path) +182 else: +183 path = path.strip() 184 -185 self._logger.info( -186 f""" -187 Restore status: -188 - Not Started: {restore_status.get('not_started_objects')} -189 - Ongoing: {restore_status.get('ongoing_objects')} -190 - Restored: {restore_status.get('restored_objects')} -191 Total objects in this restore process: {restore_status.get('total_objects')} -192 """ -193 ) -194 -195 def request_restore_to_destination_and_wait(self) -> None: -196 """Request and wait for the restore to complete, polling the restore status. -197 -198 After the restore is done, copy the restored files to destination -199 """ -200 source_bucket = self.configs["bucket"] -201 source_object = self.configs["source_object"] -202 destination_bucket = self.configs["destination_bucket"] -203 destination_object = self.configs["destination_object"] -204 restore_expiration = self.configs["restore_expiration"] -205 retrieval_tier = self.configs["retrieval_tier"] -206 dry_run = self.configs["dry_run"] -207 -208 ArchiveFileManager.request_restore_and_wait( -209 source_bucket=source_bucket, -210 source_object=source_object, -211 restore_expiration=restore_expiration, -212 retrieval_tier=retrieval_tier, -213 dry_run=dry_run, -214 ) +185 more_objects = True +186 paginator = "" +187 objects_to_delete = [] +188 +189 while more_objects: +190 objects_found, paginator = _list_objects( +191 s3_client=s3, bucket=bucket, path=path, paginator=paginator +192 ) +193 for obj in objects_found: +194 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) +195 +196 if not paginator: +197 more_objects = False +198 +199 response = s3.delete_objects( +200 Bucket=bucket, +201 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, +202 ) +203 self._logger.info(response) +204 objects_to_delete = [] +205 +206 def delete_objects(self) -> None: +207 """Delete objects and 'directories' in s3. +208 +209 If dry_run is set to True the function will print a dict with all the +210 paths that would be deleted based on the given keys. 
+211 """ +212 bucket = self.configs["bucket"] +213 objects_paths = self.configs["object_paths"] +214 dry_run = self.configs["dry_run"] 215 -216 FileManager._logger.info( -217 f"Restoration complete for {source_bucket} and {source_object}" -218 ) -219 FileManager._logger.info( -220 f"Starting to copy data from {source_bucket}/{source_object} to " -221 f"{destination_bucket}/{destination_object}" -222 ) -223 FileManager._copy_objects( -224 source_bucket=source_bucket, -225 source_object=source_object, -226 destination_bucket=destination_bucket, -227 destination_object=destination_object, -228 dry_run=dry_run, -229 ) -230 FileManager._logger.info( -231 f"Finished copying data, data should be available on {destination_bucket}/" -232 f"{destination_object}" -233 ) -234 -235 @staticmethod -236 def _copy_objects( -237 source_bucket: str, -238 source_object: str, -239 destination_bucket: str, -240 destination_object: str, -241 dry_run: bool, -242 ) -> None: -243 """Copies objects and 'directories' in s3. -244 -245 Args: -246 source_bucket: name of bucket to perform the copy. -247 source_object: object/folder to be copied. -248 destination_bucket: name of the target bucket to copy. -249 destination_object: target object/folder to copy. -250 dry_run: if dry_run is set to True the function will print a dict with -251 all the paths that would be deleted based on the given keys. -252 """ -253 s3 = boto3.client("s3") -254 -255 if dry_run: -256 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -257 -258 FileManager._logger.info("Paths that would be copied:") -259 FileManager._logger.info(response) -260 else: -261 copy_object = _list_objects_recursively( -262 bucket=source_bucket, path=source_object -263 ) +216 if dry_run: +217 response = _dry_run(bucket=bucket, object_paths=objects_paths) +218 +219 self._logger.info("Paths that would be deleted:") +220 self._logger.info(response) +221 else: +222 self._delete_objects(bucket, objects_paths) +223 +224 def copy_objects(self) -> None: +225 """Copies objects and 'directories' in s3.""" +226 source_bucket = self.configs["bucket"] +227 source_object = self.configs["source_object"] +228 destination_bucket = self.configs["destination_bucket"] +229 destination_object = self.configs["destination_object"] +230 dry_run = self.configs["dry_run"] +231 +232 FileManager._copy_objects( +233 source_bucket=source_bucket, +234 source_object=source_object, +235 destination_bucket=destination_bucket, +236 destination_object=destination_object, +237 dry_run=dry_run, +238 ) +239 +240 def request_restore(self) -> None: +241 """Request the restore of archived data.""" +242 source_bucket = self.configs["bucket"] +243 source_object = self.configs["source_object"] +244 restore_expiration = self.configs["restore_expiration"] +245 retrieval_tier = self.configs["retrieval_tier"] +246 dry_run = self.configs["dry_run"] +247 +248 ArchiveFileManager.request_restore( +249 source_bucket, +250 source_object, +251 restore_expiration, +252 retrieval_tier, +253 dry_run, +254 ) +255 +256 def check_restore_status(self) -> None: +257 """Check the restore status of archived data.""" +258 source_bucket = self.configs["bucket"] +259 source_object = self.configs["source_object"] +260 +261 restore_status = ArchiveFileManager.check_restore_status( +262 source_bucket, source_object +263 ) 264 -265 if len(copy_object) == 1: -266 FileManager._logger.info(f"Copying obj: {source_object}") -267 -268 response = s3.copy_object( -269 Bucket=destination_bucket, -270 CopySource={ -271 
FileManagerAPIKeys.BUCKET.value: source_bucket, -272 FileManagerAPIKeys.KEY.value: source_object, -273 }, -274 Key=f"""{destination_object}/{copy_object[0].split("/")[-1]}""", -275 ) -276 FileManager._logger.info(response) -277 else: -278 for obj in copy_object: -279 FileManager._logger.info(f"Copying obj: {obj}") -280 -281 final_path = obj.replace(source_object, "") -282 -283 response = s3.copy_object( -284 Bucket=destination_bucket, -285 CopySource={ -286 FileManagerAPIKeys.BUCKET.value: source_bucket, -287 FileManagerAPIKeys.KEY.value: obj, -288 }, -289 Key=f"{destination_object}{final_path}", -290 ) -291 FileManager._logger.info(response) +265 self._logger.info( +266 f""" +267 Restore status: +268 - Not Started: {restore_status.get('not_started_objects')} +269 - Ongoing: {restore_status.get('ongoing_objects')} +270 - Restored: {restore_status.get('restored_objects')} +271 Total objects in this restore process: {restore_status.get('total_objects')} +272 """ +273 ) +274 +275 def request_restore_to_destination_and_wait(self) -> None: +276 """Request and wait for the restore to complete, polling the restore status. +277 +278 After the restore is done, copy the restored files to destination +279 """ +280 source_bucket = self.configs["bucket"] +281 source_object = self.configs["source_object"] +282 destination_bucket = self.configs["destination_bucket"] +283 destination_object = self.configs["destination_object"] +284 restore_expiration = self.configs["restore_expiration"] +285 retrieval_tier = self.configs["retrieval_tier"] +286 dry_run = self.configs["dry_run"] +287 +288 ArchiveFileManager.request_restore_and_wait( +289 source_bucket=source_bucket, +290 source_object=source_object, +291 restore_expiration=restore_expiration, +292 retrieval_tier=retrieval_tier, +293 dry_run=dry_run, +294 ) +295 +296 FileManager._logger.info( +297 f"Restoration complete for {source_bucket} and {source_object}" +298 ) +299 FileManager._logger.info( +300 f"Starting to copy data from {source_bucket}/{source_object} to " +301 f"{destination_bucket}/{destination_object}" +302 ) +303 FileManager._copy_objects( +304 source_bucket=source_bucket, +305 source_object=source_object, +306 destination_bucket=destination_bucket, +307 destination_object=destination_object, +308 dry_run=dry_run, +309 ) +310 FileManager._logger.info( +311 f"Finished copying data, data should be available on {destination_bucket}/" +312 f"{destination_object}" +313 ) +314 +315 @staticmethod +316 def _copy_objects( +317 source_bucket: str, +318 source_object: str, +319 destination_bucket: str, +320 destination_object: str, +321 dry_run: bool, +322 ) -> None: +323 """Copies objects and 'directories' in s3. +324 +325 Args: +326 source_bucket: name of bucket to perform the copy. +327 source_object: object/folder to be copied. +328 destination_bucket: name of the target bucket to copy. +329 destination_object: target object/folder to copy. +330 dry_run: if dry_run is set to True the function will print a dict with +331 all the paths that would be deleted based on the given keys. 
+332 """ +333 s3 = boto3.client("s3") +334 +335 if dry_run: +336 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +337 +338 FileManager._logger.info("Paths that would be copied:") +339 FileManager._logger.info(response) +340 else: +341 original_object_name = source_object.split("/")[-1] +342 +343 if _check_directory(source_bucket, source_object): +344 source_object = _process_directory_path(source_object) +345 +346 copy_object = _list_objects_recursively( +347 bucket=source_bucket, path=source_object +348 ) +349 +350 for obj in copy_object: +351 FileManager._logger.info(f"Copying obj: {obj}") +352 +353 final_path = obj.replace(source_object, "") +354 +355 response = s3.copy_object( +356 Bucket=destination_bucket, +357 CopySource={ +358 FileManagerAPIKeys.BUCKET.value: source_bucket, +359 FileManagerAPIKeys.KEY.value: obj, +360 }, +361 Key=f"{destination_object}/{original_object_name}/{final_path}", +362 ) +363 FileManager._logger.info(response) +364 else: +365 FileManager._logger.info(f"Copying obj: {source_object}") +366 +367 response = s3.copy_object( +368 Bucket=destination_bucket, +369 CopySource={ +370 FileManagerAPIKeys.BUCKET.value: source_bucket, +371 FileManagerAPIKeys.KEY.value: source_object, +372 }, +373 Key=f"""{destination_object}/{original_object_name}""", +374 ) +375 FileManager._logger.info(response)
85 def __init__(self, configs: dict): -86 """Construct FileManager algorithm instances. -87 -88 Args: -89 configs: configurations for the FileManager algorithm. -90 """ -91 self.configs = configs -92 self.function = self.configs["function"] +@@ -895,26 +1017,26 @@140 def __init__(self, configs: dict): +141 """Construct FileManager algorithm instances. +142 +143 Args: +144 configs: configurations for the FileManager algorithm. +145 """ +146 self.configs = configs +147 self.function = self.configs["function"]
94 def get_function(self) -> None: - 95 """Get a specific function to execute.""" - 96 available_functions = { - 97 "delete_objects": self.delete_objects, - 98 "copy_objects": self.copy_objects, - 99 "request_restore": self.request_restore, -100 "check_restore_status": self.check_restore_status, -101 "request_restore_to_destination_and_wait": ( -102 self.request_restore_to_destination_and_wait -103 ), -104 } -105 -106 self._logger.info("Function being executed: {}".format(self.function)) -107 if self.function in available_functions.keys(): -108 func = available_functions[self.function] -109 func() -110 else: -111 raise NotImplementedError( -112 f"The requested function {self.function} is not implemented." -113 ) +@@ -934,34 +1056,23 @@149 def get_function(self) -> None: +150 """Get a specific function to execute.""" +151 available_functions = { +152 "delete_objects": self.delete_objects, +153 "copy_objects": self.copy_objects, +154 "request_restore": self.request_restore, +155 "check_restore_status": self.check_restore_status, +156 "request_restore_to_destination_and_wait": ( +157 self.request_restore_to_destination_and_wait +158 ), +159 } +160 +161 self._logger.info("Function being executed: {}".format(self.function)) +162 if self.function in available_functions.keys(): +163 func = available_functions[self.function] +164 func() +165 else: +166 raise NotImplementedError( +167 f"The requested function {self.function} is not implemented." +168 )
115 def delete_objects(self) -> None: -116 """Delete objects and 'directories' in s3. -117 -118 If dry_run is set to True the function will print a dict with all the -119 paths that would be deleted based on the given keys. -120 """ -121 bucket = self.configs["bucket"] -122 objects_paths = self.configs["object_paths"] -123 dry_run = self.configs["dry_run"] -124 -125 s3 = boto3.client("s3") -126 -127 if dry_run: -128 response = _dry_run(bucket=bucket, object_paths=objects_paths) -129 -130 self._logger.info("Paths that would be deleted:") -131 else: -132 objects_to_delete = [] -133 for path in objects_paths: -134 for obj in _list_objects_recursively(bucket=bucket, path=path): -135 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) -136 -137 response = s3.delete_objects( -138 Bucket=bucket, -139 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, -140 ) -141 -142 self._logger.info(response) +@@ -984,21 +1095,21 @@206 def delete_objects(self) -> None: +207 """Delete objects and 'directories' in s3. +208 +209 If dry_run is set to True the function will print a dict with all the +210 paths that would be deleted based on the given keys. +211 """ +212 bucket = self.configs["bucket"] +213 objects_paths = self.configs["object_paths"] +214 dry_run = self.configs["dry_run"] +215 +216 if dry_run: +217 response = _dry_run(bucket=bucket, object_paths=objects_paths) +218 +219 self._logger.info("Paths that would be deleted:") +220 self._logger.info(response) +221 else: +222 self._delete_objects(bucket, objects_paths)
144 def copy_objects(self) -> None: -145 """Copies objects and 'directories' in s3.""" -146 source_bucket = self.configs["bucket"] -147 source_object = self.configs["source_object"] -148 destination_bucket = self.configs["destination_bucket"] -149 destination_object = self.configs["destination_object"] -150 dry_run = self.configs["dry_run"] -151 -152 FileManager._copy_objects( -153 source_bucket=source_bucket, -154 source_object=source_object, -155 destination_bucket=destination_bucket, -156 destination_object=destination_object, -157 dry_run=dry_run, -158 ) +@@ -1018,21 +1129,21 @@224 def copy_objects(self) -> None: +225 """Copies objects and 'directories' in s3.""" +226 source_bucket = self.configs["bucket"] +227 source_object = self.configs["source_object"] +228 destination_bucket = self.configs["destination_bucket"] +229 destination_object = self.configs["destination_object"] +230 dry_run = self.configs["dry_run"] +231 +232 FileManager._copy_objects( +233 source_bucket=source_bucket, +234 source_object=source_object, +235 destination_bucket=destination_bucket, +236 destination_object=destination_object, +237 dry_run=dry_run, +238 )
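A copy driven through the same entry point is analogous; the buckets and prefixes below are placeholders:

from lakehouse_engine.core.file_manager import FileManager  # assumed module path

copy_configs = {
    "function": "copy_objects",
    "bucket": "source-bucket",                   # placeholder source bucket
    "source_object": "raw/sales/2023/",          # object or 'directory' to copy
    "destination_bucket": "target-bucket",       # placeholder destination bucket
    "destination_object": "archive/sales/2023",  # target prefix
    "dry_run": False,
}

FileManager(copy_configs).get_function()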
160 def request_restore(self) -> None: -161 """Request the restore of archived data.""" -162 source_bucket = self.configs["bucket"] -163 source_object = self.configs["source_object"] -164 restore_expiration = self.configs["restore_expiration"] -165 retrieval_tier = self.configs["retrieval_tier"] -166 dry_run = self.configs["dry_run"] -167 -168 ArchiveFileManager.request_restore( -169 source_bucket, -170 source_object, -171 restore_expiration, -172 retrieval_tier, -173 dry_run, -174 ) +@@ -1052,24 +1163,24 @@240 def request_restore(self) -> None: +241 """Request the restore of archived data.""" +242 source_bucket = self.configs["bucket"] +243 source_object = self.configs["source_object"] +244 restore_expiration = self.configs["restore_expiration"] +245 retrieval_tier = self.configs["retrieval_tier"] +246 dry_run = self.configs["dry_run"] +247 +248 ArchiveFileManager.request_restore( +249 source_bucket, +250 source_object, +251 restore_expiration, +252 retrieval_tier, +253 dry_run, +254 )
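Requesting a restore of archived objects follows the same pattern; restore_expiration is expressed in days and retrieval_tier must be one of the values accepted by RestoreType (Bulk, Standard or Expedited). All concrete values below are placeholders:

from lakehouse_engine.core.file_manager import FileManager  # assumed module path

restore_configs = {
    "function": "request_restore",
    "bucket": "archive-bucket",        # placeholder bucket holding archived objects
    "source_object": "history/2019/",  # prefix or single object to restore
    "restore_expiration": 7,           # days the restored copy stays available
    "retrieval_tier": "Bulk",          # Bulk, Standard or Expedited
    "dry_run": True,
}

FileManager(restore_configs).get_function()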
176 def check_restore_status(self) -> None: -177 """Check the restore status of archived data.""" -178 source_bucket = self.configs["bucket"] -179 source_object = self.configs["source_object"] -180 -181 restore_status = ArchiveFileManager.check_restore_status( -182 source_bucket, source_object -183 ) -184 -185 self._logger.info( -186 f""" -187 Restore status: -188 - Not Started: {restore_status.get('not_started_objects')} -189 - Ongoing: {restore_status.get('ongoing_objects')} -190 - Restored: {restore_status.get('restored_objects')} -191 Total objects in this restore process: {restore_status.get('total_objects')} -192 """ -193 ) +@@ -1089,45 +1200,45 @@256 def check_restore_status(self) -> None: +257 """Check the restore status of archived data.""" +258 source_bucket = self.configs["bucket"] +259 source_object = self.configs["source_object"] +260 +261 restore_status = ArchiveFileManager.check_restore_status( +262 source_bucket, source_object +263 ) +264 +265 self._logger.info( +266 f""" +267 Restore status: +268 - Not Started: {restore_status.get('not_started_objects')} +269 - Ongoing: {restore_status.get('ongoing_objects')} +270 - Restored: {restore_status.get('restored_objects')} +271 Total objects in this restore process: {restore_status.get('total_objects')} +272 """ +273 )
195 def request_restore_to_destination_and_wait(self) -> None: -196 """Request and wait for the restore to complete, polling the restore status. -197 -198 After the restore is done, copy the restored files to destination -199 """ -200 source_bucket = self.configs["bucket"] -201 source_object = self.configs["source_object"] -202 destination_bucket = self.configs["destination_bucket"] -203 destination_object = self.configs["destination_object"] -204 restore_expiration = self.configs["restore_expiration"] -205 retrieval_tier = self.configs["retrieval_tier"] -206 dry_run = self.configs["dry_run"] -207 -208 ArchiveFileManager.request_restore_and_wait( -209 source_bucket=source_bucket, -210 source_object=source_object, -211 restore_expiration=restore_expiration, -212 retrieval_tier=retrieval_tier, -213 dry_run=dry_run, -214 ) -215 -216 FileManager._logger.info( -217 f"Restoration complete for {source_bucket} and {source_object}" -218 ) -219 FileManager._logger.info( -220 f"Starting to copy data from {source_bucket}/{source_object} to " -221 f"{destination_bucket}/{destination_object}" -222 ) -223 FileManager._copy_objects( -224 source_bucket=source_bucket, -225 source_object=source_object, -226 destination_bucket=destination_bucket, -227 destination_object=destination_object, -228 dry_run=dry_run, -229 ) -230 FileManager._logger.info( -231 f"Finished copying data, data should be available on {destination_bucket}/" -232 f"{destination_object}" -233 ) +@@ -1150,240 +1261,249 @@275 def request_restore_to_destination_and_wait(self) -> None: +276 """Request and wait for the restore to complete, polling the restore status. +277 +278 After the restore is done, copy the restored files to destination +279 """ +280 source_bucket = self.configs["bucket"] +281 source_object = self.configs["source_object"] +282 destination_bucket = self.configs["destination_bucket"] +283 destination_object = self.configs["destination_object"] +284 restore_expiration = self.configs["restore_expiration"] +285 retrieval_tier = self.configs["retrieval_tier"] +286 dry_run = self.configs["dry_run"] +287 +288 ArchiveFileManager.request_restore_and_wait( +289 source_bucket=source_bucket, +290 source_object=source_object, +291 restore_expiration=restore_expiration, +292 retrieval_tier=retrieval_tier, +293 dry_run=dry_run, +294 ) +295 +296 FileManager._logger.info( +297 f"Restoration complete for {source_bucket} and {source_object}" +298 ) +299 FileManager._logger.info( +300 f"Starting to copy data from {source_bucket}/{source_object} to " +301 f"{destination_bucket}/{destination_object}" +302 ) +303 FileManager._copy_objects( +304 source_bucket=source_bucket, +305 source_object=source_object, +306 destination_bucket=destination_bucket, +307 destination_object=destination_object, +308 dry_run=dry_run, +309 ) +310 FileManager._logger.info( +311 f"Finished copying data, data should be available on {destination_bucket}/" +312 f"{destination_object}" +313 )
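The waiting variant combines restore, status polling and the final copy in one call. As the ArchiveFileManager code further down shows, it only accepts the Expedited retrieval tier and raises a ValueError otherwise. A hedged sketch of its configs, with placeholder values:

from lakehouse_engine.core.file_manager import FileManager  # assumed module path

restore_and_copy_configs = {
    "function": "request_restore_to_destination_and_wait",
    "bucket": "archive-bucket",              # placeholder source bucket
    "source_object": "history/2019/",
    "destination_bucket": "working-bucket",  # placeholder destination bucket
    "destination_object": "restored/2019",
    "restore_expiration": 7,
    "retrieval_tier": "Expedited",           # the only tier this flow accepts
    "dry_run": False,
}

FileManager(restore_and_copy_configs).get_function()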
294class ArchiveFileManager(object): -295 """Set of actions to restore archives.""" -296 -297 _logger = LoggingHandler(__name__).get_logger() -298 -299 @staticmethod -300 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: -301 """Get the archived object if it's an object. -302 -303 Args: -304 bucket: name of bucket to check get the object. -305 object_key: object to get. -306 -307 Returns: -308 S3 Object if it's an archived object, otherwise None. -309 """ -310 s3 = boto3.resource("s3") -311 object_to_restore = s3.Object(bucket, object_key) -312 -313 if ( -314 object_to_restore.storage_class is not None -315 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS -316 ): -317 return object_to_restore -318 else: -319 return None -320 -321 @staticmethod -322 def _check_object_restore_status( -323 bucket: str, object_key: str -324 ) -> Optional[RestoreStatus]: -325 """Check the restore status of the archive. -326 -327 Args: -328 bucket: name of bucket to check the restore status. -329 object_key: object to check the restore status. -330 -331 Returns: -332 The restore status represented by an enum, possible values are: -333 NOT_STARTED, ONGOING or RESTORED -334 """ -335 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -336 -337 if archived_object is None: -338 status = None -339 elif archived_object.restore is None: -340 status = RestoreStatus.NOT_STARTED -341 elif 'ongoing-request="true"' in archived_object.restore: -342 status = RestoreStatus.ONGOING -343 else: -344 status = RestoreStatus.RESTORED -345 -346 return status -347 -348 @staticmethod -349 def check_restore_status(source_bucket: str, source_object: str) -> dict: -350 """Check the restore status of archived data. -351 -352 Args: -353 source_bucket: name of bucket to check the restore status. -354 source_object: object to check the restore status. -355 -356 Returns: -357 A dict containing the amount of objects in each status. -358 """ -359 not_started_objects = 0 -360 ongoing_objects = 0 -361 restored_objects = 0 -362 total_objects = 0 -363 -364 objects_to_restore = _list_objects_recursively( -365 bucket=source_bucket, path=source_object -366 ) -367 -368 for obj in objects_to_restore: -369 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") -370 -371 restore_status = ArchiveFileManager._check_object_restore_status( -372 source_bucket, obj -373 ) -374 if not restore_status: -375 ArchiveFileManager._logger.warning( -376 f"Restore status not found for {source_bucket}/{obj}" -377 ) -378 else: -379 total_objects += 1 +@@ -1403,56 +1523,59 @@378class ArchiveFileManager(object): +379 """Set of actions to restore archives.""" 380 -381 if RestoreStatus.NOT_STARTED == restore_status: -382 not_started_objects += 1 -383 elif RestoreStatus.ONGOING == restore_status: -384 ongoing_objects += 1 -385 else: -386 restored_objects += 1 -387 -388 ArchiveFileManager._logger.info( -389 f"{obj} restore status is {restore_status.value}" -390 ) -391 -392 return { -393 "total_objects": total_objects, -394 "not_started_objects": not_started_objects, -395 "ongoing_objects": ongoing_objects, -396 "restored_objects": restored_objects, -397 } -398 -399 @staticmethod -400 def _request_restore_object( -401 bucket: str, object_key: str, expiration: int, retrieval_tier: str -402 ) -> None: -403 """Request a restore of the archive. 
+381 _logger = LoggingHandler(__name__).get_logger() +382 +383 @staticmethod +384 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: +385 """Get the archived object if it's an object. +386 +387 Args: +388 bucket: name of bucket to check get the object. +389 object_key: object to get. +390 +391 Returns: +392 S3 Object if it's an archived object, otherwise None. +393 """ +394 s3 = boto3.resource("s3") +395 object_to_restore = s3.Object(bucket, object_key) +396 +397 if ( +398 object_to_restore.storage_class is not None +399 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS +400 ): +401 return object_to_restore +402 else: +403 return None 404 -405 Args: -406 bucket: name of bucket to perform the restore. -407 object_key: object to be restored. -408 expiration: restore expiration. -409 retrieval_tier: type of restore, possible values are: -410 Bulk, Standard or Expedited. -411 """ -412 if not RestoreType.exists(retrieval_tier): -413 raise RestoreTypeNotFoundException( -414 f"Restore type {retrieval_tier} not supported." -415 ) -416 -417 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -418 -419 if archived_object and archived_object.restore is None: -420 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") -421 archived_object.restore_object( -422 RestoreRequest={ -423 "Days": expiration, -424 "GlacierJobParameters": {"Tier": retrieval_tier}, -425 } -426 ) +405 @staticmethod +406 def _check_object_restore_status( +407 bucket: str, object_key: str +408 ) -> Optional[RestoreStatus]: +409 """Check the restore status of the archive. +410 +411 Args: +412 bucket: name of bucket to check the restore status. +413 object_key: object to check the restore status. +414 +415 Returns: +416 The restore status represented by an enum, possible values are: +417 NOT_STARTED, ONGOING or RESTORED +418 """ +419 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +420 +421 if archived_object is None: +422 status = None +423 elif archived_object.restore is None: +424 status = RestoreStatus.NOT_STARTED +425 elif 'ongoing-request="true"' in archived_object.restore: +426 status = RestoreStatus.ONGOING 427 else: -428 ArchiveFileManager._logger.info( -429 f"Restore request for {bucket}/{object_key} not performed." -430 ) +428 status = RestoreStatus.RESTORED +429 +430 return status 431 432 @staticmethod -433 def request_restore( -434 source_bucket: str, -435 source_object: str, -436 restore_expiration: int, -437 retrieval_tier: str, -438 dry_run: bool, -439 ) -> None: -440 """Request the restore of archived data. -441 -442 Args: -443 source_bucket: name of bucket to perform the restore. -444 source_object: object to be restored. -445 restore_expiration: restore expiration in days. -446 retrieval_tier: type of restore, possible values are: -447 Bulk, Standard or Expedited. -448 dry_run: if dry_run is set to True the function will print a dict with -449 all the paths that would be deleted based on the given keys. 
-450 """ -451 if dry_run: -452 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -453 -454 ArchiveFileManager._logger.info("Paths that would be restored:") -455 ArchiveFileManager._logger.info(response) -456 else: -457 objects_to_restore = _list_objects_recursively( -458 bucket=source_bucket, path=source_object -459 ) -460 -461 for obj in objects_to_restore: -462 ArchiveFileManager._request_restore_object( -463 source_bucket, -464 obj, -465 restore_expiration, -466 retrieval_tier, -467 ) -468 -469 @staticmethod -470 def request_restore_and_wait( -471 source_bucket: str, -472 source_object: str, -473 restore_expiration: int, -474 retrieval_tier: str, -475 dry_run: bool, -476 ) -> None: -477 """Request and wait for the restore to complete, polling the restore status. +433 def check_restore_status(source_bucket: str, source_object: str) -> dict: +434 """Check the restore status of archived data. +435 +436 Args: +437 source_bucket: name of bucket to check the restore status. +438 source_object: object to check the restore status. +439 +440 Returns: +441 A dict containing the amount of objects in each status. +442 """ +443 not_started_objects = 0 +444 ongoing_objects = 0 +445 restored_objects = 0 +446 total_objects = 0 +447 +448 if _check_directory(source_bucket, source_object): +449 source_object = _process_directory_path(source_object) +450 +451 objects_to_restore = _list_objects_recursively( +452 bucket=source_bucket, path=source_object +453 ) +454 +455 for obj in objects_to_restore: +456 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") +457 +458 restore_status = ArchiveFileManager._check_object_restore_status( +459 source_bucket, obj +460 ) +461 if not restore_status: +462 ArchiveFileManager._logger.warning( +463 f"Restore status not found for {source_bucket}/{obj}" +464 ) +465 else: +466 total_objects += 1 +467 +468 if RestoreStatus.NOT_STARTED == restore_status: +469 not_started_objects += 1 +470 elif RestoreStatus.ONGOING == restore_status: +471 ongoing_objects += 1 +472 else: +473 restored_objects += 1 +474 +475 ArchiveFileManager._logger.info( +476 f"{obj} restore status is {restore_status.value}" +477 ) 478 -479 Args: -480 source_bucket: name of bucket to perform the restore. -481 source_object: object to be restored. -482 restore_expiration: restore expiration in days. -483 retrieval_tier: type of restore, possible values are: -484 Bulk, Standard or Expedited. -485 dry_run: if dry_run is set to True the function will print a dict with -486 all the paths that would be deleted based on the given keys. -487 """ -488 if retrieval_tier != RestoreType.EXPEDITED.value: -489 ArchiveFileManager._logger.error( -490 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -491 "kind of restore should be used just with `Expedited` retrieval tier " -492 "to save cluster costs." -493 ) -494 raise ValueError( -495 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -496 "kind of restore should be used just with `Expedited` retrieval tier " -497 "to save cluster costs." 
-498 ) -499 -500 ArchiveFileManager.request_restore( -501 source_bucket=source_bucket, -502 source_object=source_object, -503 restore_expiration=restore_expiration, -504 retrieval_tier=retrieval_tier, -505 dry_run=dry_run, -506 ) -507 restore_status = ArchiveFileManager.check_restore_status( -508 source_bucket, source_object -509 ) -510 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") -511 -512 if not dry_run: -513 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") -514 wait_time = 300 -515 while restore_status.get("total_objects") > restore_status.get( -516 "restored_objects" -517 ): -518 ArchiveFileManager._logger.info( -519 "Not all objects have been restored yet, checking the status again " -520 f"in {wait_time} seconds." -521 ) -522 time.sleep(wait_time) -523 wait_time = 30 -524 restore_status = ArchiveFileManager.check_restore_status( -525 source_bucket, source_object -526 ) -527 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +479 return { +480 "total_objects": total_objects, +481 "not_started_objects": not_started_objects, +482 "ongoing_objects": ongoing_objects, +483 "restored_objects": restored_objects, +484 } +485 +486 @staticmethod +487 def _request_restore_object( +488 bucket: str, object_key: str, expiration: int, retrieval_tier: str +489 ) -> None: +490 """Request a restore of the archive. +491 +492 Args: +493 bucket: name of bucket to perform the restore. +494 object_key: object to be restored. +495 expiration: restore expiration. +496 retrieval_tier: type of restore, possible values are: +497 Bulk, Standard or Expedited. +498 """ +499 if not RestoreType.exists(retrieval_tier): +500 raise RestoreTypeNotFoundException( +501 f"Restore type {retrieval_tier} not supported." +502 ) +503 +504 if _check_directory(bucket, object_key): +505 object_key = _process_directory_path(object_key) +506 +507 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +508 +509 if archived_object and archived_object.restore is None: +510 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") +511 archived_object.restore_object( +512 RestoreRequest={ +513 "Days": expiration, +514 "GlacierJobParameters": {"Tier": retrieval_tier}, +515 } +516 ) +517 else: +518 ArchiveFileManager._logger.info( +519 f"Restore request for {bucket}/{object_key} not performed." +520 ) +521 +522 @staticmethod +523 def request_restore( +524 source_bucket: str, +525 source_object: str, +526 restore_expiration: int, +527 retrieval_tier: str, +528 dry_run: bool, +529 ) -> None: +530 """Request the restore of archived data. +531 +532 Args: +533 source_bucket: name of bucket to perform the restore. +534 source_object: object to be restored. +535 restore_expiration: restore expiration in days. +536 retrieval_tier: type of restore, possible values are: +537 Bulk, Standard or Expedited. +538 dry_run: if dry_run is set to True the function will print a dict with +539 all the paths that would be deleted based on the given keys. 
+540 """ +541 if _check_directory(source_bucket, source_object): +542 source_object = _process_directory_path(source_object) +543 +544 if dry_run: +545 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +546 +547 ArchiveFileManager._logger.info("Paths that would be restored:") +548 ArchiveFileManager._logger.info(response) +549 else: +550 objects_to_restore = _list_objects_recursively( +551 bucket=source_bucket, path=source_object +552 ) +553 +554 for obj in objects_to_restore: +555 ArchiveFileManager._request_restore_object( +556 source_bucket, +557 obj, +558 restore_expiration, +559 retrieval_tier, +560 ) +561 +562 @staticmethod +563 def request_restore_and_wait( +564 source_bucket: str, +565 source_object: str, +566 restore_expiration: int, +567 retrieval_tier: str, +568 dry_run: bool, +569 ) -> None: +570 """Request and wait for the restore to complete, polling the restore status. +571 +572 Args: +573 source_bucket: name of bucket to perform the restore. +574 source_object: object to be restored. +575 restore_expiration: restore expiration in days. +576 retrieval_tier: type of restore, possible values are: +577 Bulk, Standard or Expedited. +578 dry_run: if dry_run is set to True the function will print a dict with +579 all the paths that would be deleted based on the given keys. +580 """ +581 if retrieval_tier != RestoreType.EXPEDITED.value: +582 ArchiveFileManager._logger.error( +583 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +584 "kind of restore should be used just with `Expedited` retrieval tier " +585 "to save cluster costs." +586 ) +587 raise ValueError( +588 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +589 "kind of restore should be used just with `Expedited` retrieval tier " +590 "to save cluster costs." +591 ) +592 +593 ArchiveFileManager.request_restore( +594 source_bucket=source_bucket, +595 source_object=source_object, +596 restore_expiration=restore_expiration, +597 retrieval_tier=retrieval_tier, +598 dry_run=dry_run, +599 ) +600 restore_status = ArchiveFileManager.check_restore_status( +601 source_bucket, source_object +602 ) +603 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +604 +605 if not dry_run: +606 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") +607 wait_time = 300 +608 while restore_status.get("total_objects") > restore_status.get( +609 "restored_objects" +610 ): +611 ArchiveFileManager._logger.info( +612 "Not all objects have been restored yet, checking the status again " +613 f"in {wait_time} seconds." +614 ) +615 time.sleep(wait_time) +616 wait_time = 30 +617 restore_status = ArchiveFileManager.check_restore_status( +618 source_bucket, source_object +619 ) +620 ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
348 @staticmethod -349 def check_restore_status(source_bucket: str, source_object: str) -> dict: -350 """Check the restore status of archived data. -351 -352 Args: -353 source_bucket: name of bucket to check the restore status. -354 source_object: object to check the restore status. -355 -356 Returns: -357 A dict containing the amount of objects in each status. -358 """ -359 not_started_objects = 0 -360 ongoing_objects = 0 -361 restored_objects = 0 -362 total_objects = 0 -363 -364 objects_to_restore = _list_objects_recursively( -365 bucket=source_bucket, path=source_object -366 ) -367 -368 for obj in objects_to_restore: -369 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") -370 -371 restore_status = ArchiveFileManager._check_object_restore_status( -372 source_bucket, obj -373 ) -374 if not restore_status: -375 ArchiveFileManager._logger.warning( -376 f"Restore status not found for {source_bucket}/{obj}" -377 ) -378 else: -379 total_objects += 1 -380 -381 if RestoreStatus.NOT_STARTED == restore_status: -382 not_started_objects += 1 -383 elif RestoreStatus.ONGOING == restore_status: -384 ongoing_objects += 1 -385 else: -386 restored_objects += 1 -387 -388 ArchiveFileManager._logger.info( -389 f"{obj} restore status is {restore_status.value}" -390 ) -391 -392 return { -393 "total_objects": total_objects, -394 "not_started_objects": not_started_objects, -395 "ongoing_objects": ongoing_objects, -396 "restored_objects": restored_objects, -397 } +@@ -1480,42 +1603,45 @@432 @staticmethod +433 def check_restore_status(source_bucket: str, source_object: str) -> dict: +434 """Check the restore status of archived data. +435 +436 Args: +437 source_bucket: name of bucket to check the restore status. +438 source_object: object to check the restore status. +439 +440 Returns: +441 A dict containing the amount of objects in each status. +442 """ +443 not_started_objects = 0 +444 ongoing_objects = 0 +445 restored_objects = 0 +446 total_objects = 0 +447 +448 if _check_directory(source_bucket, source_object): +449 source_object = _process_directory_path(source_object) +450 +451 objects_to_restore = _list_objects_recursively( +452 bucket=source_bucket, path=source_object +453 ) +454 +455 for obj in objects_to_restore: +456 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") +457 +458 restore_status = ArchiveFileManager._check_object_restore_status( +459 source_bucket, obj +460 ) +461 if not restore_status: +462 ArchiveFileManager._logger.warning( +463 f"Restore status not found for {source_bucket}/{obj}" +464 ) +465 else: +466 total_objects += 1 +467 +468 if RestoreStatus.NOT_STARTED == restore_status: +469 not_started_objects += 1 +470 elif RestoreStatus.ONGOING == restore_status: +471 ongoing_objects += 1 +472 else: +473 restored_objects += 1 +474 +475 ArchiveFileManager._logger.info( +476 f"{obj} restore status is {restore_status.value}" +477 ) +478 +479 return { +480 "total_objects": total_objects, +481 "not_started_objects": not_started_objects, +482 "ongoing_objects": ongoing_objects, +483 "restored_objects": restored_objects, +484 }
432 @staticmethod -433 def request_restore( -434 source_bucket: str, -435 source_object: str, -436 restore_expiration: int, -437 retrieval_tier: str, -438 dry_run: bool, -439 ) -> None: -440 """Request the restore of archived data. -441 -442 Args: -443 source_bucket: name of bucket to perform the restore. -444 source_object: object to be restored. -445 restore_expiration: restore expiration in days. -446 retrieval_tier: type of restore, possible values are: -447 Bulk, Standard or Expedited. -448 dry_run: if dry_run is set to True the function will print a dict with -449 all the paths that would be deleted based on the given keys. -450 """ -451 if dry_run: -452 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -453 -454 ArchiveFileManager._logger.info("Paths that would be restored:") -455 ArchiveFileManager._logger.info(response) -456 else: -457 objects_to_restore = _list_objects_recursively( -458 bucket=source_bucket, path=source_object -459 ) -460 -461 for obj in objects_to_restore: -462 ArchiveFileManager._request_restore_object( -463 source_bucket, -464 obj, -465 restore_expiration, -466 retrieval_tier, -467 ) +@@ -1545,65 +1671,65 @@522 @staticmethod +523 def request_restore( +524 source_bucket: str, +525 source_object: str, +526 restore_expiration: int, +527 retrieval_tier: str, +528 dry_run: bool, +529 ) -> None: +530 """Request the restore of archived data. +531 +532 Args: +533 source_bucket: name of bucket to perform the restore. +534 source_object: object to be restored. +535 restore_expiration: restore expiration in days. +536 retrieval_tier: type of restore, possible values are: +537 Bulk, Standard or Expedited. +538 dry_run: if dry_run is set to True the function will print a dict with +539 all the paths that would be deleted based on the given keys. +540 """ +541 if _check_directory(source_bucket, source_object): +542 source_object = _process_directory_path(source_object) +543 +544 if dry_run: +545 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +546 +547 ArchiveFileManager._logger.info("Paths that would be restored:") +548 ArchiveFileManager._logger.info(response) +549 else: +550 objects_to_restore = _list_objects_recursively( +551 bucket=source_bucket, path=source_object +552 ) +553 +554 for obj in objects_to_restore: +555 ArchiveFileManager._request_restore_object( +556 source_bucket, +557 obj, +558 restore_expiration, +559 retrieval_tier, +560 )
469 @staticmethod -470 def request_restore_and_wait( -471 source_bucket: str, -472 source_object: str, -473 restore_expiration: int, -474 retrieval_tier: str, -475 dry_run: bool, -476 ) -> None: -477 """Request and wait for the restore to complete, polling the restore status. -478 -479 Args: -480 source_bucket: name of bucket to perform the restore. -481 source_object: object to be restored. -482 restore_expiration: restore expiration in days. -483 retrieval_tier: type of restore, possible values are: -484 Bulk, Standard or Expedited. -485 dry_run: if dry_run is set to True the function will print a dict with -486 all the paths that would be deleted based on the given keys. -487 """ -488 if retrieval_tier != RestoreType.EXPEDITED.value: -489 ArchiveFileManager._logger.error( -490 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -491 "kind of restore should be used just with `Expedited` retrieval tier " -492 "to save cluster costs." -493 ) -494 raise ValueError( -495 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -496 "kind of restore should be used just with `Expedited` retrieval tier " -497 "to save cluster costs." -498 ) -499 -500 ArchiveFileManager.request_restore( -501 source_bucket=source_bucket, -502 source_object=source_object, -503 restore_expiration=restore_expiration, -504 retrieval_tier=retrieval_tier, -505 dry_run=dry_run, -506 ) -507 restore_status = ArchiveFileManager.check_restore_status( -508 source_bucket, source_object -509 ) -510 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") -511 -512 if not dry_run: -513 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") -514 wait_time = 300 -515 while restore_status.get("total_objects") > restore_status.get( -516 "restored_objects" -517 ): -518 ArchiveFileManager._logger.info( -519 "Not all objects have been restored yet, checking the status again " -520 f"in {wait_time} seconds." -521 ) -522 time.sleep(wait_time) -523 wait_time = 30 -524 restore_status = ArchiveFileManager.check_restore_status( -525 source_bucket, source_object -526 ) -527 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +diff --git a/lakehouse_engine/transformers/data_maskers.html b/lakehouse_engine/transformers/data_maskers.html index f60bde2..26960f6 100644 --- a/lakehouse_engine/transformers/data_maskers.html +++ b/lakehouse_engine/transformers/data_maskers.html @@ -111,32 +111,29 @@562 @staticmethod +563 def request_restore_and_wait( +564 source_bucket: str, +565 source_object: str, +566 restore_expiration: int, +567 retrieval_tier: str, +568 dry_run: bool, +569 ) -> None: +570 """Request and wait for the restore to complete, polling the restore status. +571 +572 Args: +573 source_bucket: name of bucket to perform the restore. +574 source_object: object to be restored. +575 restore_expiration: restore expiration in days. +576 retrieval_tier: type of restore, possible values are: +577 Bulk, Standard or Expedited. +578 dry_run: if dry_run is set to True the function will print a dict with +579 all the paths that would be deleted based on the given keys. +580 """ +581 if retrieval_tier != RestoreType.EXPEDITED.value: +582 ArchiveFileManager._logger.error( +583 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +584 "kind of restore should be used just with `Expedited` retrieval tier " +585 "to save cluster costs." +586 ) +587 raise ValueError( +588 f"Retrieval Tier {retrieval_tier} not allowed on this operation! 
This " +589 "kind of restore should be used just with `Expedited` retrieval tier " +590 "to save cluster costs." +591 ) +592 +593 ArchiveFileManager.request_restore( +594 source_bucket=source_bucket, +595 source_object=source_object, +596 restore_expiration=restore_expiration, +597 retrieval_tier=retrieval_tier, +598 dry_run=dry_run, +599 ) +600 restore_status = ArchiveFileManager.check_restore_status( +601 source_bucket, source_object +602 ) +603 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +604 +605 if not dry_run: +606 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") +607 wait_time = 300 +608 while restore_status.get("total_objects") > restore_status.get( +609 "restored_objects" +610 ): +611 ArchiveFileManager._logger.info( +612 "Not all objects have been restored yet, checking the status again " +613 f"in {wait_time} seconds." +614 ) +615 time.sleep(wait_time) +616 wait_time = 30 +617 restore_status = ArchiveFileManager.check_restore_status( +618 source_bucket, source_object +619 ) +620 ArchiveFileManager._logger.info(f"Restore status: {restore_status}")44 else: 45 raise WrongArgumentsException("Hashing approach is not supported.") 46 -47 if suffix and suffix != "": -48 masked_df = masked_df.drop(col) -49 -50 return masked_df -51 -52 return inner -53 -54 @classmethod -55 def column_dropper(cls, cols: List[str]) -> Callable: -56 """Drop specific columns. +47 return masked_df +48 +49 return inner +50 +51 @classmethod +52 def column_dropper(cls, cols: List[str]) -> Callable: +53 """Drop specific columns. +54 +55 Args: +56 cols: list of column names to drop. 57 -58 Args: -59 cols: list of column names to drop. -60 -61 Returns: -62 A function to be called in .transform() spark function. -63 """ -64 -65 def inner(df: DataFrame) -> DataFrame: -66 drop_df = df -67 for col in cols: -68 drop_df = drop_df.drop(col) -69 -70 return drop_df -71 -72 return inner +58 Returns: +59 A function to be called in .transform() spark function. +60 """ +61 +62 def inner(df: DataFrame) -> DataFrame: +63 drop_df = df +64 for col in cols: +65 drop_df = drop_df.drop(col) +66 +67 return drop_df +68 +69 return inner
55 @classmethod -56 def column_dropper(cls, cols: List[str]) -> Callable: -57 """Drop specific columns. +diff --git a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html index 791bbc3..2980331 100644 --- a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html @@ -667,7 +667,7 @@52 @classmethod +53 def column_dropper(cls, cols: List[str]) -> Callable: +54 """Drop specific columns. +55 +56 Args: +57 cols: list of column names to drop. 58 -59 Args: -60 cols: list of column names to drop. -61 -62 Returns: -63 A function to be called in .transform() spark function. -64 """ -65 -66 def inner(df: DataFrame) -> DataFrame: -67 drop_df = df -68 for col in cols: -69 drop_df = drop_df.drop(col) -70 -71 return drop_df -72 -73 return inner +59 Returns: +60 A function to be called in .transform() spark function. +61 """ +62 +63 def inner(df: DataFrame) -> DataFrame: +64 drop_df = df +65 for col in cols: +66 drop_df = drop_df.drop(col) +67 +68 return drop_df +69 +70 return innerInherited Members
- JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: Optional[str] = None) + JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: Optional[str] = None)diff --git a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html index 8d456b6..1fc9f3e 100644 --- a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html @@ -536,7 +536,7 @@Inherited Members
- SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000') + SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000')diff --git a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html index f0d1aba..1506a30 100644 --- a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html @@ -511,7 +511,7 @@
- SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000') + SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000')
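The JDBCExtraction signature above is a keyword-driven configuration container. A hedged instantiation sketch, assuming it behaves like a plain dataclass as its signature suggests, with placeholder connection values:

from lakehouse_engine.utils.extraction.jdbc_extraction_utils import JDBCExtraction

extraction = JDBCExtraction(
    user="db_user",                    # placeholder credentials
    password="db_password",
    url="jdbc:sap://sap-host:30015",   # placeholder JDBC URL
    dbtable="SAPHANADB.SOME_TABLE",    # placeholder source table
)

print(extraction.dbtable, extraction.extraction_timestamp)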
e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
{"fullname": "lakehouse_engine", "modulename": "lakehouse_engine", "kind": "module", "doc": "Lakehouse engine package containing all the system subpackages.\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "Package containing all the lakehouse engine algorithms.
\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "Module containing the Algorithm class.
\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "Class to define the behavior of every algorithm based on ACONs.
\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "Construct Algorithm instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "Get data quality specification object from acon.
\n\nArgs:\n spec: data quality specifications.
\n\nReturns:\n The DQSpec and the List of DQ Functions Specs.
\n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "Module to define DataLoader class.
\n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "Load data using an algorithm configuration (ACON represented as dict).
\n\nThis algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.
\n\nSince an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.
\n\nAs the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "Construct DataLoader algorithm instances.
\n\nA data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:
\n\n\n\n- input specifications (mandatory): specify how to read data.\n- transform specifications (optional): specify how to transform data.\n- data quality specifications (optional): specify how to execute the data\n quality process.\n- output specifications (mandatory): specify how to write data to the\n target.\n- terminate specifications (optional): specify what to do after writing into\n the target (e.g., optimizing target table, vacuum, compute stats, etc).\n
Args:\n acon: algorithm configuration.
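\n\nExample:\n A minimal ACON sketch, for illustration only. It assumes the ACON sections are\n named after the specification types (input_specs, transform_specs, dq_specs,\n output_specs, terminate_specs) and uses hypothetical spec ids and locations:\n\n acon = {\n \"input_specs\": [{\"spec_id\": \"sales_source\", \"read_type\": \"batch\",\n \"data_format\": \"csv\", \"location\": \"s3://my-bucket/raw/sales/\"}],\n \"output_specs\": [{\"spec_id\": \"sales_bronze\", \"input_id\": \"sales_source\",\n \"write_type\": \"append\", \"data_format\": \"delta\",\n \"location\": \"s3://my-bucket/bronze/sales/\"}],\n }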
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "Read data from an input location into a distributed dataframe.
\n\nReturns:\n An ordered dict with all the dataframes that were read.
\n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "Transform (optionally) the data that was read.
\n\nIf there isn't a transformation specification this step will be skipped, and the\noriginal dataframes that were read will be returned.\nA transformation can depend on the result of another transformation; however, keep\nin mind that if we are using a streaming source and need to enable micro batch\nprocessing, that result cannot be used as input to another transformation. Micro\nbatch processing in pyspark streaming is only available in .write(), which means\na transformation that requires micro batch processing needs to be the end of the\nprocess.
\n\nArgs:\n data: input dataframes in an ordered dict.
\n\nReturns:\n Another ordered dict with the transformed dataframes, according to the\n transformation specification.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "Process the data quality tasks for the data that was read and/or transformed.
\n\nIt supports multiple input dataframes, although just one is advisable.
\n\nIt is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\nArgs:\n data: dataframes from previous steps of the algorithm that we wish to\n run the DQ process on.
\n\nReturns:\n Another ordered dict with the validated dataframes.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "Write the data that was read and transformed (if applicable).
\n\nIt supports writing multiple datasets. However, we only recommend writing one\ndataframe. This recommendation is based on easier debugging and reproducibility:\nmixing several datasets fueled by the same algorithm creates reproducibility\nissues, tight coupling and dependencies between datasets. Having said that, there\nmay be cases where writing multiple datasets is desirable according to the use\ncase requirements. Use it accordingly.
\n\nArgs:\n data: dataframes that were read and transformed (if applicable).
\n\nReturns:\n Dataframes that were written.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "Terminate the algorithm.
\n\nArgs:\n data: dataframes that were written.
\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "Define the algorithm execution behaviour.
\n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "Module to define Data Validator class.
\n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "Validate data using an algorithm configuration (ACON represented as dict).
\n\nThis algorithm focuses on isolating Data Quality Validations from loading,\napplying a set of data quality functions to a specific input dataset,\nwithout the need to define any output specification.\nYou can use any input specification compatible with the lakehouse engine\n(dataframe, table, files, etc).
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "Construct DQValidator algorithm instances.
\n\nA data quality validator needs the following specifications to work\nproperly:\n - input specification (mandatory): specify how and what data to\n read.\n - data quality specification (mandatory): specify how to execute\n the data quality process.\n - restore_prev_version (optional): specify if, having\n delta table/files as input, they should be restored to the\n previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.
\n\nArgs:\n acon: algorithm configuration.
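\n\nExample:\n An illustrative ACON sketch only; the section names (input_spec, dq_spec,\n restore_prev_version) and all values are assumptions:\n\n acon = {\n \"input_spec\": {\"spec_id\": \"sales_bronze\", \"read_type\": \"batch\",\n \"data_format\": \"delta\", \"location\": \"s3://my-bucket/bronze/sales/\"},\n \"dq_spec\": {\"spec_id\": \"dq_sales\", \"input_id\": \"sales_bronze\",\n \"dq_type\": \"validator\",\n \"dq_functions\": [{\"function\": \"expect_column_values_to_not_be_null\",\n \"args\": {\"column\": \"id\"}}]},\n \"restore_prev_version\": False,\n }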
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "Read data from an input location into a distributed dataframe.
\n\nReturns:\n Dataframe with data that was read.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "Process the data quality tasks for the data that was read.
\n\nIt supports a single input dataframe.
\n\nIt is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\nArgs:\n data: input dataframe on which to run the DQ process.
\n\nReturns:\n Validated dataframe.
\n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "Define the algorithm execution behaviour.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "Package defining all the algorithm custom exceptions.
\n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "Exception for when the reconciliation process fails.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "Exception for when no new data is available.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "Exception for when a sensor with same sensor id already exists.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "Exception for when the restore type is not found.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "Module containing the Reconciliator class.
\n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "Type of Reconciliation.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "\n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "\n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "Transformers Available for the Reconciliation Algorithm.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "\n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "Class to define the behavior of an algorithm that checks if data reconciles.
\n\nChecking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.
\n\nThe reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).
\n\nAll of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.
\n\nNotes:\n - It is crucial that both the current and truth datasets have exactly the same\n structure.\n - You should not use 0 as yellow or red threshold, as the algorithm will verify\n if the difference between the truth and current values is bigger\n than or equal to those thresholds.\n - The reconciliation does not produce any negative values or percentages, as we\n use the absolute value of the differences. This means that the recon result\n will not indicate if it was the current values that were bigger or smaller\n than the truth values, or vice versa.
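\n\nExample:\n An illustrative metrics entry (field names follow the ReconciliatorSpec\n definition; the concrete values are assumptions):\n\n \"metrics\": [{\"metric\": \"amount\", \"aggregation\": \"sum\",\n \"type\": \"percentage\", \"yellow\": 0.05, \"red\": 0.1}]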
\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "Construct Algorithm instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "Get the source of truth (expected result) for the reconciliation process.
\n\nReturns:\n DataFrame containing the source of truth.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "Get the current results from the table that we are checking if it reconciles.
\n\nReturns:\n DataFrame containing the current results.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "Reconcile the current results against the truth dataset.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "Module to define Sensor algorithm behavior.
\n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "Class representing a sensor to check if the upstream has new data.
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "Construct Sensor instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "Execute the sensor.
\n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "This module receives a config file which is included in the wheel.
\n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "Package with the core behaviour of the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "Definitions of standard values and structures for core components.
\n"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "Formats of algorithm input.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "Generates a list containing all enum values.
\n\nReturn:\n A list with all enum values.
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "Checks if the input format exists in the enum values.
\n\nArgs:\n input_format: format to check if exists.
\n\nReturn:\n If the input format exists in our enum.
\n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "Formats of algorithm output.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "Type of notifier available.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "\n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationEmailServers", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationEmailServers", "kind": "class", "doc": "Types of email server with special behaviour.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "Parameters to be replaced in runtime.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "Define the types of read operations.
\n\nBATCH - read the data in batch mode (e.g., Spark batch).\nSTREAMING - read the data in streaming mode (e.g., Spark streaming).
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "\n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "\n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "Different modes that control how we handle compliance to the provided schema.
\n\nThese read modes map to Spark's read modes at the moment.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "Defaults used on the data quality process.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": 
"<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "Types of write operations.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "\n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "\n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "\n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "Specification of an algorithm input.
\n\nThis is very aligned with the way the execution environment connects to the sources\n(e.g., spark sources).
\n\nspec_id: spec_id of the input specification.\nread_type: ReadType type of read operation.\ndata_format: format of the input.\nsftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\n directory.\ndf_name: dataframe name.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies from where to read data in the specified format.\nenforce_schema_from_table: if we want to enforce the table schema or not, by\n providing a table name in the form of db.table
. .\nquery: sql query to execute and return the dataframe. Use it if you do not want to\n read from a file system nor from a table, but rather from a sql query instead.\nschema: dict representation of a schema of the input (e.g., Spark struct type\n schema).\nschema_path: path to a file with a representation of a schema of the input (e.g.,\n Spark struct type schema).\nwith_filepath: if we want to include the path of the file that is being read. Only\n works with the file reader (batch and streaming modes are supported).\noptions: dict with other relevant options according to the execution\n environment (e.g., spark) possible sources.\ncalculate_upper_bound: when to calculate upper bound to extract from SAP BW or not.\ncalc_upper_bound_schema: specific schema for the calculated upper_bound.\ngenerate_predicates: when to generate predicates to extract from SAP BW or not.\npredicates_add_null: if we want to include is null on partition by predicates.\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "
Transformer Specification, i.e., a single transformation amongst many.
\n\nfunction: name of the function (or callable function) to be executed.\nargs: (not applicable if using a callable function) dict with the arguments to pass\nto the function, i.e., name/value pairs with the name of the parameter of the\nfunction and the respective value.
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "\n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "Transformation Specification.
\n\nI.e., the specification that defines the many transformations to be done to the data\nthat was read.
\n\nspec_id: id of the transform specification.\ninput_id: id of the corresponding input specification.\ntransformers: list of transformers to execute.\nforce_streaming_foreach_batch_processing: sometimes, when using streaming, we want\n to force the transform to be executed in the foreachBatch function to ensure\n non-supported streaming operations can be properly executed.
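\n\nExample:\n An illustrative transform specification; the transformer function name and\n arguments are hypothetical (any function from the transformers package, or a\n callable, can be used):\n\n {\"spec_id\": \"transformed_sales\", \"input_id\": \"sales_source\",\n \"transformers\": [{\"function\": \"repartition\", \"args\": {\"num_partitions\": 10}}]}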
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "Available data quality tasks.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "\n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQType.ASSISTANT", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.ASSISTANT", "kind": "variable", "doc": "\n", "default_value": "<DQType.ASSISTANT: 'assistant'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "Defines a data quality function specification.
\n\nfunction - name of the data quality function (expectation) to execute.\nIt follows the great_expectations api https://greatexpectations.io/expectations/.\nargs - args of the function (expectation). Follow the same api as above.
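\n\nExample:\n An illustrative function spec using a standard great_expectations expectation\n (the column name is hypothetical):\n\n DQFunctionSpec(function=\"expect_column_values_to_not_be_null\",\n args={\"column\": \"id\"})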
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "\n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "Data quality overall specification.
\n\n\n\nspec_id - id of the specification.\ninput_id - id of the input specification.\ndq_type - type of DQ process to execute (e.g. validator).\ndq_functions - list of function specifications to execute.\nunexpected_rows_pk - the list of columns composing the primary key of the\n source data to identify the rows failing the DQ validations. Note: only one\n of tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It\n is mandatory to provide one of these arguments when using tag_source_data\n as True. When tag_source_data is False, this is not mandatory, but still\n recommended.\ntbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.\n Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments needs to\n be provided. It is mandatory to provide one of these arguments when using\n tag_source_data as True. When tag_source_data is False, this is not\n mandatory, but still recommended.\ngx_result_format - great expectations result format. Default: \"COMPLETE\".\n
 tag_source_data - when set to true, this will ensure that the DQ process ends by\n tagging the source data with an additional column with information about the\n DQ results. This column makes it possible to identify if the DQ run\n succeeded in general and, if not, it unlocks the insights to know what\n specific rows have made the DQ validations fail and why. Default: False.\n Note: it only works if result_sink_explode is True, gx_result_format is\n COMPLETE, fail_on_error is False (which is done automatically when\n you specify tag_source_data as True) and tbl_to_derive_pk or\n unexpected_rows_pk is configured.\n store_backend - which store_backend to use (e.g. s3 or file_system).\n local_fs_root_dir - path of the root directory. Note: only applicable for\n store_backend file_system.\n bucket - the bucket name to consider for the store_backend (store DQ artefacts).\n Note: only applicable for store_backend s3.\n data_docs_bucket - the bucket name for data docs only. When defined, it will\n supersede bucket parameter.\n expectations_store_prefix - prefix where to store expectations' data. Note: only\n applicable for store_backend s3.\n validations_store_prefix - prefix where to store validations' data. Note: only\n applicable for store_backend s3.\n data_docs_prefix - prefix where to store data_docs' data. Note: only applicable\n for store_backend s3.\n checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\n applicable for store_backend s3.\n data_asset_name - name of the data asset to consider when configuring the great\n expectations' data source.\n expectation_suite_name - name to consider for great expectations' suite.\n assistant_options - additional options to pass to the DQ assistant processor.\n result_sink_db_table - db.table_name indicating the database and table in which\n to save the results of the DQ process.\n result_sink_location - file system location in which to save the results of the\n DQ process.\n result_sink_partitions - the list of partitions to consider.\n result_sink_format - format of the result table (e.g. delta, parquet, kafka...).\n result_sink_options - extra spark options for configuring the result sink.\n E.g: can be used to configure a Kafka sink if result_sink_format is kafka.\n result_sink_explode - flag to determine if the output table/location should have\n the columns exploded (as True) or not (as False). Default: True.\n result_sink_extra_columns - list of extra columns to be exploded (following\n the pattern \".*\") or columns to be selected. It is only used when\n result_sink_explode is set to True.\n source - name of data source, to be easier to identify in analysis. If not\n specified, it is set as default. This will be only used\n when result_sink_explode is set to True.\n fail_on_error - whether to fail the algorithm if the validations of your data in\n the DQ process failed.\n cache_df - whether to cache the dataframe before running the DQ process or not.\n critical_functions - functions that should not fail. When this argument is\n defined, fail_on_error is nullified.\n max_percentage_failure - percentage of failure that should be allowed.\n This argument has priority over both fail_on_error and critical_functions.
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tassistant_options: Optional[dict] = None,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": ".*\") or columns to be selected. It is only used when\n result_sink_explode is set to True.\n source - name of data source, to be easier to identify in analysis. If not\n specified, it is set as default . This will be only used\n when result_sink_explode is set to True.\n fail_on_error - whether to fail the algorithm if the validations of your data in\n the DQ process failed.\n cache_df - whether to cache the dataframe before running the DQ process or not.\n critical_functions - functions that should not fail. When this argument is\n defined, fail_on_error is nullified.\n max_percentage_failure - percentage of failure that should be allowed.\n This argument has priority over both fail_on_error and critical_functions. Options for a merge operation.
\n\nmerge_predicate: predicate to apply to the merge operation so that we can check if a\n new record corresponds to a record already included in the historical data.\ninsert_only: indicates if the merge should only insert data (e.g., deduplicate\n scenarios).\ndelete_predicate: predicate to apply to the delete operation.\nupdate_predicate: predicate to apply to the update operation.\ninsert_predicate: predicate to apply to the insert operation.\nupdate_column_set: rules to apply to the update operation which allows to set the\n value for each column to be updated.\n (e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )\ninsert_column_set: rules to apply to the insert operation which allows to set the\n value for each column to be inserted.\n (e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "\n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "Specification of an algorithm output.
\n\nThis is very aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).
\n\nspec_id: id of the output specification.\ninput_id: id of the corresponding input specification.\nwrite_type: type of write operation.\ndata_format: format of the output. Defaults to DELTA.\ndb_table: table name in the form of db.table
. .\nlocation: uri that identifies from where to write data in the specified format.\npartitions: list of partition input_col names.\nmerge_opts: options to apply to the merge operation.\nstreaming_micro_batch_transformers: transformers to invoke for each streaming micro\n batch, before writing (i.e., in Spark's foreachBatch structured\n streaming function). Note: the lakehouse engine manages this for you, so\n you don't have to manually specify streaming transformations here, so we don't\n advise you to manually specify transformations through this parameter. Supply\n them as regular transformers in the transform_specs sections of an ACON.\nstreaming_once: if the streaming query is to be executed just once, or not,\n generating just one micro batch.\nstreaming_processing_time: if streaming query is to be kept alive, this indicates\n the processing time of each micro batch.\nstreaming_available_now: if set to True, set a trigger that processes all available\n data in multiple batches then terminates the query.\n When using streaming, this is the default trigger that the lakehouse-engine will\n use, unless you configure a different one.\nstreaming_continuous: set a trigger that runs a continuous query with a given\n checkpoint interval.\nstreaming_await_termination: whether to wait (True) for the termination of the\n streaming query (e.g. timeout or exception) or not (False). Default: True.\nstreaming_await_termination_timeout: a timeout to set to the\n streaming_await_termination. Default: None.\nwith_batch_id: whether to include the streaming batch id in the final data, or not.\n It only takes effect in streaming mode.\noptions: dict with other relevant options according to the execution environment\n (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\n streaming, etc.\nstreaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\n but for the DQ functions to be executed. Used internally by the lakehouse\n engine, so you don't have to supply DQ functions through this parameter. Use the\n dq_specs of the acon instead.\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "
Terminator Specification.
\n\nI.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.
\n\nspec_id: id of the terminate specification.\nfunction: terminator function to execute.\nargs: arguments of the terminator function.\ninput_id: id of the corresponding output specification (Optional).
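A minimal sketch of building a TerminatorSpec for one of the operations mentioned above; the function name and ids are illustrative assumptions, not values confirmed by this reference.

```python
from lakehouse_engine.core.definitions import TerminatorSpec

# Hypothetical terminator that runs after the output identified by "sales_silver";
# "vacuum" is used purely as an example of a terminator function.
terminator_spec = TerminatorSpec(
    function="vacuum",        # terminator function to execute
    args={},                  # arguments of the terminator function, if any
    input_id="sales_silver",  # id of the corresponding output specification (optional)
)
```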
\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "Reconciliator Specification.
\n\nmetrics: list of metrics in the form of:\n    [{\n    metric: name of the column present in both truth and current datasets,\n    aggregation: sum, avg, max, min, ...,\n    type: percentage or absolute,\n    yellow: value,\n    red: value\n    }].\nrecon_type: reconciliation type (percentage or absolute). Percentage calculates\n    the difference between truth and current results as a percentage ((x - y) / x), and\n    absolute calculates the raw difference (x - y).\ntruth_input_spec: input specification of the truth data.\ncurrent_input_spec: input specification of the current results data.\ntruth_preprocess_query: additional query on top of the truth input data to\n    preprocess the truth data before it gets fed into the reconciliation process.\n    Important note: you need to assume that the data out of\n    the truth_input_spec is referenceable by a table called 'truth'.\ntruth_preprocess_query_args: optional dict having the functions/transformations to\n    apply on top of the truth_preprocess_query and respective arguments. Note: cache\n    is applied on the Dataframe by default. To turn this default behavior\n    off, pass \"truth_preprocess_query_args\": [].\ncurrent_preprocess_query: additional query on top of the current results input data\n    to preprocess the current results data before it gets fed into the\n    reconciliation process. Important note: you need to assume that the data out of\n    the current_results_input_spec is referenceable by a table called 'current'.\ncurrent_preprocess_query_args: optional dict having the functions/transformations to\n    apply on top of the current_preprocess_query and respective arguments. Note:\n    cache is applied on the Dataframe by default. To turn this default\n    behavior off, pass \"current_preprocess_query_args\": [].\nignore_empty_df: optional boolean to ignore the recon process if the source & target\n    dataframes are empty; in that case the recon will exit with success code (passed).
\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "Data Quality Validator Specification.
\n\ninput_spec: input specification of the data to be checked/validated.\ndq_spec: data quality specification.\nrestore_prev_version: specify if, having delta table/files as input, they should be\n    restored to the previous version if the data quality process fails. Note: this\n    is only considered if fail_on_error is kept as True.
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "SQL definitions statements.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "File Manager s3 api keys.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "Sensor Specification.
\n\nsensor_id: sensor id.\nassets: a list of assets that are considered as available to\n    consume downstream after this sensor has status\n    PROCESSED_NEW_DATA.\ncontrol_db_table_name: db.table to store sensor metadata.\ninput_spec: input specification of the source to be checked for new data.\npreprocess_query: SQL query to transform/filter the result from the\n    upstream. Consider that we should refer to 'new_data' whenever\n    we are referring to the input of the sensor. E.g.:\n    \"SELECT dummy_col FROM new_data WHERE ...\"\ncheckpoint_location: optional location to store checkpoints to resume\n    from. These checkpoints follow the same strategy as Spark checkpoints.\n    For Spark readers that do not support checkpoints, use the\n    preprocess_query parameter to form a SQL query to filter the result\n    from the upstream accordingly.\nfail_on_empty_result: whether the sensor should throw an error if there is no new\n    data in the upstream. Default: True.
\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "Create SensorSpec from acon.
\n\nArgs:\n acon: sensor ACON.
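A hedged sketch of creating a SensorSpec from a sensor ACON; the ACON keys shown here simply mirror the SensorSpec fields documented above and are assumptions, not a schema confirmed by this reference.

```python
from lakehouse_engine.core.definitions import SensorSpec

# Hypothetical sensor ACON: keys mirror the SensorSpec fields (assumption).
acon = {
    "sensor_id": "sensor_sales_orders",
    "assets": ["sales_orders"],
    "control_db_table_name": "my_db.sensor_control",
    # nested input specification keys are also assumptions for illustration
    "input_spec": {"spec_id": "sales_source", "data_format": "delta", "db_table": "bronze.sales"},
    "preprocess_query": "SELECT * FROM new_data WHERE order_date > '2024-01-01'",
    "fail_on_empty_result": False,
}

sensor_spec = SensorSpec.create_from_acon(acon=acon)
```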
\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "Status for a sensor.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "\n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "\n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "Defaults used on consuming data from SAP Logchain.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "Archive types.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "Generates a list containing all enum values.
\n\nReturn:\n A list with all enum values.
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "Checks if the restore type exists in the enum values.
\n\nArgs:\n restore_type: restore type to check if exists.
\n\nReturn:\n If the restore type exists in our enum.
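A small usage sketch of the two helpers documented above; the expected outputs in the comments are based on the enum values listed further below.

```python
from lakehouse_engine.core.definitions import RestoreType

# Expected to list every supported retrieval tier, e.g. ['Bulk', 'Standard', 'Expedited'].
print(RestoreType.values())

# Check whether a raw string maps to one of those tiers.
print(RestoreType.exists("Expedited"))  # True
print(RestoreType.exists("Glacier"))    # False, not a supported restore type
```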
\n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "Archive types.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "Module to take care of creating a singleton of the execution environment class.
\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "Represents the basic resources regarding the engine execution environment.
\n\nCurrently, it is solely used to encapsulate the logic to get a Spark session.
\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "Get or create an execution environment session (currently Spark).
\n\nIt instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.
\n\nArgs:\n session: spark session.\n enable_hive_support: whether to enable hive support or not.\n app_name: application name.\n config: extra spark configs to supply to the spark session.
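A short usage sketch, assuming the singleton Spark session is afterwards reachable through ExecEnv.SESSION; the app name and the extra config are illustrative.

```python
from lakehouse_engine.core.exec_env import ExecEnv

# Create (or reuse) the singleton Spark session, overriding one config on top
# of the engine defaults (values are illustrative).
ExecEnv.get_or_create(
    app_name="my_lakehouse_job",
    config={"spark.sql.shuffle.partitions": "200"},
)

spark = ExecEnv.SESSION  # singleton session, accessible anywhere in the engine
```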
\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "Module representing an executable lakehouse engine component.
\n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "Abstract class defining the behaviour of an executable component.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "Define the executable component behaviour.
\n\nE.g., the behaviour of an algorithm inheriting from this.
\n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "File manager module.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "Set of actions to manipulate files in several ways.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "Construct FileManager algorithm instances.
\n\nArgs:\n configs: configurations for the FileManager algorithm.
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.get_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.get_function", "kind": "function", "doc": "Get a specific function to execute.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "Delete objects and 'directories' in s3.
\n\nIf dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "Copies objects and 'directories' in s3.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore", "kind": "function", "doc": "Request the restore of archived data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.check_restore_status", "kind": "function", "doc": "Check the restore status of archived data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "Request and wait for the restore to complete, polling the restore status.
\n\nAfter the restore is done, copy the restored files to the destination.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "Set of actions to restore archives.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "Check the restore status of archived data.
\n\nArgs:\n source_bucket: name of bucket to check the restore status.\n source_object: object to check the restore status.
\n\nReturns:\n A dict containing the amount of objects in each status.
\n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "Request the restore of archived data.
\n\nArgs:\n    source_bucket: name of bucket to perform the restore.\n    source_object: object to be restored.\n    restore_expiration: restore expiration in days.\n    retrieval_tier: type of restore, possible values are:\n    Bulk, Standard or Expedited.\n    dry_run: if dry_run is set to True the function will print a dict with\n    all the paths that would be restored based on the given keys.
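A hedged sketch of requesting a restore with the parameters documented above; bucket and object names are placeholders.

```python
from lakehouse_engine.core.file_manager import ArchiveFileManager

# Dry run first: only prints what would be restored for the given prefix.
ArchiveFileManager.request_restore(
    source_bucket="my-archive-bucket",     # placeholder bucket
    source_object="landing/sales/2021/",   # placeholder object/prefix
    restore_expiration=7,                  # keep the restored copy for 7 days
    retrieval_tier="Bulk",                 # Bulk, Standard or Expedited
    dry_run=True,
)

# Later, poll how many objects are in each restore status.
print(ArchiveFileManager.check_restore_status("my-archive-bucket", "landing/sales/2021/"))
```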
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "Request and wait for the restore to complete, polling the restore status.
\n\nArgs:\n    source_bucket: name of bucket to perform the restore.\n    source_object: object to be restored.\n    restore_expiration: restore expiration in days.\n    retrieval_tier: type of restore, possible values are:\n    Bulk, Standard or Expedited.\n    dry_run: if dry_run is set to True the function will print a dict with\n    all the paths that would be restored based on the given keys.
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "Module to define Sensor Manager classes.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "Class to control the Sensor execution.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "Check if sensor has acquired new data.
\n\nArgs:\n sensor_id: sensor id.\n control_db_table_name: db.table to control sensor runs.
\n\nReturns:\n    True if the sensor has acquired new data, otherwise False.
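A minimal usage sketch; the sensor id and control table are placeholders.

```python
from lakehouse_engine.core.sensor_manager import SensorControlTableManager

has_new_data = SensorControlTableManager.check_if_sensor_has_acquired_data(
    sensor_id="sensor_sales_orders",               # placeholder sensor id
    control_db_table_name="my_db.sensor_control",  # placeholder control table
)
print(has_new_data)
```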
\n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "Control sensor execution storing the execution data in a delta table.
\n\nArgs:\n sensor_spec: sensor spec containing all sensor\n information we need to update the control status.\n status: status of the sensor.\n upstream_key: upstream key (e.g., used to store an attribute\n name from the upstream so that new data can be detected\n automatically).\n upstream_value: upstream value (e.g., used to store the max\n attribute value from the upstream so that new data can be\n detected automatically).
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "Read data from delta table containing sensor status info.
\n\nArgs:\n sensor_id: sensor id. If this parameter is defined search occurs\n only considering this parameter. Otherwise, it considers sensor\n assets and checkpoint location.\n control_db_table_name: db.table to control sensor runs.\n assets: list of assets that are fueled by the pipeline\n where this sensor is.
\n\nReturn:\n Row containing the data for the provided sensor_id.
\n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "Class to deal with Sensor Upstream data.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "Generates a sensor preprocess query based on timestamp logic.
\n\nArgs:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder
\n\n?upstream_value
so that\n    it can be replaced by the upstream_value in the\n    control_db_table_name for this specific sensor_id.\n    control_db_table_name: db.table to retrieve the last status change\n    timestamp. This is only relevant for the jdbc sensor.\n    upstream_key: the key of custom sensor information\n    to control how to identify new data from the\n    upstream (e.g., a time column in the upstream).\n    upstream_value: value for custom sensor\n    to identify new data from the upstream\n    (e.g., the value of a time present in the upstream).\n    If None, we will set the default value.\n    Note: this parameter is used just to override the\n    default value -2147483647.\n    upstream_table_name: value for custom sensor\n    to query new data from the upstream.\n    If None, we will set the default value,\n    our sensor_new_data view.\n\nReturn:\n    The query string.
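A hedged sketch of building such a preprocess query with the documented ?upstream_value placeholder; the sensor id, table and column names are placeholders.

```python
from lakehouse_engine.core.sensor_manager import SensorUpstreamManager

# Keep only rows newer than the last value recorded for this sensor in the
# control table; ?upstream_value is replaced by the engine.
query = SensorUpstreamManager.generate_filter_exp_query(
    sensor_id="sensor_sales_orders",
    filter_exp="load_timestamp > '?upstream_value'",
    control_db_table_name="my_db.sensor_control",
    upstream_key="load_timestamp",
)
print(query)
```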
\n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "Generates a query to be used for a sensor having other sensor as upstream.
\n\nArgs:\n sensor_id: sensor id.
\n\nReturn:\n The query string.
\n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "Read new data from the upstream into the sensor 'new_data_df'.
\n\nArgs:\n sensor_spec: sensor spec containing all sensor information.
\n\nReturn:\n    An empty dataframe if it doesn't have new data, otherwise the new data.
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "Get new data from upstream df if it's present.
\n\nArgs:\n new_data_df: DataFrame possibly containing new data.
\n\nReturn:\n Optional row, present if there is new data in the upstream,\n absent otherwise.
\n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "Generates a sensor query based in the SAP Logchain table.
\n\nArgs:\n    chain_id: chain id to query the status on SAP.\n    dbtable: db.table to retrieve the data to\n    check if the sap chain is already finished.\n    status: SAP chain status value that marks a successful execution\n    (default: 'G', i.e., green).\n    engine_table_name: table name exposed with the SAP LOGCHAIN data.\n    This table will be used in the jdbc query.
\n\nReturn:\n The query string.
\n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "Table manager module.
\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "Set of actions to manipulate tables/views in several ways.
\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "Construct TableManager algorithm instances.
\n\nArgs:\n configs: configurations for the TableManager algorithm.
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "Get a specific function to execute.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "Create a new table or view on metastore.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "Create multiple tables or views on metastore.
\n\nIn this function the path to the ddl files can be separated by comma.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "Compute table statistics.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "Delete table function deletes table from metastore and erases all data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "Delete view function deletes view from metastore and erases all data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "Truncate function erases all data but keeps metadata.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "Vacuum function erases older versions from Delta Lake tables or locations.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "Describe function describes metadata from some table or view.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "Optimize function optimizes the layout of Delta Lake data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "Execute multiple statements in multiple sql files.
\n\nIn this function the path to the files is separated by comma.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "Execute sql commands separated by semicolon (;).
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "Show Table Properties.
\n\nReturns: a dataframe with the table properties.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "Get the primary key of a particular table.
\n\nReturns: the list of columns that are part of the primary key.
\n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "Run the repair table command.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "Run the delete where command.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "Package to define data quality processes available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "kind": "module", "doc": "Module containing the definition of a data assistant.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant", "kind": "class", "doc": "Class containing the data assistant.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant.run_data_assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant.run_data_assistant", "kind": "function", "doc": "Entrypoint to run the data assistant.
\n\nBased on the data, it uses GE Onboarding Data Assistant to generate expectations\nthat can be applied to the data. Then, it returns the generated expectations\nand, depending on your configuration, it can display plots of the metrics,\nexpectations and also display or store the profiling of the data, for you to get\na better sense of it.
\n\nArgs:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n assistant_options: additional options to pass to the DQ assistant processor.\n data: the input dataframe for which the DQ is running.\n profile_file_name: file name for storing the profiling html file.
\n\nReturns:\n The context with the expectation suite stored.
\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tassistant_options: dict,\tdata: pyspark.sql.dataframe.DataFrame,\tprofile_file_name: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "Package containing custom DQ expectations available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "Expectation to check if column 'a' is lower or equal than column 'b'.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "Asserts that column 'A' is lower or equal than column 'B'.
\n\nAdditionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "Expect values in column A to be lower or equal than column B.
\n\nArgs:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.
\n\nKeyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n
\n\nBOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n    include_config: If True (default), then include the expectation config\n    as part of the result object.\n    catch_exceptions: If True, then catch exceptions and\n    include them as part of the result object. Default: False.\n    meta: A JSON-serializable dictionary (nesting allowed)\n    that will be included in the output without modification.\n\nReturns:\n    An ExpectationSuiteValidationResult.
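A hedged sketch of invoking this custom expectation on an already prepared Great Expectations validator (for example one obtained through Validator.get_dq_validator, described further below); the method name follows this module's name, and the column names and margin are illustrative.

```python
def check_net_leq_gross(validator):
    """Run the custom expectation on a prepared GE validator (illustrative columns)."""
    return validator.expect_column_pair_a_to_be_smaller_or_equal_than_b(
        column_A="net_amount",
        column_B="gross_amount",
        margin=0.01,  # tolerate 'A' <= 'B' + margin
    )
```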
\n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "Expectation to check if column value is a date within a timeframe.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "Asserts that column values are a date that isn't older than a given date.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "Expect value in column to be date that is not older than a given time.
\n\nSince timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.
\n\nArgs:\n column: Name of column to validate\n Note: Column must be of type Date, Timestamp or String (with Timestamp format).\n Format: yyyy-MM-ddTHH:mm:ss\n timeframe: dict with the definition of the timeframe.\n kwargs: dict with additional parameters.
\n\nKeyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n
\n\nBOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n    include_config: If True (default), then include the expectation config\n    as part of the result object.\n    catch_exceptions: If True, then catch exceptions and\n    include them as part of the result object. Default: False.\n    meta: A JSON-serializable dictionary (nesting allowed)\n    that will be included in the output without modification.\n\nReturns:\n    An ExpectationSuiteValidationResult.
\n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "Expectation to check if column 'a' equals 'b', or 'c'.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "Expectation metric definition.
\n\nThis expectation asserts that column 'a' must equal to column 'b' or column 'c'.\nIn addition to this it is possible to validate that column 'b' or 'c' match a regex.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "MultiColumn Expectation.
\n\nExpect that the column 'a' is equal to 'b' when this is\nnot empty; otherwise 'a' must be equal to 'c'.
\n\nArgs:\n column_list: The column names to evaluate.
\n\nKeyword Args:\n ignore_row_if: default to \"never\".\n result_format: Which output mode to use:\n
\n\nBOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY.\n    Default set to BASIC.\n    include_config: If True, then include the expectation\n    config as part of the result object.\n    Default set to True.\n    catch_exceptions: If True, then catch exceptions\n    and include them as part of the result object.\n    Default set to False.\n\nReturns:\n    An ExpectationSuiteValidationResult.
\n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "Expectation to check if aggregated column satisfy the condition.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "Expect agg of column to satisfy the condition specified.
\n\nArgs:\n template_dict: dict with the following keys:\n column (column to check sum).\n group_column_list (group by column names to be listed).\n condition (how to validate the aggregated value eg: between,\n greater, lesser).\n max_value (maximum allowed value).\n min_value (minimum allowed value).\n agg_type (sum/count/max/min).
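A hedged sketch of a template_dict built only from the keys documented above, plus how it might be passed to the expectation on a prepared GE validator; the column and group names and the bounds are placeholders.

```python
# Illustrative template_dict: validate that the summed amount per store stays
# within a range (column/group names and bounds are placeholders).
template_dict = {
    "column": "amount",
    "group_column_list": ["store_id"],
    "condition": "between",
    "min_value": 0,
    "max_value": 1000000,
    "agg_type": "sum",
}

def check_amount_per_store(validator):
    """Run the query expectation on a prepared GE validator (sketch)."""
    return validator.expect_queried_column_agg_value_to_be(template_dict=template_dict)
```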
\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "Validates that a configuration has been set.
\n\nArgs:\n configuration (OPTIONAL[ExpectationConfiguration]):\n An optional Expectation Configuration entry.
\n\nReturns:\n None. Raises InvalidExpectationConfigurationError
\n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "Module containing the class definition of the Data Quality Factory.
\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "Class for the Data Quality Factory.
\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "Run the specified data quality process on a dataframe.
\n\nBased on the dq_specs, we apply the defined expectations on top of the dataframe\nto run the necessary validations and then output the result of\nthe data quality process.
\n\nArgs:\n dq_spec: data quality specification.\n data: input dataframe to run the dq process on.
\n\nReturns:\n The DataFrame containing the results of the DQ process.
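A minimal sketch of running the DQ process on a dataframe, assuming a DQSpec built elsewhere (e.g., from the dq_specs of an ACON).

```python
from lakehouse_engine.dq_processors.dq_factory import DQFactory

def run_quality_checks(dq_spec, input_df):
    """Apply the given DQSpec to the dataframe and return the resulting dataframe."""
    return DQFactory.run_dq_process(dq_spec=dq_spec, data=input_df)
```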
\n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "Package defining all the DQ custom exceptions.
\n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "Exception for when the data quality validations fail.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "Exception for when the checkpoint results parsing fail.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "Module containing the definition of a data quality validator.
\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "Class containing the data quality validator.
\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "Get a validator according to the specification.
\n\nWe use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.
\n\nArgs:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: run time batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n dq_functions: a list of DQFunctionSpec to consider in the expectation suite.\n critical_functions: list of critical expectations in the expectation suite.
\n\nReturns:\n The validator with the expectation suite stored.
\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "Tags the source dataframe with a new column having the DQ results.
\n\nArgs:\n source_pk: the primary key of the source data.\n source_df: the source dataframe to be tagged with DQ results.\n results_df: dq results dataframe.
\n\nReturns: a dataframe tagged with the DQ results.
\n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "Contract of the lakehouse engine with all the available functions to be executed.
\n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "Load data using the DataLoader algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
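A minimal usage sketch; the ACON path is a placeholder, and the inline-ACON variant is left commented because its structure is documented elsewhere.

```python
from lakehouse_engine.engine import load_data

# Run the DataLoader algorithm from an ACON file (path is a placeholder).
load_data(acon_path="s3://my-bucket/acons/load_sales.json")

# Alternatively, pass the ACON dict directly from a notebook or app:
# load_data(acon=my_acon_dict)
```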
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "Execute the Reconciliator algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "Execute the DQValidator algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "Manipulate tables/views using Table Manager algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "Manipulate s3 files using File Manager algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "Execute a sensor based on a Sensor Algorithm Configuration.
\n\nA sensor is useful to check if an upstream system has new data.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "Update internal sensor status.
\n\nUpdate the sensor status in the control table,\nit should be used to tell the system\nthat the sensor has processed all new data that was previously identified,\nhence updating the shifted sensor status.\nUsually used to move from
\n\nSensorStatus.ACQUIRED_NEW_DATA to SensorStatus.PROCESSED_NEW_DATA,\nbut there might be scenarios - still to identify -\nwhere we can update the sensor status from/to different statuses.\n\nArgs:\n    sensor_id: sensor id.\n    control_db_table_name: db.table to store sensor checkpoints.\n    status: status of the sensor.\n    assets: a list of assets that are considered as available to\n    consume downstream after this sensor has status\n    PROCESSED_NEW_DATA.
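A hedged end-to-end sketch combining execute_sensor with update_sensor_status, assuming the boolean returned by execute_sensor indicates that new data was acquired; ids, table names, assets and the ACON path are placeholders.

```python
from lakehouse_engine.engine import execute_sensor, update_sensor_status

# 1. Run the sensor (assumed to return True when the upstream has new data).
if execute_sensor(acon_path="s3://my-bucket/acons/sensor_sales.json"):
    # 2. ... run the actual pipeline that consumes the new data here ...
    # 3. Flag the data as processed so downstream consumers can pick it up.
    update_sensor_status(
        sensor_id="sensor_sales_orders",
        control_db_table_name="my_db.sensor_control",
        status="PROCESSED_NEW_DATA",
        assets=["sales_orders"],
    )
```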
\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "Generates a preprocess query to be used in a sensor configuration.
\n\nArgs:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder ?default_upstream_key and\n ?default_upstream_value, so that it can be replaced by the\n respective values in the control_db_table_name for this specific\n sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information to control how to\n identify new data from the upstream (e.g., a time column in the\n upstream).\n upstream_value: the upstream value\n to identify new data from the upstream (e.g., the value of a time\n present in the upstream).\n upstream_table_name: value for custom sensor\n to query new data from the upstream\n If none we will set the default value,\n our
sensor_new_data view.\n\nReturn:\n    The query string.
\n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "Generates a sensor query based in the SAP Logchain table.
\n\nArgs:\n    chain_id: chain id to query the status on SAP.\n    dbtable: db.table to retrieve the data to\n    check if the sap chain is already finished.\n    status: SAP chain status value that marks a successful execution\n    (default: 'G', i.e., green).\n    engine_table_name: table name exposed with the SAP LOGCHAIN data.\n    This table will be used in the jdbc query.
\n\nReturn:\n The query string.
\n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "Send a notification using a notifier.
\n\nArgs:\n args: arguments for the notifier.
\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "Input and Output package responsible for the behaviour of reading and writing.
\n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "Package defining all the io custom exceptions.
\n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "Exception for when the input of an incremental filter is not found.
\n\nThis may occur when tables are being loaded incrementally, taking the increment\ndefinition from a specific table, but that table does not exist yet, most likely\nbecause it has not been loaded for the first time yet.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "Exception for when a user provides a wrong I/O format.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "Exception for when a user provides a not supported operation.
\n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "Defines abstract reader behaviour.
\n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "Abstract Reader class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "Construct Reader instances.
\n\nArgs:\n input_spec: input specification for reading data.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "Abstract read method.
\n\nReturns:\n A dataframe read according to the input specification.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "Module for reader factory.
\n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "Class for reader factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "Get data according to the input specification following a factory pattern.
\n\nArgs:\n spec: input specification to get the data.
\n\nReturns:\n A dataframe containing the data.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "Readers package to define reading behaviour.
\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "Module to define behaviour to read from dataframes.
\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "Class to read data from a dataframe.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "Construct DataFrameReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "Read data from a dataframe.
\n\nReturns:\n A dataframe containing the data from a dataframe previously\n computed.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "Module to define behaviour to read from files.
\n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "Class to read from files.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "Construct FileReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "Read file data.
\n\nReturns:\n A dataframe containing the data from the files.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "Module to define behaviour to read from JDBC sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "Class to read from JDBC source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "Construct JDBCReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "Read data from JDBC source.
\n\nReturns:\n A dataframe containing the data from the JDBC source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "Module to define behaviour to read from Kafka.
\n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "Class to read from Kafka.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "Construct KafkaReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "Read Kafka data.
\n\nReturns:\n A dataframe containing the data from Kafka.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "Module to define behaviour to read from a query.
\n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "Class to read data from a query.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "Construct QueryReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "Read data from a query.
\n\nReturns:\n A dataframe containing the data from the query.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "Module to define behaviour to read from SAP B4 sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "Class to read from SAP B4 source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "Construct SAPB4Reader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "Read data from SAP B4 source.
\n\nReturns:\n A dataframe containing the data from the SAP B4 source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "Module to define behaviour to read from SAP BW sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "Class to read from SAP BW source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "Construct SAPBWReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "Read data from SAP BW source.
\n\nReturns:\n A dataframe containing the data from the SAP BW source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "Module to define behaviour to read from SFTP.
\n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "Class to read from SFTP.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "Construct SFTPReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "Read SFTP data.
\n\nReturns:\n A dataframe containing the data from SFTP.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "Module to define behaviour to read from tables.
\n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "Class to read data from a table.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "Construct TableReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "Read data from a table.
\n\nReturns:\n A dataframe containing the data from the table.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "Defines abstract writer behaviour.
\n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "Abstract Writer class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "Construct Writer instances.
\n\nArgs:\n output_spec: output specification to write data.\n df: dataframe to write.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "Abstract write method.
\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "Define how to write a streaming micro batch after transforming it.
\n\nThis function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.
\n\nArgs:\n kwargs: any keyword arguments.
\n\nReturns:\n A function to be executed in the foreachBatch spark write method.
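A conceptual sketch of the inner-function pattern described above, not the engine's actual implementation; the "location" keyword and the delta append are assumptions:

    from pyspark.sql import DataFrame

    def write_transformed_micro_batch_sketch(**kwargs):
        # Return a function suitable for foreachBatch: it receives each micro
        # batch dataframe plus its batch id, transforms it and writes it out.
        def inner(batch_df: DataFrame, batch_id: int) -> None:
            transformed = batch_df  # apply the configured transformations here
            transformed.write.format("delta").mode("append").save(kwargs["location"])

        return inner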
\n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "Get the result of the transformations applied to a micro batch dataframe.
\n\nArgs:\n output_spec: output specification associated with the writer.\n batch_df: batch dataframe (given from streaming foreachBatch).\n batch_id: id of the batch (given from streaming foreachBatch).\n data: list of all dfs generated on previous steps before writer\n to be available on micro batch transforms.
\n\nReturns:\n The transformed dataframe.
\n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "Define which streaming trigger will be used.
\n\nArgs:\n output_spec: output specification.
\n\nReturns:\n A dict containing streaming trigger.
\n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "Run the data quality process in a streaming micro batch dataframe.
\n\nIterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.
\n\nArgs:\n df: the dataframe in which to run the dq process on.\n dq_spec: data quality specification.
\n\nReturns:\n The validated dataframe.
\n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "Module for writer factory.
\n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "Class for writer factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "Get a writer according to the output specification using a factory pattern.
\n\nArgs:\n spec: output specification to write data.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n\nReturns:\n Writer: writer that will write the data.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "Package containing the writers responsible for writing data.
\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "Module to define behaviour to write to console.
\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "Class to write data to console.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "Construct ConsoleWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "Write data to console.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "Module to define behaviour to write to dataframe.
\n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "Class to write data to dataframe.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "Construct DataFrameWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "Write data to dataframe.
\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "Module to define the behaviour of delta merges.
\n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "Class to merge data using delta lake.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "Construct DeltaMergeWriter instances.
\n\nArgs:\n output_spec: output specification containing merge options and\n relevant information.\n df: the dataframe containing the new data to be merged.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "Merge new data with current data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "Module to define behaviour to write to files.
\n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "Class to write data to files.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "Construct FileWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "Write data to files.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "Module that defines the behaviour to write to JDBC targets.
\n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "Class to write to JDBC targets.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "Construct JDBCWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "Write data into JDBC target.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "Module that defines the behaviour to write to Kafka.
\n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "Class to write to a Kafka target.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "Construct KafkaWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "Write data to Kafka.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "Module that defines the behaviour to write to tables.
\n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "Class to write to a table.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "Construct TableWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "Write data to a table.
\n\nAfter the write operation we repair the table (e.g., update partitions).\nHowever, there is a caveat: this repair operation is not reachable when running\nin long-running streaming mode.\nTherefore, we recommend not using the TableWriter with formats other than\ndelta lake in those scenarios (delta lake does not need msck repair).\nSo, you can either: 1) use the delta lake format for the table; or 2) use the FileWriter\nand run the repair with a certain frequency in a separate task of your\npipeline.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "Defines change data feed processor behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "Change data feed processor class.
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "Expose CDF to external location.
\n\nArgs:\n spec: terminator specification.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "Delete old data from cdf delta table.
\n\nArgs:\n spec: terminator specifications.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "Vacuum old data from cdf delta table.
\n\nArgs:\n spec: terminator specifications.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "Module with dataset optimizer terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "Class with dataset optimizer terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "Optimize a dataset based on a set of pre-conceived optimizations.
\n\nMost of the time the dataset is a table, but it can also be solely file-based.
\n\nArgs:\n db_table: database_name.table_name.\n location: dataset/table filesystem location.\n compute_table_stats: to compute table statistics or not.\n vacuum: (delta lake tables only) whether to vacuum the delta lake\n table or not.\n vacuum_hours: (delta lake tables only) number of hours to consider\n in vacuum operation.\n optimize: (delta lake tables only) whether to optimize the table or\n not. Custom optimize parameters can be supplied through ExecEnv (Spark)\n configs\n optimize_where: expression to use in the optimize function.\n optimize_zorder_col_list: (delta lake tables only) list of\n columns to consider in the zorder optimization process. Custom optimize\n parameters can be supplied through ExecEnv (Spark) configs.\n debug: flag indicating if we are just debugging this for local\n tests and therefore pass through all the exceptions to perform some\n assertions in local tests.
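A usage sketch; the table name and zorder column are illustrative assumptions, and the other documented defaults (compute stats, vacuum, optimize) are kept:

    from lakehouse_engine.terminators.dataset_optimizer import DatasetOptimizer

    # Vacuum the last 7 days (168 hours), optimize and zorder by customer_id,
    # and compute table statistics for an illustrative delta table.
    DatasetOptimizer.optimize_dataset(
        db_table="my_db.sales_orders",
        vacuum_hours=168,
        optimize_zorder_col_list=["customer_id"],
    )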
\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "Module with notification terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "Abstract Notification class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "Construct Notification instances.
\n\nArgs:\n notification_spec: notification specification.
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "Abstract create notification method.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "Abstract send notification method.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "Check if given notification is a failure notification.
\n\nArgs:\n spec: spec to validate if it is a failure notification.
\n\nReturns:\n A boolean telling if the notification is a failure notification.
\n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "Module for notifier factory.
\n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "Class for notification factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "Get a notifier according to the terminator specs using a factory.
\n\nArgs:\n spec: terminator specification.
\n\nReturns:\n Notifier: notifier that will handle notifications.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "Check if it is necessary to send a failure notification and generate it.
\n\nArgs:\n spec: list of termination specs.\n exception: exception that caused the failure.
\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "Notifications module.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "Module with email notifier.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "Base Notification class.
\n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "Construct Email Notification instance.
\n\nArgs:\n notification_spec: notification specification.
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "Creates the notification to be sent.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "Sends the notification by using a series of methods.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "Email notification templates.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "Templates for notifications.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "Defines terminator behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "Sensor Terminator class.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "Update internal sensor status.
\n\nUpdate the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the shifted sensor status.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still\nto identify - where we can update the sensor status from/to different statuses.\n\nArgs:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.
\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "Defines terminator behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "Spark Terminator class.
\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "Terminate spark session.
\n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "Module with the factory pattern to return terminators.
\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "TerminatorFactory class following the factory pattern.
\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "Execute a terminator following the factory pattern.
\n\nArgs:\n spec: terminator specification.\n df: dataframe to be used in the terminator. Needed when a\n terminator requires one dataframe as input.
\n\nReturns:\n Transformer function to be executed in .transform() spark function.
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "Package to define transformers available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "Aggregators module.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "Class containing all aggregation functions.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "Get the maximum value of a given column of a dataframe.
\n\nArgs:\n input_col: name of the input column.\n output_col: name of the output column (defaults to \"latest\").
\n\nReturns:\n A function to be executed in the .transform() spark function.
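A minimal usage sketch with a small illustrative dataframe and column name:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.aggregators import Aggregators

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("2024-01-01",), ("2024-01-02",)], ["load_date"])

    # Reduce the dataframe to the maximum of "load_date", aliased as "latest".
    latest_df = df.transform(Aggregators.get_max_value(input_col="load_date"))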
\n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "Column creators transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "Class containing all functions that can create columns to add value.
\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "Create a sequential but not consecutive id.
\n\nArgs:\n output_col: optional name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "Create a sequential and consecutive id.
\n\nArgs:\n output_col: optional name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "Create columns given a map of column names and literal values (constants).
\n\nArgs:\n literals: map of column names and literal values (constants).
\n\nReturns:\n Callable: A function to be executed in the .transform() spark function.
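A minimal usage sketch; the dataframe, column names and constant values are illustrative assumptions:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_creators import ColumnCreators

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1,), (2,)], ["id"])

    # Add constant columns to every row.
    df_with_constants = df.transform(
        ColumnCreators.with_literals({"source_system": "erp", "is_active": True})
    )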
\n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "Module with column reshaping transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "Class containing column reshaping transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "Cast specific columns into the designated type.
\n\nArgs:\n cols: dict with columns and respective target types.\n Target types need to have the exact name of spark types:\n https://spark.apache.org/docs/latest/sql-ref-datatypes.html
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "Select specific columns with specific output aliases.
\n\nArgs:\n cols: dict with columns to select and respective aliases.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "Flatten the schema of the dataframe.
\n\nArgs:\n max_level: level until which you want to flatten the schema.\n Default: None.\n shorten_names: whether to shorten the names of the prefixes\n of the fields being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened\n or not. Default: True.\n num_chars: number of characters to consider when shortening\n the names of the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten.\n Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "Explode columns with types like ArrayType and MapType.
\n\nAfterwards, the flatten_schema transformation can be applied,\nif we desire, for example, to explode the map (as we explode a StructType)\nor to explode a StructType inside the array.\nWe recommend always specifying the columns you want to explode,\ninstead of exploding all columns.
\n\nArgs:\n explode_arrays: whether you want to explode array columns (True)\n or not (False). Default: False.\n array_cols_to_explode: array columns which you want to explode.\n If you don't specify it will get all array columns and explode them.\n Default: None.\n explode_maps: whether you want to explode map columns (True)\n or not (False). Default: False.\n map_cols_to_explode: map columns which you want to explode.\n If you don't specify it will get all map columns and explode them.\n Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
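A minimal usage sketch that explodes only one illustrative array column, as recommended above; the dataframe and column names are assumptions:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, ["a", "b"])], ["order_id", "items"])

    # Explode only the "items" array column instead of all array columns.
    exploded_df = df.transform(
        ColumnReshapers.explode_columns(explode_arrays=True, array_cols_to_explode=["items"])
    )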
\n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "Execute Spark SQL expressions to create the specified columns.
\n\nThis function uses the Spark expr function:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/\npyspark.sql.functions.expr.html
\n\nArgs:\n cols_and_exprs: dict with columns and respective expressions to compute\n (Spark SQL expressions).
\n\nReturns:\n A function to be called in .transform() spark function.
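A minimal usage sketch; the dataframe, target column names and Spark SQL expressions are illustrative assumptions:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(2, 9.99)], ["quantity", "price"])

    # Create new columns from Spark SQL expressions.
    enriched_df = df.transform(
        ColumnReshapers.with_expressions(
            {"total_amount": "quantity * price", "load_date": "current_date()"}
        )
    )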
\n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "Rename specific columns into the designated name.
\n\nArgs:\n cols: dict with columns and respective target names.\n escape_col_names: whether to escape column names (e.g., /BIC/COL1) or not.\n If True, it creates a column with the new name and drops the old one.\n If False, it uses the native withColumnRenamed Spark function. Default: True.\n\nReturns:\n Function to be called in .transform() spark function.
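A minimal usage sketch renaming a SAP-style escaped column; the dataframe and names are illustrative assumptions:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("A",)], ["/BIC/COL1"])

    # Rename the escaped column to a friendlier name (escape_col_names defaults to True).
    renamed_df = df.transform(ColumnReshapers.rename({"/BIC/COL1": "col1"}))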
\n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "Select all attributes from avro.
\n\nArgs:\n schema: the schema string.\n key_col: the name of the key column.\n value_col: the name of the value column.\n options: extra options (e.g., mode: \"PERMISSIVE\").\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.
\n\nReturns:\n Function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "Select all attributes from avro using a schema registry.
\n\nArgs:\n schema_registry: the url to the schema registry.\n value_schema: the name of the value schema entry in the schema registry.\n value_col: the name of the value column.\n key_schema: the name of the key schema entry in the schema\n registry. Default: None.\n key_col: the name of the key column.\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.
\n\nReturns:\n Function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "Convert a json string into a json column (struct).
\n\nThe new json column can be added to the existing columns (default) or it can\nreplace all the others, being the only one to output. The new column gets the\nsame name as the original one suffixed with '_json'.
\n\nArgs:\n input_col: name of the input column containing the json string.\n schema_path: path to the StructType schema (spark schema).\n schema: dict with the StructType schema (spark schema).\n json_options: options to parse the json value.\n drop_all_cols: whether to drop all the input columns or not.\n Defaults to False.
\n\nReturns:\n A function to be called in .transform() spark function.
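A minimal usage sketch, assuming the schema dict follows the StructType json form (as produced by StructType.jsonValue()); the dataframe and column names are illustrative:

    from pyspark.sql import SparkSession
    from pyspark.sql.types import StringType, StructField, StructType
    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([('{"id": "1"}',)], ["payload"])

    # Parse the json string in "payload" into a new "payload_json" struct column,
    # per the naming behaviour described above.
    schema_dict = StructType([StructField("id", StringType(), True)]).jsonValue()
    parsed_df = df.transform(ColumnReshapers.from_json(input_col="payload", schema=schema_dict))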
\n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "Convert dataframe columns into a json value.
\n\nArgs:\n in_cols: name(s) of the input column(s).\n Example values:\n \"*\" - all columns;\n \"my_col\" - one column named \"my_col\";\n \"my_col1, my_col2\" - two columns.\n out_col: name of the output column.\n json_options: options to parse the json value.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "Condensers module.
\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "Class containing all the functions to condensate data for later merges.
\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "Condense Change Data Capture (CDC) based on record_mode strategy.
\n\nThis CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.
\n\nArgs:\n business_key: The business key (logical primary key) of the data.\n ranking_key_desc: In this type of CDC condensation the data needs to be\n ordered descendingly in a certain way, using columns specified in this\n parameter.\n ranking_key_asc: In this type of CDC condensation the data needs to be\n ordered ascendingly in a certain way, using columns specified in\n this parameter.\n record_mode_col: Name of the record mode input_col.\n valid_record_modes: Depending on the context, not all record modes may be\n considered for condensation. Use this parameter to skip those.
\n\nReturns:\n A function to be executed in the .transform() spark function.
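A minimal usage sketch for a SAP-style CDC feed; the dataframe, business key, record modes and ranking column mirror a typical extractor layout and are assumptions:

    from pyspark.sql import SparkSession
    from lakehouse_engine.transformers.condensers import Condensers

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("o1", "", "2024-01-01 10:00:00"), ("o1", "N", "2024-01-01 11:00:00")],
        ["order_id", "recordmode", "actrequest_timestamp"],
    )

    # Keep only the latest valid record per business key.
    condensed_df = df.transform(
        Condensers.condense_record_mode_cdc(
            business_key=["order_id"],
            record_mode_col="recordmode",
            valid_record_modes=["", "N", "R", "D", "X"],
            ranking_key_desc=["actrequest_timestamp"],
        )
    )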
\n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "Condense data based on a simple group by + take latest mechanism.
\n\nArgs:\n group_key: list of column names to use in the group by.\n ranking_key: the data needs to be ordered descendingly using columns\n specified in this parameter.\n descending: if the ranking considers descending order or not. Defaults to\n True.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "Custom transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "Class representing a CustomTransformers.
\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "Execute a custom transformation provided by the user.
\n\nThis transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.
\n\nAttention!!! Please bare in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame, because it is\nhow Spark's .transform method is able to chain the transformations.\nExample:\n def my_custom_logic(df: DataFrame) -> DataFrame:
\n\nArgs:\n custom_transformer: custom transformer function. A python function with all\n required pyspark logic provided by the user.
\n\nReturns:\n Callable: the same function provided as parameter, in order to e called\n later in the TransformerFactory.
\n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "Module with data masking transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "Class containing data masking transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "Mask specific columns using an hashing approach.
\n\nArgs:\n cols: list of column names to mask.\n approach: hashing approach. Defaults to 'SHA'. There's \"MURMUR3\" as well.\n num_bits: number of bits of the SHA approach. Only applies to SHA approach.\n suffix: suffix to apply to new column name. Defaults to \"_hash\".\n Note: you can pass an empty suffix to have the original column replaced.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "Drop specific columns.
\n\nArgs:\n cols: list of column names to drop.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "Module containing date transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "Class with set of transformers to transform dates in several forms.
\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "Add column with current date.
\n\nThe current date comes from the driver as a constant, not from every executor.
\n\nArgs:\n output_col: name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "Convert multiple string columns with a source format into dates.
\n\nArgs:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., YYYY-MM-dd). Check here:\n https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "Convert multiple string columns with a source format into timestamps.
\n\nArgs:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS). Check\n here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "Convert multiple date/timestamp columns into strings with the target format.
\n\nArgs:\n cols: list of names of the string columns to convert.\n target_format: strings target format (e.g., YYYY-MM-dd). Check here:\n https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "Create day/month/week/quarter/year hierarchy for the provided date columns.
\n\nUses Spark's extract function.
\n\nArgs:\n cols: list of names of the date columns to create the hierarchy.\n formats: dict with the correspondence between the hierarchy and the format\n to apply.\n Example: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n }\n Check here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "Module for all the transformers exceptions.
\n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "Exception for when a user provides wrong arguments to a transformer.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "Exception for when a user requests a transformer not supported in streaming.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "Module containing the filters transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "Class containing the filters transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "Incrementally Filter a certain dataframe given an increment logic.
\n\nThis logic can either be an increment value or an increment dataframe from which\nthe get the latest value from. By default the operator for the filtering process\nis greater or equal to cover cases where we receive late arriving data not cover\nin a previous load. You can change greater_or_equal to false to use greater,\nwhen you trust the source will never output more data with the increment after\nyou have load the data (e.g., you will never load data until the source is still\ndumping data, which may cause you to get an incomplete picture of the last\narrived data).
\n\nArgs:\n input_col: input column name\n increment_value: value to which to filter the data, considering the\n provided input_Col.\n increment_df: a dataframe to get the increment value from.\n you either specify this or the increment_value (this takes precedence).\n This is a good approach to get the latest value from a given dataframe\n that was read and apply that value as filter here. In this way you can\n perform incremental loads based on the last value of a given dataframe\n (e.g., table or file based). Can be used together with the\n get_max_value transformer to accomplish these incremental based loads.\n See our append load feature tests to see how to provide an acon for\n incremental loads, taking advantage of the scenario explained here.\n increment_col: name of the column from which to get the increment\n value from from (when using increment_df approach). This assumes there's\n only one row in the increment_df, reason why is a good idea to use\n together with the get_max_value transformer. Defaults to \"latest\"\n because that's the default output column name provided by the\n get_max_value transformer.\n greater_or_equal: if filtering should be done by also including the\n increment value or not (useful for scenarios where you are performing\n increment loads but still want to include data considering the increment\n value, and not only values greater than that increment... examples may\n include scenarios where you already loaded data including those values,\n but the source produced more data containing those values).\n Defaults to false.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "Filter a dataframe based on an expression.
\n\nArgs:\n exp: filter expression.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "Filter a dataframe's columns based on a list of SQL expressions.
\n\nArgs:\n exp: column filter expressions.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "Drop duplicate rows using spark function dropDuplicates().
\n\nThis transformer can be used with or without arguments.\nThe provided argument needs to be a list of columns.\nFor example: [\u201cName\u201d,\u201dVAT\u201d] will drop duplicate records within\n\"Name\" and \"VAT\" columns.\nIf the transformer is used without providing any columns list or providing\nan empty list, such as [] the result will be the same as using\nthe distinct() pyspark function. If the watermark dict is present it will\nensure that the drop operation will apply to rows within the watermark timeline\nwindow.
\n\nArgs:\n cols: column names.\n watermarker: properties to apply watermarker to the transformer.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "Module with join transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "Class containing join transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "Join two dataframes based on specified type and columns.
\n\nSome stream to stream joins are only possible if you apply Watermark, so this\nmethod also provides a parameter to enable watermarking specification.
\n\nArgs:\n left_df_alias: alias of the first dataframe.\n join_with: right dataframe.\n right_df_alias: alias of the second dataframe.\n join_condition: condition to join dataframes.\n join_type: type of join. Defaults to inner.\n Available values: inner, cross, outer, full, full outer,\n left, left outer, right, right outer, semi,\n left semi, anti, and left anti.\n broadcast_join: whether to perform a broadcast join or not.\n select_cols: list of columns to select at the end.\n watermarker: properties to apply watermarking.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "Module with null handlers transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "Class containing null handler transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "Replace nulls in a dataframe.
\n\nArgs:\n replace_on_nums: if it is to replace nulls on numeric columns.\n Applies to ints, longs and floats.\n default_num_value: default integer value to use as replacement.\n replace_on_strings: if it is to replace nulls on string columns.\n default_string_value: default string value to use as replacement.\n subset_cols: list of columns in which to replace nulls. If not\n provided, all nulls in all columns will be replaced as specified.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "Optimizers module.
\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "Class containing all the functions that can provide optimizations.
\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "Caches the current dataframe.
\n\nThe default storage level used is MEMORY_AND_DISK.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "Caches the current dataframe with a specific StorageLevel.
\n\nArgs:\n storage_level: the type of StorageLevel, as default MEMORY_AND_DISK_DESER.\n More options here: https://spark.apache.org/docs/latest/api/python/\n reference/api/pyspark.StorageLevel.html
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "Removes the dataframe from the disk and memory.
\n\nArgs:\n blocking: whether to block until all the data blocks are\n removed from disk/memory or run asynchronously.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "Regex transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "Class containing all regex functions.
\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "Get the result of applying a regex to an input column (via regexp_extract).
\n\nArgs:\n input_col: name of the input column.\n output_col: name of the output column.\n regex: regular expression.\n drop_input_col: whether to drop input_col or not.\n idx: index to return.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "Module with repartitioners transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "Class containing repartitioners transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "Coalesce a dataframe into n partitions.
\n\nArgs:\n num_partitions: num of partitions to coalesce.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "Repartition a dataframe into n partitions.
\n\nIf num_partitions is provided repartitioning happens based on the provided\nnumber, otherwise it happens based on the values of the provided cols (columns).
\n\nArgs:\n num_partitions: num of partitions to repartition.\n cols: list of columns to use for repartitioning.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "Module with the factory pattern to return transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "TransformerFactory class following the factory pattern.
\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "Get a transformer following the factory pattern.
\n\nArgs:\n spec: transformer specification (individual transformation... not to be\n confused with list of all transformations).\n data: ordered dict of dataframes to be transformed. Needed when a\n transformer requires more than one dataframe as input.
\n\nReturns:\n Transformer function to be executed in .transform() spark function.
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "Module with union transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "Class containing union transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "Union dataframes, resolving columns by position (not by name).
\n\nArgs:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "Union dataframes, resolving columns by name (not by position).
\n\nArgs:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.\n allow_missing_columns: allow the union of DataFrames with different\n schemas.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "Watermarker module.
\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "Class containing all watermarker transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "Get the dataframe with watermarker defined.
\n\nArgs:\n watermarker_column: name of the input column to be considered for\n the watermarking. Note: it must be a timestamp.\n watermarker_time: time window to define the watermark value.
\n\nReturns:\n A function to be executed on other transformers.
\n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "Utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "Config utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "Module to read configurations.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "Config utilities class.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "Get acon based on a filesystem path or on a dict.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n\nReturns:\n Dict representation of an acon.
\n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "Get Lakehouse Engine configurations.
\n\nReturns:\n A dictionary with the engine configurations.
\n", "signature": "() -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "Read an acon (algorithm configuration) file.
\n\nArgs:\n path: path to the acon file.
\n\nReturns:\n The acon file content as a dict.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "Read a DDL file in Spark SQL format from a cloud object storage system.
\n\nArgs:\n path: path to the acon (algorithm configuration) file.
\n\nReturns:\n Content of the SQL file.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "Utilities for databricks operations.
\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "Databricks utilities class.
\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "Get db utils on databricks.
\n\nArgs:\n spark: spark session.
\n\nReturns:\n Dbutils from databricks.
\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "Get notebook context from running acon.
\n\nReturns:\n Dict containing databricks notebook context.
\n", "signature": "() -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "Utilities to be used by custom expectations.
\n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "Validates the test results of the custom expectations.
\n\nIf you need to make additional validations on your custom expectation\nand/or require additional fields to be returned you can add them before\ncalling this function. The partial_success and partial_result\noptional parameters can be used to pass the result of additional\nvalidations and add more information to the result key of the\nreturned dict respectively.
\n\nArgs:\n expectation: Expectation to validate.\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n partial_success: Result of validations done before calling this method.\n partial_result: Extra fields to be returned to the user.
\n\nReturns:\n The result of the validation.
\n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "Extraction utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "Utilities module for JDBC extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "Standardize the types of extractions we can have from a JDBC source.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "\n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "\n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "Configurations available for an Extraction from a JDBC source.
\n\nThese configurations cover:\n user: username to connect to JDBC source.\n password: password to connect to JDBC source (always use secrets,\n don't use text passwords in your code).\n url: url to connect to JDBC source.\n dbtable: database.table to extract data from.\n calc_upper_bound_schema: custom schema used for the upper bound calculation.\n changelog_table: table of type changelog from which to extract data,\n when the extraction type is delta.\n partition_column: column used to split the extraction.\n latest_timestamp_data_location: data location (e.g., s3) containing the data\n to get the latest timestamp already loaded into bronze.\n latest_timestamp_data_format: the format of the dataset in\n latest_timestamp_data_location. Default: delta.\n extraction_type: type of extraction (delta or init). Default: \"delta\".\n driver: JDBC driver name. Default: \"com.sap.db.jdbc.Driver\".\n num_partitions: number of Spark partitions to split the extraction.\n lower_bound: lower bound to decide the partition stride.\n upper_bound: upper bound to decide the partition stride. If\n calculate_upper_bound is True, then upperBound will be\n derived by our upper bound optimizer, using the partition column.\n default_upper_bound: the value to use as default upper bound in case\n the result of the upper bound calculation is None. Default: \"1\".\n fetch_size: how many rows to fetch per round trip. Default: \"100000\".\n compress: enable network compression. Default: True.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.\n min_timestamp: min timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the location provided.\n In case this one is provided it has precedence and the calculation\n is not done.\n max_timestamp: max timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the table having information\n about the extraction requests, their timestamps and their status.\n In case this one is provided it has precedence and the calculation\n is not done.\n generate_predicates: whether to generate predicates automatically or not.\n Default: False.\n predicates: list containing all values to partition (if generate_predicates\n is used, the manual values provided are ignored). Default: None.\n predicates_add_null: whether to consider null on predicates list.\n Default: True.\n extraction_timestamp: the timestamp of the extraction. Default: current time\n following the format \"%Y%m%d%H%M%S\".\n max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\n from the table holding the extraction requests information.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant JDBC sources.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "Construct JDBCExtractionUtils.
\n\nArgs:\n jdbc_extraction: JDBC Extraction configurations. Can be of type:\n JDBCExtraction, SAPB4Extraction or SAPBWExtraction.
\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "Helper to get additional Spark Options initially passed.
\n\nIf people provide additional Spark options, not covered by the util function\narguments (get_spark_jdbc_options), we need to consider them.\nThus, we update the options retrieved by the utils, by checking if there is\nany Spark option initially provided that is not yet considered in the retrieved\noptions or function arguments and if the value for the key is not None.\nIf these conditions are filled, we add the options and return the complete dict.
\n\nArgs:\n input_spec: the input specification.\n options: dict with Spark options.\n ignore_options: list of options to be ignored by the process.\n Spark read has two different approaches to parallelize\n reading process, one of them is using upper/lower bound,\n another one is using predicates, those process can't be\n executed at the same time, you must choose one of them.\n By choosing predicates you can't pass lower and upper bound,\n also can't pass number of partitions and partition column\n otherwise spark will interpret the execution partitioned by\n upper and lower bound and will expect to fill all variables.\n To avoid fill all predicates hardcoded at the acon, there is\n a feature that automatically generates all predicates for init\n or delta load based on input partition column, but at the end\n of the process, partition column can't be passed to the options,\n because we are choosing predicates execution, that is why to\n generate predicates we need to pass some options to ignore.
\n\nReturns:\n a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).
\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "Get the predicates list, based on a predicates query.
\n\nArgs:\n predicates_query: query to use as the basis to get the distinct values for\n a specified column, based on which predicates are generated.
\n\nReturns:\n List containing the predicates to use to split the extraction from\n JDBC sources.
\n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "Get the Spark options to extract data from a JDBC source.
\n\nReturns:\n The Spark jdbc args dictionary, including the query to submit\n and also options args dictionary.
\n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "Get an optimal upperBound to properly split a Spark JDBC extraction.
\n\nReturns:\n Either an int, date or timestamp to serve as upperBound Spark JDBC option.
\n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "Utilities module for SAP B4 extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "Standardise the types of ADSOs we can have for Extractions from SAP B4.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "\n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "\n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "\n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "Configurations available for an Extraction from SAP B4.
\n\nIt inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\nThese configurations cover:\n latest_timestamp_input_col: the column containing the request timestamps\n in the dataset in latest_timestamp_data_location. Default: REQTSN.\n request_status_tbl: the name of the SAP B4 table having information\n about the extraction requests. Composed of database.table.\n Default: SAPHANADB.RSPMREQUEST.\n request_col_name: name of the column having the request timestamp to join\n with the request status table. Default: REQUEST_TSN.\n data_target: the data target to extract from. User in the join operation with\n the request status table.\n act_req_join_condition: the join condition into activation table\n can be changed using this property.\n Default: 'tbl.reqtsn = req.request_col_name'.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog) table or not.\n extra_cols_req_status_tbl: columns to be added from request status table.\n It needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\n req.col2 as column_two\".\n request_status_tbl_filter: filter to use for filtering the request status table,\n influencing the calculation of the max timestamps and the delta extractions.\n adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from SAP B4.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "Construct SAPB4ExtractionUtils.
\n\nArgs:\n sap_b4_extraction: SAP B4 Extraction configurations.
\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "Get the data_target from the data_target option or derive it.
\n\nBy definition data_target is the same for the table and changelog table and\nis the same string ignoring everything before / and the first and last\ncharacter after /. E.g. for a dbtable /BIC/abtable12, the data_target\nwould be btable1.
\n\nArgs:\n input_spec_opt: options from the input_spec.
\n\nReturns:\n A string with the data_target.
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "Utilities module for SAP BW extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "Configurations available for an Extraction from SAP BW.
\n\nIt inherits from SAPBWExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\nThese configurations cover:\n latest_timestamp_input_col: the column containing the actrequest timestamp\n in the dataset in latest_timestamp_data_location. Default:\n \"actrequest_timestamp\".\n act_request_table: the name of the SAP BW activation requests table.\n Composed of database.table. Default: SAPPHA.RSODSACTREQ.\n request_col_name: name of the column having the request to join\n with the activation request table. Default: actrequest.\n act_req_join_condition: the join condition into activation table\n can be changed using this property.\n Default: 'changelog_tbl.request = act_req.request_col_name'.\n odsobject: name of BW Object, used for joining with the activation request\n table to get the max actrequest_timestamp to consider while filtering\n the changelog table.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog) table or not. Default: True.\n extra_cols_act_request: list of columns to be added from act request table.\n It needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\n as column_one, act_req.col2 as column_two\".\n get_timestamp_from_act_request: whether to get init timestamp\n from act request table or assume current/given timestamp.\n sap_bw_schema: sap bw schema. Default: SAPPHA.\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant JDBC sources.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "Construct SAPBWExtractionUtils.
\n\nArgs:\n sap_bw_extraction: SAP BW Extraction configurations.
\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "Get the changelog table, given an odsobject.
\n\nReturns:\n String to use as changelog_table.
\n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "Get the odsobject based on the provided options.
\n\nWith the table name we may also get the db name, so we need to split.\nMoreover, there might be the need for people to specify odsobject if\nit is different from the dbtable.
\n\nArgs:\n input_spec_opt: options from the input_spec.
\n\nReturns:\n A string with the odsobject.
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "Utilities module for SFTP extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "Formats of algorithm input.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "Standardize the types of filters we can have from a SFTP source.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant SFTP sources.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "Get a list of files to be extracted from SFTP.
\n\nThe arguments (options_args) to list files are:\ndate_time_gt(str):\n    Filter the files greater than the string datetime\n    formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\ndate_time_lt(str):\n    Filter the files lower than the string datetime\n    formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\nearliest_file(bool):\n    Filter the earliest dated file in the directory.\nfile_name_contains(str):\n    Filter files that match the pattern.\nlatest_file(bool):\n    Filter the most recent dated file in the directory.\nsub_dir(bool):\n    When true, the engine will search for files in subdirectories\n    of the remote_path.\n    It will consider one level below the remote_path.\n    When sub_dir is used with the latest_file/earliest_file argument,\n    the engine will retrieve the latest_file/earliest_file\n    for each subdirectory.
\n\nArgs:\n sftp: the SFTP client object.\n remote_path: path of files to be filtered.\n options_args: options from the acon.
\n\nReturns:\n A list containing the file names to be passed to Spark.
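For illustration only, a minimal sketch of an options_args dict combining the filters described above; the directory, file pattern and date values are hypothetical, not engine defaults.

# Hypothetical filter options for SFTPExtractionUtils.get_files_list;
# keys mirror the documented arguments, values are illustrative only.
options_args = {
    "date_time_gt": "2023-01-01",
    "date_time_lt": "2023-06-30 23:59:59",
    "file_name_contains": "sales_",
    "sub_dir": True,
}
# files = SFTPExtractionUtils.get_files_list(sftp, "/inbound/sales/", options_args)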
\n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "Get the SFTP client.
\n\nThe SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.
\n\nArgs:\n options_args: dictionary containing SFTP connection parameters.\n The Paramiko arguments expected to connect are:\n \"hostname\": the server to connect to.\n \"port\": the server port to connect to.\n \"username\": the username to authenticate as.\n \"password\": used for password authentication.\n \"pkey\": optional - an optional public key to use for authentication.\n \"passphrase\" \u2013 optional - options used for decrypting private keys.\n \"key_filename\" \u2013 optional - the filename, or list of filenames,\n of optional private key(s) and/or certs to try for authentication.\n \"timeout\" \u2013 an optional timeout (in seconds) for the TCP connect.\n \"allow_agent\" \u2013 optional - set to False to disable\n connecting to the SSH agent.\n \"look_for_keys\" \u2013 optional - set to False to disable searching\n for discoverable private key files in ~/.ssh/.\n \"compress\" \u2013 optional - set to True to turn on compression.\n \"sock\" - optional - an open socket or socket-like object\n to use for communication to the target host.\n \"gss_auth\" \u2013 optional - True if you want to use GSS-API authentication.\n \"gss_kex\" \u2013 optional - Perform GSS-API Key Exchange and\n user authentication.\n \"gss_deleg_creds\" \u2013 optional - Delegate GSS-API client\n credentials or not.\n \"gss_host\" \u2013 optional - The targets name in the kerberos database.\n \"gss_trust_dns\" \u2013 optional - Indicates whether or\n not the DNS is trusted to securely canonicalize the name of the\n host being connected to (default True).\n \"banner_timeout\" \u2013 an optional timeout (in seconds)\n to wait for the SSH banner to be presented.\n \"auth_timeout\" \u2013 an optional timeout (in seconds)\n to wait for an authentication response.\n \"disabled_algorithms\" \u2013 an optional dict passed directly to Transport\n and its keyword argument of the same name.\n \"transport_factory\" \u2013 an optional callable which is handed a subset of\n the constructor arguments (primarily those related to the socket,\n GSS functionality, and algorithm selection) and generates a\n Transport instance to be used by this client.\n Defaults to Transport.__init__.
\n\n\n\nThe parameter to specify the private key is expected to be in RSA format.\nAttempting a connection with a blank host key is not allowed\nunless the argument \"add_auto_policy\" is explicitly set to True.\n
Returns:\n sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.
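As a rough, standalone sketch of what such a session amounts to in plain paramiko (not the engine's internal code), assuming password authentication and an illustrative host:

import paramiko

# Illustrative connection values; replace with a real host and credentials.
transport = paramiko.Transport(("sftp.example.com", 22))
transport.connect(username="user", password="secret")
sftp = paramiko.SFTPClient.from_transport(transport)

print(sftp.listdir("."))  # remote file operations run over this SFTP session
sftp.close()
transport.close()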
\n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "Validate the file extension based on the format definitions.
\n\nArgs:\n files_format: a string containing the file extension.
\n\nReturns:\n The string validated and formatted.
\n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "Validate the location. Add \"/\" in the case it does not exist.
\n\nArgs:\n location: file path.
\n\nReturns:\n The location validated.
\n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "Utilities for file name based operations.
\n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "Function to retrieve list of file names in a folder.
\n\nThis function filters by file type and removes the file extension from the file\nnames it returns.
\n\nArgs:\n path: path to the folder to list files\n file_type: type of the file to include in list\n exclude_regex: regex of file names to exclude
\n\nReturns:\n A list of file names without file type.
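A minimal sketch of the equivalent behaviour using only the standard library, assuming exclude_regex is matched against the file name; this is illustrative, not the engine's implementation:

import os
import re

def list_names_without_extension(path: str, file_type: str, exclude_regex: str) -> list:
    # Keep files with the requested extension, drop excluded names, strip the extension.
    names = []
    for file_name in os.listdir(path):
        if file_name.endswith(f".{file_type}") and not re.search(exclude_regex, file_name):
            names.append(os.path.splitext(file_name)[0])
    return names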
\n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "Module to configure project logging.
\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "Logging filter to hide sensitive data from being shown in the logs.
\n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "Hide sensitive information from being shown in the logs.
\n\nBased on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).
\n\nArgs:\n record: the LogRecord event being logged.
\n\nReturns:\n The transformed record to be logged.
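A minimal sketch of this kind of logging.Filter, assuming a hypothetical password pattern (the engine's actual regex and replace strings are configured elsewhere):

import logging
import re

class RedactSecrets(logging.Filter):
    # Illustrative filter: mask anything that looks like password=... in log messages.
    _PATTERN = re.compile(r"password=\S+")

    def filter(self, record: logging.LogRecord) -> bool:
        record.msg = self._PATTERN.sub("password=****", str(record.msg))
        return True  # always allow the (now redacted) record to be logged

logger = logging.getLogger("demo")
logger.addFilter(RedactSecrets())
logger.warning("connecting with password=supersecret")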
\n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "Handle the logging of the lakehouse engine project.
\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "Construct a LoggingHandler instance.
\n\nArgs:\n class_name: name of the class to be indicated in the logs.
\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "Get the _logger instance variable.
\n\nReturns
\n\n\n\n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "the logger object.
\nUtilities to facilitate dataframe schema management.
\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "Schema utils that help retrieve and manage schemas of dataframes.
\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "Get a spark schema from a file (spark StructType json file) in a file system.
\n\nArgs:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema struct type.
\n", "signature": "(file_path: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "Get a dict with the spark schema from a file in a file system.
\n\nArgs:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema in a dict.
\n", "signature": "(file_path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "Get a spark schema from a dict.
\n\nArgs:\n struct_type: dict containing a spark schema structure. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema struct type.
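For reference, the dict shape expected here is Spark's StructType JSON representation; a small sketch using plain PySpark (StructType.fromJson), which is conceptually close to what this helper does:

from pyspark.sql.types import StructType

# Minimal StructType dict, in the same shape produced by df.schema.jsonValue().
schema_dict = {
    "type": "struct",
    "fields": [
        {"name": "id", "type": "long", "nullable": False, "metadata": {}},
        {"name": "name", "type": "string", "nullable": True, "metadata": {}},
    ],
}
schema = StructType.fromJson(schema_dict)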
\n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "Get a spark schema from a table.
\n\nArgs:\n table: table name from which to inherit the schema.
\n\nReturns:\n Spark schema struct type.
\n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "Get a spark schema from an input specification.
\n\nThis covers scenarios where the schema is provided as part of the input\nspecification of the algorithm. Schema can come from the table specified in the\ninput specification (enforce_schema_from_table) or by the dict with the spark\nschema provided there also.
\n\nArgs:\n input_spec: input specification.
\n\nReturns:\n spark schema struct type.
\n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "Recursive method to flatten the schema of the dataframe.
\n\nArgs:\n schema: schema to be flattened.\n prefix: prefix of the struct to get the value for. Only relevant\n for being used in the internal recursive logic.\n level: level of the depth in the schema being flattened. Only relevant\n for being used in the internal recursive logic.\n max_level: level until which you want to flatten the schema. Default: None.\n shorten_names: whether to shorten the names of the prefixes of the fields\n being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened or\n not. Default: True.\n num_chars: number of characters to consider when shortening the names of\n the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten. Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "Utilities to interact with storage systems.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "Module for abstract representation of a storage system holding files.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "Abstract file storage class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n File payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "Module for common file storage functions.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "Class for common file storage functions.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "Read a json file.
\n\nThe file should be in a supported file system (e.g., s3 or local filesystem -\nfor local tests only).
\n\nArgs:\n path: path to the json file.
\n\nReturns:\n Dict with json file content.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "Module to represent a local file storage system.
\n"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "Class to represent a local file storage system.
\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n file payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> <class 'TextIO'>:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "Module to represent a s3 file storage system.
\n"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "Class to represent a s3 file storage system.
\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "Get the payload of a config file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n File payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "lakehouse_engine", "modulename": "lakehouse_engine", "kind": "module", "doc": "Lakehouse engine package containing all the system subpackages.
\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "Package containing all the lakehouse engine algorithms.
\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "Module containing the Algorithm class.
\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "Class to define the behavior of every algorithm based on ACONs.
\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "Construct Algorithm instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "Get data quality specification object from acon.
\n\nArgs:\n spec: data quality specifications.
\n\nReturns:\n The DQSpec and the List of DQ Functions Specs.
\n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "Module to define DataLoader class.
\n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "Load data using an algorithm configuration (ACON represented as dict).
\n\nThis algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.
\n\nSince an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.
\n\nAs the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "Construct DataLoader algorithm instances.
\n\nA data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:
\n\n\n\n- input specifications (mandatory): specify how to read data.\n- transform specifications (optional): specify how to transform data.\n- data quality specifications (optional): specify how to execute the data\n quality process.\n- output specifications (mandatory): specify how to write data to the\n target.\n- terminate specifications (optional): specify what to do after writing into\n the target (e.g., optimizing target table, vacuum, compute stats, etc).\n
Args:\n acon: algorithm configuration.
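An illustrative sketch of the shape such an ACON can take for a simple read-and-write job; the key names and values below are assumptions for illustration and should be checked against the lakehouse-engine documentation for your version:

# Hypothetical ACON sketch; keys and values are illustrative only.
acon = {
    "input_specs": [
        {"spec_id": "sales_source", "read_type": "batch", "data_format": "csv",
         "location": "s3://my-bucket/inbound/sales/"}
    ],
    "output_specs": [
        {"spec_id": "sales_bronze", "input_id": "sales_source", "write_type": "append",
         "data_format": "delta", "location": "s3://my-bucket/bronze/sales/"}
    ],
}
# DataLoader(acon=acon).execute()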
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "Read data from an input location into a distributed dataframe.
\n\nReturns:\n An ordered dict with all the dataframes that were read.
\n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "Transform (optionally) the data that was read.
\n\nIf there isn't a transformation specification this step will be skipped, and the\noriginal dataframes that were read will be returned.\nTransformations can depend on the result of another transformation. However,\nkeep in mind that if we are using a streaming source and for some reason need to\nenable micro batch processing, that result cannot be used as input to another\ntransformation. Micro batch processing in pyspark streaming is only available in\n.write(), which means a transformation with micro batch needs to be the end of\nthe process.
\n\nArgs:\n data: input dataframes in an ordered dict.
\n\nReturns:\n Another ordered dict with the transformed dataframes, according to the\n transformation specification.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "Process the data quality tasks for the data that was read and/or transformed.
\n\nIt supports multiple input dataframes. Although just one is advisable.
\n\nIt is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\nArgs:\n    data: dataframes from previous steps of the algorithm that we wish to\n        run the DQ process on.
\n\nReturns:\n Another ordered dict with the validated dataframes.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "Write the data that was read and transformed (if applicable).
\n\nIt supports writing multiple datasets. However, we recommend writing only one\ndataframe. This recommendation is based on easy debugging and reproducibility,\nsince if we start mixing several datasets being fueled by the same algorithm, it\nwould unleash an infinite sea of reproducibility issues plus tight coupling and\ndependencies between datasets. Having said that, there may be cases where\nwriting multiple datasets is desirable according to the use case requirements.\nUse it accordingly.
\n\nArgs:\n data: dataframes that were read and transformed (if applicable).
\n\nReturns:\n Dataframes that were written.
\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "Terminate the algorithm.
\n\nArgs:\n data: dataframes that were written.
\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "Define the algorithm execution behaviour.
\n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "Module to define Data Validator class.
\n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "Validate data using an algorithm configuration (ACON represented as dict).
\n\nThis algorithm focuses on isolate Data Quality Validations from loading,\napplying a set of data quality functions to a specific input dataset,\nwithout the need to define any output specification.\nYou can use any input specification compatible with the lakehouse engine\n(dataframe, table, files, etc).
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "Construct DQValidator algorithm instances.
\n\nA data quality validator needs the following specifications to work\nproperly:\n - input specification (mandatory): specify how and what data to\n read.\n - data quality specification (mandatory): specify how to execute\n the data quality process.\n - restore_prev_version (optional): specify if, having\n delta table/files as input, they should be restored to the\n previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.
\n\nArgs:\n acon: algorithm configuration.
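Again purely as a hypothetical sketch (field names unverified), a validator ACON pairs an input specification with a data quality specification:

# Hypothetical DQValidator ACON sketch; keys are illustrative, not authoritative.
acon = {
    "input_spec": {"spec_id": "orders", "read_type": "batch",
                   "data_format": "delta", "location": "s3://my-bucket/silver/orders/"},
    "dq_spec": {"spec_id": "orders_dq", "input_id": "orders",
                "dq_functions": [{"function": "expect_column_values_to_not_be_null",
                                  "args": {"column": "order_id"}}]},
    "restore_prev_version": True,
}
# DQValidator(acon=acon).execute()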
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "Read data from an input location into a distributed dataframe.
\n\nReturns:\n Dataframe with data that was read.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "Process the data quality tasks for the data that was read.
\n\nIt supports a single input dataframe.
\n\nIt is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.
\n\nArgs:\n data: input dataframe on which to run the DQ process.
\n\nReturns:\n Validated dataframe.
\n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "Define the algorithm execution behaviour.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "Package defining all the algorithm custom exceptions.
\n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "Exception for when the reconciliation process fails.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "Exception for when no new data is available.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "Exception for when a sensor with same sensor id already exists.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "Exception for when the restore type is not found.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "Module containing the Reconciliator class.
\n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "Type of Reconciliation.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "\n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "\n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "Transformers Available for the Reconciliation Algorithm.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "\n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "Class to define the behavior of an algorithm that checks if data reconciles.
\n\nChecking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.
\n\nThe reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).
\n\nAll of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.
\n\nNotes:\n - It is crucial that both the current and truth datasets have exactly the same\n structure.\n - You should not use 0 as yellow or red threshold, as the algorithm will verify\n if the difference between the truth and current values is bigger\n or equal than those thresholds.\n - The reconciliation does not produce any negative values or percentages, as we\n use the absolute value of the differences. This means that the recon result\n will not indicate if it was the current values that were bigger or smaller\n than the truth values, or vice versa.
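To make the threshold logic concrete, a standalone PySpark sketch (not the engine's implementation) of an absolute percentage difference on a single metric, joined on the non-metric column:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").appName("recon-sketch").getOrCreate()
truth = spark.createDataFrame([("2023-01", 100.0)], ["month", "amount"])
current = spark.createDataFrame([("2023-01", 97.0)], ["month", "amount"])

# Join on the non-metric column, compute |current - truth| / truth, compare to a threshold.
diff = (
    truth.alias("t")
    .join(current.alias("c"), on="month")
    .select("month", (F.abs(F.col("c.amount") - F.col("t.amount")) / F.col("t.amount")).alias("pct_diff"))
)
red_threshold = 0.05  # 5%; recall the note above advising against 0 thresholds
diff.withColumn("reconciles", F.col("pct_diff") < red_threshold).show()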
\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "Construct Algorithm instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "Get the source of truth (expected result) for the reconciliation process.
\n\nReturns:\n DataFrame containing the source of truth.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "Get the current results from the table that we are checking if it reconciles.
\n\nReturns:\n DataFrame containing the current results.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "Reconcile the current results against the truth dataset.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "Module to define Sensor algorithm behavior.
\n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "Class representing a sensor to check if the upstream has new data.
\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "Construct Sensor instances.
\n\nArgs:\n acon: algorithm configuration.
\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "Execute the sensor.
\n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "This module receives a config file which is included in the wheel.
\n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "Package with the core behaviour of the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "Definitions of standard values and structures for core components.
\n"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "Formats of algorithm input.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "\n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "Generates a list containing all enum values.
\n\nReturn:\n A list with all enum values.
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "Checks if the input format exists in the enum values.
\n\nArgs:\n input_format: format to check if exists.
\n\nReturn:\n If the input format exists in our enum.
\n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "Formats of algorithm output.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "\n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "Type of notifier available.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "\n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationEmailServers", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationEmailServers", "kind": "class", "doc": "Types of email server with special behaviour.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "Parameters to be replaced in runtime.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "Define the types of read operations.
\n\nBATCH - read the data in batch mode (e.g., Spark batch).\nSTREAMING - read the data in streaming mode (e.g., Spark streaming).
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "\n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "\n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "Different modes that control how we handle compliance to the provided schema.
\n\nThese read modes map to Spark's read modes at the moment.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "\n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "Defaults used on the data quality process.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": 
"<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "\n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "Types of write operations.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "\n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "\n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "\n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "\n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "Specification of an algorithm input.
\n\nThis is closely aligned with the way the execution environment connects to the sources\n(e.g., spark sources).
\n\nspec_id: spec_id of the input specification.\nread_type: ReadType type of read operation.\ndata_format: format of the input.\nsftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\n    directory.\ndf_name: dataframe name.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies from where to read data in the specified format.\nenforce_schema_from_table: whether to enforce the table schema or not, by\n    providing a table name in the form of db.table.\nquery: sql query to execute and return the dataframe. Use it if you do not want to\n    read from a file system nor from a table, but rather from a sql query instead.\nschema: dict representation of a schema of the input (e.g., Spark struct type\n    schema).\nschema_path: path to a file with a representation of a schema of the input (e.g.,\n    Spark struct type schema).\nwith_filepath: whether to include the path of the file that is being read. Only\n    works with the file reader (batch and streaming modes are supported).\noptions: dict with other relevant options according to the execution\n    environment (e.g., spark) possible sources.\ncalculate_upper_bound: whether to calculate an upper bound to extract from SAP BW or not.\ncalc_upper_bound_schema: specific schema for the calculated upper_bound.\ngenerate_predicates: whether to generate predicates to extract from SAP BW or not.\npredicates_add_null: whether to include 'is null' on partition by predicates.\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "
Transformer Specification, i.e., a single transformation amongst many.
\n\nfunction: name of the function (or callable function) to be executed.\nargs: (not applicable if using a callable function) dict with the arguments to pass\n    to the function, as pairs with the name of the parameter of the function and the\n    respective value.
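\n\nExample (illustrative sketch of how this spec could be instantiated; the transformer name and its args below are assumptions, not taken from the engine's transformer catalogue):\n\n    from lakehouse_engine.core.definitions import TransformerSpec\n\n    # hypothetical transformer: repartition the dataframe before writing\n    repartition_step = TransformerSpec(\n        function='repartition',\n        args={'num_partitions': 10},\n    )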
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "\n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "pairs with the name of the parameter of the function and the\nrespective value. Transformation Specification.
\n\nI.e., the specification that defines the many transformations to be done to the data\nthat was read.
\n\nspec_id: id of the transform specification.\ninput_id: id of the corresponding input specification.\ntransformers: list of transformers to execute.\nforce_streaming_foreach_batch_processing: sometimes, when using streaming, we want\n    to force the transform to be executed in the foreachBatch function to ensure\n    non-supported streaming operations can be properly executed.
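\n\nExample (illustrative sketch; spec ids and the transformer shown are hypothetical):\n\n    from lakehouse_engine.core.definitions import TransformerSpec, TransformSpec\n\n    # hypothetical transform: apply a single repartition step to the raw input\n    transform_spec = TransformSpec(\n        spec_id='transformed_data',\n        input_id='raw_data',\n        transformers=[TransformerSpec(function='repartition', args={'num_partitions': 10})],\n    )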
\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "Available data quality tasks.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "\n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQType.ASSISTANT", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.ASSISTANT", "kind": "variable", "doc": "\n", "default_value": "<DQType.ASSISTANT: 'assistant'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "Defines a data quality function specification.
\n\nfunction - name of the data quality function (expectation) to execute.\nIt follows the great_expectations api https://greatexpectations.io/expectations/.\nargs - args of the function (expectation). Follow the same api as above.
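\n\nExample (illustrative sketch; the expectation name follows the great_expectations API and the column name is hypothetical):\n\n    from lakehouse_engine.core.definitions import DQFunctionSpec\n\n    # hypothetical check: the 'id' column must never be null\n    not_null_id = DQFunctionSpec(\n        function='expect_column_values_to_not_be_null',\n        args={'column': 'id'},\n    )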
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "\n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "Data quality overall specification.
\n\n\n\nspec_id - id of the specification.\ninput_id - id of the input specification.\ndq_type - type of DQ process to execute (e.g. validator).\ndq_functions - list of function specifications to execute.\nunexpected_rows_pk - the list of columns composing the primary key of the\n    source data to identify the rows failing the DQ validations. Note: only one\n    of tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It\n    is mandatory to provide one of these arguments when using tag_source_data\n    as True. When tag_source_data is False, this is not mandatory, but still\n    recommended.\ntbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.\n    Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments needs to\n    be provided. It is mandatory to provide one of these arguments when using\n    tag_source_data as True. When tag_source_data is False, this is not\n    mandatory, but still recommended.\ngx_result_format - great expectations result format. Default: \"COMPLETE\".\n
tag_source_data - when set to true, this will ensure that the DQ process ends by\n    tagging the source data with an additional column with information about the\n    DQ results. This column makes it possible to identify if the DQ run\n    succeeded in general and, if not, it unlocks the insights to know what\n    specific rows have made the DQ validations fail and why. Default: False.\n    Note: it only works if result_sink_explode is True, gx_result_format is\n    COMPLETE, fail_on_error is False (which is done automatically when\n    you specify tag_source_data as True) and tbl_to_derive_pk or\n    unexpected_rows_pk is configured.\nstore_backend - which store_backend to use (e.g. s3 or file_system).\nlocal_fs_root_dir - path of the root directory. Note: only applicable for\n    store_backend file_system.\nbucket - the bucket name to consider for the store_backend (store DQ artefacts).\n    Note: only applicable for store_backend s3.\ndata_docs_bucket - the bucket name for data docs only. When defined, it will\n    supersede the bucket parameter.\nexpectations_store_prefix - prefix where to store expectations' data. Note: only\n    applicable for store_backend s3.\nvalidations_store_prefix - prefix where to store validations' data. Note: only\n    applicable for store_backend s3.\ndata_docs_prefix - prefix where to store data_docs' data. Note: only applicable\n    for store_backend s3.\ncheckpoint_store_prefix - prefix where to store checkpoints' data. Note: only\n    applicable for store_backend s3.\ndata_asset_name - name of the data asset to consider when configuring the great\n    expectations' data source.\nexpectation_suite_name - name to consider for great expectations' suite.\nassistant_options - additional options to pass to the DQ assistant processor.\nresult_sink_db_table - db.table_name indicating the database and table in which\n    to save the results of the DQ process.\nresult_sink_location - file system location in which to save the results of the\n    DQ process.\nresult_sink_partitions - the list of partitions to consider.\nresult_sink_format - format of the result table (e.g. delta, parquet, kafka...).\nresult_sink_options - extra spark options for configuring the result sink.\n    E.g: can be used to configure a Kafka sink if result_sink_format is kafka.\nresult_sink_explode - flag to determine if the output table/location should have\n    the columns exploded (as True) or not (as False). Default: True.\nresult_sink_extra_columns - list of extra columns to be exploded (following
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tassistant_options: Optional[dict] = None,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": ".*\") or columns to be selected. It is only used when\n result_sink_explode is set to True.\n source - name of data source, to be easier to identify in analysis. If not\n specified, it is set as default . This will be only used\n when result_sink_explode is set to True.\n fail_on_error - whether to fail the algorithm if the validations of your data in\n the DQ process failed.\n cache_df - whether to cache the dataframe before running the DQ process or not.\n critical_functions - functions that should not fail. When this argument is\n defined, fail_on_error is nullified.\n max_percentage_failure - percentage of failure that should be allowed.\n This argument has priority over both fail_on_error and critical_functions. Options for a merge operation.
\n\nmerge_predicate: predicate to apply to the merge operation so that we can check if a\n new record corresponds to a record already included in the historical data.\ninsert_only: indicates if the merge should only insert data (e.g., deduplicate\n scenarios).\ndelete_predicate: predicate to apply to the delete operation.\nupdate_predicate: predicate to apply to the update operation.\ninsert_predicate: predicate to apply to the insert operation.\nupdate_column_set: rules to apply to the update operation which allows to set the\n value for each column to be updated.\n (e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )\ninsert_column_set: rules to apply to the insert operation which allows to set the\n value for each column to be inserted.\n (e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
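\n\nExample (illustrative sketch; the predicate, the 'current'/'new' aliases and the column rules are hypothetical, loosely following the e.g. values documented above):\n\n    from lakehouse_engine.core.definitions import MergeOptions\n\n    # hypothetical merge keyed on 'id', bumping a counter on update\n    merge_opts = MergeOptions(\n        merge_predicate='current.id = new.id',\n        update_column_set={'count': 'current.count + 1'},\n        insert_column_set={'date': 'new.date', 'count': '1'},\n    )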
\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "\n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "Specification of an algorithm output.
\n\nThis is closely aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).
\n\nspec_id: id of the output specification.\ninput_id: id of the corresponding input specification.\nwrite_type: type of write operation.\ndata_format: format of the output. Defaults to DELTA.\ndb_table: table name in the form of
. .\nlocation: uri that identifies from where to write data in the specified format.\npartitions: list of partition input_col names.\nmerge_opts: options to apply to the merge operation.\nstreaming_micro_batch_transformers: transformers to invoke for each streaming micro\n batch, before writing (i.e., in Spark's foreachBatch structured\n streaming function). Note: the lakehouse engine manages this for you, so\n you don't have to manually specify streaming transformations here, so we don't\n advise you to manually specify transformations through this parameter. Supply\n them as regular transformers in the transform_specs sections of an ACON.\nstreaming_once: if the streaming query is to be executed just once, or not,\n generating just one micro batch.\nstreaming_processing_time: if streaming query is to be kept alive, this indicates\n the processing time of each micro batch.\nstreaming_available_now: if set to True, set a trigger that processes all available\n data in multiple batches then terminates the query.\n When using streaming, this is the default trigger that the lakehouse-engine will\n use, unless you configure a different one.\nstreaming_continuous: set a trigger that runs a continuous query with a given\n checkpoint interval.\nstreaming_await_termination: whether to wait (True) for the termination of the\n streaming query (e.g. timeout or exception) or not (False). Default: True.\nstreaming_await_termination_timeout: a timeout to set to the\n streaming_await_termination. Default: None.\nwith_batch_id: whether to include the streaming batch id in the final data, or not.\n It only takes effect in streaming mode.\noptions: dict with other relevant options according to the execution environment\n (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\n streaming, etc.\nstreaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\n but for the DQ functions to be executed. Used internally by the lakehouse\n engine, so you don't have to supply DQ functions through this parameter. Use the\n dq_specs of the acon instead.\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "
Terminator Specification.
\n\nI.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.
\n\nspec_id: id of the terminate specification.\nfunction: terminator function to execute.\nargs: arguments of the terminator function.\ninput_id: id of the corresponding output specification (Optional).
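\n\nExample (illustrative sketch only; the terminator function name, its args and the input_id are assumptions, not a confirmed list of supported terminators):\n\n    from lakehouse_engine.core.definitions import TerminatorSpec\n\n    # hypothetical terminator: optimize the written table after the load\n    optimize_step = TerminatorSpec(\n        function='optimize',\n        args={'db_table': 'my_db.sales'},\n        input_id='sales_output',\n    )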
\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "Reconciliator Specification.
\n\nmetrics: list of metrics in the form of:\n    [{\n    metric: name of the column present in both truth and current datasets,\n    aggregation: sum, avg, max, min, ...,\n    type: percentage or absolute,\n    yellow: value,\n    red: value\n    }].\nrecon_type: reconciliation type (percentage or absolute). Percentage calculates\n    the difference between truth and current results as a percentage ((x - y) / x),\n    and absolute calculates the raw difference (x - y).\ntruth_input_spec: input specification of the truth data.\ncurrent_input_spec: input specification of the current results data.\ntruth_preprocess_query: additional query on top of the truth input data to\n    preprocess the truth data before it gets fueled into the reconciliation process.\n    Important note: you need to assume that the data out of\n    the truth_input_spec is referencable by a table called 'truth'.\ntruth_preprocess_query_args: optional dict having the functions/transformations to\n    apply on top of the truth_preprocess_query and respective arguments. Note: cache\n    is being applied on the Dataframe, by default. For turning the default behavior\n    off, pass \"truth_preprocess_query_args\": [].\ncurrent_preprocess_query: additional query on top of the current results input data\n    to preprocess the current results data before it gets fueled into the\n    reconciliation process. Important note: you need to assume that the data out of\n    the current_results_input_spec is referencable by a table called 'current'.\ncurrent_preprocess_query_args: optional dict having the functions/transformations to\n    apply on top of the current_preprocess_query and respective arguments. Note:\n    cache is being applied on the Dataframe, by default. For turning the default\n    behavior off, pass \"current_preprocess_query_args\": [].\nignore_empty_df: optional boolean, to ignore the recon process if source & target\n    dataframes are empty; the recon will exit with success code (passed).\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "Data Quality Validator Specification.
\n\ninput_spec: input specification of the data to be checked/validated.\ndq_spec: data quality specification.\nrestore_prev_version: specify if, having\ndelta table/files as input, they should be restored to the\nprevious version if the data quality process fails. Note: this\nis only considered if fail_on_error is kept as True.
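\n\nExample (illustrative sketch; all spec ids, the table name and the 'batch' read type value are assumptions):\n\n    from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec, DQValidatorSpec, InputSpec\n\n    # hypothetical validator: check that 'id' is never null in a delta table\n    validator = DQValidatorSpec(\n        input_spec=InputSpec(spec_id='sales_input', read_type='batch', db_table='my_db.sales'),\n        dq_spec=DQSpec(\n            spec_id='sales_dq',\n            input_id='sales_input',\n            dq_type='validator',\n            dq_functions=[DQFunctionSpec(function='expect_column_values_to_not_be_null', args={'column': 'id'})],\n        ),\n    )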
\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "SQL definitions statements.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "\n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "File Manager s3 api keys.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "\n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "Sensor Specification.
\n\nsensor_id: sensor id.\nassets: a list of assets that are considered as available to\n    consume downstream after this sensor has status\n    PROCESSED_NEW_DATA.\ncontrol_db_table_name: db.table to store sensor metadata.\ninput_spec: input specification of the source to be checked for new data.\npreprocess_query: SQL query to transform/filter the result from the\n    upstream. Consider that we should refer to 'new_data' whenever\n    we are referring to the input of the sensor. E.g.:\n    \"SELECT dummy_col FROM new_data WHERE ...\"\ncheckpoint_location: optional location to store checkpoints to resume\n    from. These checkpoints use the same strategy as Spark's checkpoints.\n    For Spark readers that do not support checkpoints, use the\n    preprocess_query parameter to form a SQL query to filter the result\n    from the upstream accordingly.\nfail_on_empty_result: whether the sensor should throw an error if there is no new\n    data in the upstream. Default: True.
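\n\nExample (illustrative sketch; ids, tables, the checkpoint location and the 'streaming' read type value are hypothetical):\n\n    from lakehouse_engine.core.definitions import InputSpec, SensorSpec\n\n    # hypothetical sensor watching an upstream delta table for new data\n    sensor_spec = SensorSpec(\n        sensor_id='upstream_sales_sensor',\n        assets=['sales'],\n        control_db_table_name='my_db.sensor_control',\n        input_spec=InputSpec(spec_id='upstream', read_type='streaming', db_table='my_db.upstream_sales'),\n        preprocess_query=None,\n        checkpoint_location='s3://my-bucket/checkpoints/upstream_sales_sensor',\n        fail_on_empty_result=True,\n    )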
\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "\n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "Create SensorSpec from acon.
\n\nArgs:\n acon: sensor ACON.
\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "Status for a sensor.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "\n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "\n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "Defaults used on consuming data from SAP Logchain.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "\n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "Archive types.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "\n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "Generates a list containing all enum values.
\n\nReturn:\n A list with all enum values.
\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "Checks if the restore type exists in the enum values.
\n\nArgs:\n restore_type: restore type to check if exists.
\n\nReturn:\n If the restore type exists in our enum.
\n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "Archive types.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "\n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "Module to take care of creating a singleton of the execution environment class.
\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "Represents the basic resources regarding the engine execution environment.
\n\nCurrently, it is solely used to encapsulate the logic to get a Spark session.
\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "Get or create an execution environment session (currently Spark).
\n\nIt instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.
\n\nArgs:\n session: spark session.\n enable_hive_support: whether to enable hive support or not.\n app_name: application name.\n config: extra spark configs to supply to the spark session.
\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "Module representing an executable lakehouse engine component.
\n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "Abstract class defining the behaviour of an executable component.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "Define the executable component behaviour.
\n\nE.g., the behaviour of an algorithm inheriting from this.
\n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "File manager module.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "Set of actions to manipulate files in several ways.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "Construct FileManager algorithm instances.
\n\nArgs:\n configs: configurations for the FileManager algorithm.
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.get_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.get_function", "kind": "function", "doc": "Get a specific function to execute.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "Delete objects and 'directories' in s3.
\n\nIf dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "Copies objects and 'directories' in s3.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore", "kind": "function", "doc": "Request the restore of archived data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.check_restore_status", "kind": "function", "doc": "Check the restore status of archived data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "Request and wait for the restore to complete, polling the restore status.
\n\nAfter the restore is done, copy the restored files to destination
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "Set of actions to restore archives.
\n"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "Check the restore status of archived data.
\n\nArgs:\n source_bucket: name of bucket to check the restore status.\n source_object: object to check the restore status.
\n\nReturns:\n A dict containing the amount of objects in each status.
\n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "Request the restore of archived data.
\n\nArgs:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "Request and wait for the restore to complete, polling the restore status.
\n\nArgs:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.
\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "Module to define Sensor Manager classes.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "Class to control the Sensor execution.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "Check if sensor has acquired new data.
\n\nArgs:\n sensor_id: sensor id.\n control_db_table_name: db.table to control sensor runs.
\n\nReturns:\n True if acquired new data, otherwise False
\n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "Control sensor execution storing the execution data in a delta table.
\n\nArgs:\n sensor_spec: sensor spec containing all sensor\n information we need to update the control status.\n status: status of the sensor.\n upstream_key: upstream key (e.g., used to store an attribute\n name from the upstream so that new data can be detected\n automatically).\n upstream_value: upstream value (e.g., used to store the max\n attribute value from the upstream so that new data can be\n detected automatically).
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "Read data from delta table containing sensor status info.
\n\nArgs:\n sensor_id: sensor id. If this parameter is defined search occurs\n only considering this parameter. Otherwise, it considers sensor\n assets and checkpoint location.\n control_db_table_name: db.table to control sensor runs.\n assets: list of assets that are fueled by the pipeline\n where this sensor is.
\n\nReturn:\n Row containing the data for the provided sensor_id.
\n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "Class to deal with Sensor Upstream data.
\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "Generates a sensor preprocess query based on timestamp logic.
\n\nArgs:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder
\n\n?upstream_value
so that\n it can be replaced by the upstream_value in the\n control_db_table_name for this specific sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information\n to control how to identify new data from the\n upstream (e.g., a time column in the upstream).\n upstream_value: value for custom sensor\n to identify new data from the upstream\n (e.g., the value of a time present in the upstream)\n If none we will set the default value.\n Note: This parameter is used just to override the\n default value-2147483647
.\n upstream_table_name: value for custom sensor\n to query new data from the upstream.\n If none we will set the default value,\n oursensor_new_data
view.Return:\n The query string.
\n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "Generates a query to be used for a sensor having other sensor as upstream.
\n\nArgs:\n sensor_id: sensor id.
\n\nReturn:\n The query string.
\n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "Read new data from the upstream into the sensor 'new_data_df'.
\n\nArgs:\n sensor_spec: sensor spec containing all sensor information.
\n\nReturn:\n An empty dataframe if it doesn't have new data otherwise the new data
\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "Get new data from upstream df if it's present.
\n\nArgs:\n new_data_df: DataFrame possibly containing new data.
\n\nReturn:\n Optional row, present if there is new data in the upstream,\n absent otherwise.
\n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "Generates a sensor query based in the SAP Logchain table.
\n\nArgs:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the sap chain is already finished.\n status: db.table to retrieve the last status change\n timestamp.\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.
\n\nReturn:\n The query string.
\n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "Table manager module.
\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "Set of actions to manipulate tables/views in several ways.
\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "Construct TableManager algorithm instances.
\n\nArgs:\n configs: configurations for the TableManager algorithm.
\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "Get a specific function to execute.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "Create a new table or view on metastore.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "Create multiple tables or views on metastore.
\n\nIn this function the path to the ddl files can be separated by comma.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "Compute table statistics.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "Delete table function deletes table from metastore and erases all data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "Delete view function deletes view from metastore and erases all data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "Truncate function erases all data but keeps metadata.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "Vacuum function erases older versions from Delta Lake tables or locations.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "Describe function describes metadata from some table or view.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "Optimize function optimizes the layout of Delta Lake data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "Execute multiple statements in multiple sql files.
\n\nIn this function the path to the files is separated by comma.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "Execute sql commands separated by semicolon (;).
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "Show Table Properties.
\n\nReturns: a dataframe with the table properties.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "Get the primary key of a particular table.
\n\nReturns: the list of columns that are part of the primary key.
\n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "Run the repair table command.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "Run the delete where command.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "Package to define data quality processes available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "kind": "module", "doc": "Module containing the definition of a data assistant.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant", "kind": "class", "doc": "Class containing the data assistant.
\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant.run_data_assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant.run_data_assistant", "kind": "function", "doc": "Entrypoint to run the data assistant.
\n\nBased on the data, it uses GE Onboarding Data Assistant to generate expectations\nthat can be applied to the data. Then, it returns the generated expectations\nand, depending on your configuration, it can display plots of the metrics,\nexpectations and also display or store the profiling of the data, for you to get\na better sense of it.
\n\nArgs:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n assistant_options: additional options to pass to the DQ assistant processor.\n data: the input dataframe for which the DQ is running.\n profile_file_name: file name for storing the profiling html file.
\n\nReturns:\n The context with the expectation suite stored.
\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tassistant_options: dict,\tdata: pyspark.sql.dataframe.DataFrame,\tprofile_file_name: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "Package containing custom DQ expectations available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "Expectation to check if column 'a' is lower or equal than column 'b'.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "Asserts that column 'A' is lower or equal than column 'B'.
\n\nAdditionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "Expect values in column A to be lower or equal than column B.
\n\nArgs:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.
\n\nKeyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n
BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.\n\nReturns:\n An ExpectationSuiteValidationResult.
\n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "Expectation to check if column value is a date within a timeframe.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "Asserts that column values are a date that isn't older than a given date.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "Expect value in column to be date that is not older than a given time.
\n\nSince timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.
\n\nArgs:\n column: Name of column to validate\n Note: Column must be of type Date, Timestamp or String (with Timestamp format).\n Format: yyyy-MM-ddTHH:mm:ss\n timeframe: dict with the definition of the timeframe.\n kwargs: dict with additional parameters.
\n\nKeyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n
\n\nBOOLEAN_ONLY
,BASIC
(default),COMPLETE
, orSUMMARY
.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.Returns:\n An ExpectationSuiteValidationResult.
\n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "Expectation to check if column 'a' equals 'b', or 'c'.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "Expectation metric definition.
\n\nThis expectation asserts that column 'a' must equal to column 'b' or column 'c'.\nIn addition to this it is possible to validate that column 'b' or 'c' match a regex.
\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "MultiColumn Expectation.
\n\nExpect that the column 'a' is equal to 'b' when this is\nnot empty; otherwise 'a' must be equal to 'c'.
\n\nArgs:\n column_list: The column names to evaluate.
\n\nKeyword Args:\n ignore_row_if: default to \"never\".\n result_format: Which output mode to use:\n
\n\nBOOLEAN_ONLY
,BASIC
,COMPLETE
, orSUMMARY
.\n Default set toBASIC
.\n include_config: If True, then include the expectation\n config as part of the result object.\n Default set to True.\n catch_exceptions: If True, then catch exceptions\n and include them as part of the result object.\n Default set to False.Returns:\n An ExpectationSuiteValidationResult.
\n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "Expectation to check if aggregated column satisfy the condition.
\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "Expect agg of column to satisfy the condition specified.
\n\nArgs:\n template_dict: dict with the following keys:\n column (column to check sum).\n group_column_list (group by column names to be listed).\n condition (how to validate the aggregated value eg: between,\n greater, lesser).\n max_value (maximum allowed value).\n min_value (minimum allowed value).\n agg_type (sum/count/max/min).
\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "Validates that a configuration has been set.
\n\nArgs:\n configuration (OPTIONAL[ExpectationConfiguration]):\n An optional Expectation Configuration entry.
\n\nReturns:\n None. Raises InvalidExpectationConfigurationError
\n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "Module containing the class definition of the Data Quality Factory.
\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "Class for the Data Quality Factory.
\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "Run the specified data quality process on a dataframe.
\n\nBased on the dq_specs we apply the defined expectations on top of the dataframe\nin order to apply the necessary validations and then output the result of\nthe data quality process.
\n\nArgs:\n dq_spec: data quality specification.\n data: input dataframe to run the dq process on.
\n\nReturns:\n The DataFrame containing the results of the DQ process.
\n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "Package defining all the DQ custom exceptions.
\n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "Exception for when the data quality validations fail.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "Exception for when the checkpoint results parsing fail.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "Module containing the definition of a data quality validator.
\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "Class containing the data quality validator.
\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "Get a validator according to the specification.
\n\nWe use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.
\n\nArgs:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: run time batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n dq_functions: a list of DQFunctionSpec to consider in the expectation suite.\n critical_functions: list of critical expectations in the expectation suite.
\n\nReturns:\n The validator with the expectation suite stored.
\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "Tags the source dataframe with a new column having the DQ results.
\n\nArgs:\n source_pk: the primary key of the source data.\n source_df: the source dataframe to be tagged with DQ results.\n results_df: dq results dataframe.
\n\nReturns: a dataframe tagged with the DQ results.
\n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "Contract of the lakehouse engine with all the available functions to be executed.
\n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "Load data using the DataLoader algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "Execute the Reconciliator algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "Execute the DQValidator algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "Manipulate tables/views using Table Manager algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "Manipulate s3 files using File Manager algorithm.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "Execute a sensor based on a Sensor Algorithm Configuration.
\n\nA sensor is useful to check if an upstream system has new data.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "Update internal sensor status.
\n\nUpdate the sensor status in the control table,\nit should be used to tell the system\nthat the sensor has processed all new data that was previously identified,\nhence updating the shifted sensor status.\nUsually used to move from
\n\nSensorStatus.ACQUIRED_NEW_DATA
to\nSensorStatus.PROCESSED_NEW_DATA
,\nbut there might be scenarios - still to identify -\nwhere we can update the sensor status from/to different statuses.Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.
\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "Generates a preprocess query to be used in a sensor configuration.
\n\nArgs:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder ?default_upstream_key and\n ?default_upstream_value, so that it can be replaced by the\n respective values in the control_db_table_name for this specific\n sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information to control how to\n identify new data from the upstream (e.g., a time column in the\n upstream).\n upstream_value: the upstream value\n to identify new data from the upstream (e.g., the value of a time\n present in the upstream).\n upstream_table_name: value for custom sensor\n to query new data from the upstream\n If none we will set the default value,\n our
\n\nsensor_new_data
view.Return:\n The query string.
\n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "Generates a sensor query based in the SAP Logchain table.
\n\nArgs:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the sap chain is already finished.\n status: db.table to retrieve the last status change\n timestamp.\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.
\n\nReturn:\n The query string.
\n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "Send a notification using a notifier.
\n\nArgs:\n args: arguments for the notifier.
\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "Input and Output package responsible for the behaviour of reading and writing.
\n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "Package defining all the io custom exceptions.
\n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "Exception for when the input of an incremental filter is not found.
\n\nThis may occur when tables are being loaded in incremental way, taking the increment\ndefinition out of a specific table, but the table still does not exist, mainly\nbecause probably it was not loaded for the first time yet.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "Exception for when a user provides a wrong I/O format.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "Exception for when a user provides a not supported operation.
\n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "Defines abstract reader behaviour.
\n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "Abstract Reader class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "Construct Reader instances.
\n\nArgs:\n input_spec: input specification for reading data.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "Abstract read method.
\n\nReturns:\n A dataframe read according to the input specification.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "Module for reader factory.
\n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "Class for reader factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "Get data according to the input specification following a factory pattern.
\n\nArgs:\n spec: input specification to get the data.
\n\nReturns:\n A dataframe containing the data.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "Readers package to define reading behaviour.
\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "Module to define behaviour to read from dataframes.
\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "Class to read data from a dataframe.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "Construct DataFrameReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "Read data from a dataframe.
\n\nReturns:\n A dataframe containing the data from a dataframe previously\n computed.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "Module to define behaviour to read from files.
\n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "Class to read from files.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "Construct FileReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "Read file data.
\n\nReturns:\n A dataframe containing the data from the files.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "Module to define behaviour to read from JDBC sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "Class to read from JDBC source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "Construct JDBCReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "Read data from JDBC source.
\n\nReturns:\n A dataframe containing the data from the JDBC source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "Module to define behaviour to read from Kafka.
\n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "Class to read from Kafka.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "Construct KafkaReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "Read Kafka data.
\n\nReturns:\n A dataframe containing the data from Kafka.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "Module to define behaviour to read from a query.
\n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "Class to read data from a query.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "Construct QueryReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "Read data from a query.
\n\nReturns:\n A dataframe containing the data from the query.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "Module to define behaviour to read from SAP B4 sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "Class to read from SAP B4 source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "Construct SAPB4Reader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "Read data from SAP B4 source.
\n\nReturns:\n A dataframe containing the data from the SAP B4 source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "Module to define behaviour to read from SAP BW sources.
\n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "Class to read from SAP BW source.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "Construct SAPBWReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "Read data from SAP BW source.
\n\nReturns:\n A dataframe containing the data from the SAP BW source.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "Module to define behaviour to read from SFTP.
\n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "Class to read from SFTP.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "Construct SFTPReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "Read SFTP data.
\n\nReturns:\n A dataframe containing the data from SFTP.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "Module to define behaviour to read from tables.
\n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "Class to read data from a table.
\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "Construct TableReader instances.
\n\nArgs:\n input_spec: input specification.
\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "Read data from a table.
\n\nReturns:\n A dataframe containing the data from the table.
\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "Defines abstract writer behaviour.
\n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "Abstract Writer class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "Construct Writer instances.
\n\nArgs:\n output_spec: output specification to write data.\n df: dataframe to write.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "Abstract write method.
\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "Define how to write a streaming micro batch after transforming it.
\n\nThis function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.
\n\nArgs:\n kwargs: any keyword arguments.
\n\nReturns:\n A function to be executed in the foreachBatch spark write method.
\n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "Get the result of the transformations applied to a micro batch dataframe.
\n\nArgs:\n output_spec: output specification associated with the writer.\n batch_df: batch dataframe (given from streaming foreachBatch).\n batch_id: if of the batch (given from streaming foreachBatch).\n data: list of all dfs generated on previous steps before writer\n to be available on micro batch transforms.
\n\nReturns:\n The transformed dataframe.
\n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "Define which streaming trigger will be used.
\n\nArgs:\n output_spec: output specification.
\n\nReturns:\n A dict containing streaming trigger.
\n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "Run the data quality process in a streaming micro batch dataframe.
\n\nIterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.
\n\nArgs:\n df: the dataframe in which to run the dq process on.\n dq_spec: data quality specification.
\n\nReturns: the validated dataframe.
\n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "Module for writer factory.
\n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "Class for writer factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "Get a writer according to the output specification using a factory pattern.
\n\nArgs:\n OutputSpec spec: output specification to write data.\n DataFrame df: dataframe to be written.\n OrderedDict data: list of all dfs generated on previous steps before writer.
\n\nReturns:\n Writer: writer that will write the data.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "Package containing the writers responsible for writing data.
\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "Module to define behaviour to write to console.
\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "Class to write data to console.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "Construct ConsoleWriter instances.
\n\nArgs:\n output_spec: output specification\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "Write data to console.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "Module to define behaviour to write to dataframe.
\n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "Class to write data to dataframe.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "Construct DataFrameWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "Write data to dataframe.
\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "Module to define the behaviour of delta merges.
\n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "Class to merge data using delta lake.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "Construct DeltaMergeWriter instances.
\n\nArgs:\n output_spec: output specification containing merge options and\n relevant information.\n df: the dataframe containing the new data to be merged.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "Merge new data with current data.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "Module to define behaviour to write to files.
\n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "Class to write data to files.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "Construct FileWriter instances.
\n\nArgs:\n output_spec: output specification\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "Write data to files.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "Module that defines the behaviour to write to JDBC targets.
\n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "Class to write to JDBC targets.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "Construct JDBCWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be writen.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "Write data into JDBC target.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "Module that defines the behaviour to write to Kafka.
\n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "Class to write to a Kafka target.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "Construct KafkaWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "Write data to Kafka.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "Module that defines the behaviour to write to tables.
\n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "Class to write to a table.
\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "Construct TableWriter instances.
\n\nArgs:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.
\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "Write data to a table.
\n\nAfter the write operation we repair the table (e.g., update partitions).\nHowever, there's a caveat to this, which is the fact that this repair\noperation is not reachable if we are running long-running streaming mode.\nTherefore, we recommend not using the TableWriter with formats other than\ndelta lake for those scenarios (as delta lake does not need msck repair).\nSo, you can: 1) use delta lake format for the table; 2) use the FileWriter\nand run the repair with a certain frequency in a separate task of your\npipeline.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "Defines change data feed processor behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "Change data feed processor class.
\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "Expose CDF to external location.
\n\nArgs:\n spec: terminator specification.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "Delete old data from cdf delta table.
\n\nArgs:\n spec: terminator specifications.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "Vacuum old data from cdf delta table.
\n\nArgs:\n spec: terminator specifications.
\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "Module with dataset optimizer terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "Class with dataset optimizer terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "Optimize a dataset based on a set of pre-conceived optimizations.
\n\nMost of the times the dataset is a table, but it can be a file-based one only.
\n\nArgs:\n db_table: database_name.table_name.\n location: dataset/table filesystem location.\n compute_table_stats: to compute table statistics or not.\n vacuum: (delta lake tables only) whether to vacuum the delta lake\n table or not.\n vacuum_hours: (delta lake tables only) number of hours to consider\n in vacuum operation.\n optimize: (delta lake tables only) whether to optimize the table or\n not. Custom optimize parameters can be supplied through ExecEnv (Spark)\n configs\n optimize_where: expression to use in the optimize function.\n optimize_zorder_col_list: (delta lake tables only) list of\n columns to consider in the zorder optimization process. Custom optimize\n parameters can be supplied through ExecEnv (Spark) configs.\n debug: flag indicating if we are just debugging this for local\n tests and therefore pass through all the exceptions to perform some\n assertions in local tests.
\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "Module with notification terminator.
\n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "Abstract Notification class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "Construct Notification instances.
\n\nArgs:\n notification_spec: notification specification.
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "Abstract create notification method.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "Abstract send notification method.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "Check if given notification is a failure notification.
\n\nArgs:\n spec: spec to validate if it is a failure notification.
\n\nReturns:\n A boolean telling if the notification is a failure notification
\n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "Module for notifier factory.
\n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "Class for notification factory.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "Get a notifier according to the terminator specs using a factory.
\n\nArgs:\n spec: terminator specification.
\n\nReturns:\n Notifier: notifier that will handle notifications.
\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "Check if it is necessary to send a failure notification and generate it.
\n\nArgs:\n spec: List of termination specs\n exception: Exception that caused the failure.
\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "Notifications module.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "Module with email notifier.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "Base Notification class.
\n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "Construct Email Notification instance.
\n\nArgs:\n notification_spec: notification specification.
\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "Creates the notification to be sent.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "Sends the notification by using a series of methods.
\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "Email notification templates.
\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "Templates for notifications.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "Defines terminator behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "Sensor Terminator class.
\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "Update internal sensor status.
\n\nUpdate the sensor status in the control table, it should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the shifted sensor status.\nUsually used to move from
\n\nSensorStatus.ACQUIRED_NEW_DATA
to\nSensorStatus.PROCESSED_NEW_DATA
, but there might be scenarios - still\nto identify - where we can update the sensor status from/to different statuses.Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.
\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "Defines terminator behaviour.
\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "Spark Terminator class.
\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "Terminate spark session.
\n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "Module with the factory pattern to return terminators.
\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "TerminatorFactory class following the factory pattern.
\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "Execute a terminator following the factory pattern.
\n\nArgs:\n spec: terminator specification.\n df: dataframe to be used in the terminator. Needed when a\n terminator requires one dataframe as input.
\n\nReturns:\n Transformer function to be executed in .transform() spark function.
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "Package to define transformers available in the lakehouse engine.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "Aggregators module.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "Class containing all aggregation functions.
\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "Get the maximum value of a given column of a dataframe.
\n\nArgs:\n input_col: name of the input column.\n output_col: name of the output column (defaults to \"latest\").
\n\nReturns:\n A function to be executed in the .transform() spark function.
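\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the column \"order_date\" are hypothetical):\n    from lakehouse_engine.transformers.aggregators import Aggregators\n    # single-row dataframe whose \"latest\" column holds the max of \"order_date\"\n    latest_df = df.transform(Aggregators.get_max_value(input_col=\"order_date\"))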
\n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "Column creators transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "Class containing all functions that can create columns to add value.
\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "Create a sequential but not consecutive id.
\n\nArgs:\n output_col: optional name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "Create a sequential and consecutive id.
\n\nArgs:\n output_col: optional name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "Create columns given a map of column names and literal values (constants).
\n\nArgs:\n literals: map of column names and literal values (constants).
\n\nReturns:\n Callable: A function to be executed in the .transform() spark function.
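\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the literal values are hypothetical):\n    from lakehouse_engine.transformers.column_creators import ColumnCreators\n    # adds constant columns \"source_system\" and \"is_active\" to every row\n    df_with_lit = df.transform(ColumnCreators.with_literals({\"source_system\": \"sap\", \"is_active\": True}))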
\n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "Module with column reshaping transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "Class containing column reshaping transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "Cast specific columns into the designated type.
\n\nArgs:\n cols: dict with columns and respective target types.\n Target types need to have the exact name of spark types:\n https://spark.apache.org/docs/latest/sql-ref-datatypes.html
\n\nReturns:\n A function to be called in .transform() spark function.
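\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the columns \"amount\" and \"order_date\" are hypothetical):\n    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers\n    # casts \"amount\" to double and \"order_date\" to date, using spark type names\n    casted_df = df.transform(ColumnReshapers.cast({\"amount\": \"double\", \"order_date\": \"date\"}))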
\n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "Select specific columns with specific output aliases.
\n\nArgs:\n cols: dict with columns to select and respective aliases.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "Flatten the schema of the dataframe.
\n\nArgs:\n max_level: level until which you want to flatten the schema.\n Default: None.\n shorten_names: whether to shorten the names of the prefixes\n of the fields being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened\n or not. Default: True.\n num_chars: number of characters to consider when shortening\n the names of the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten.\n Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "Explode columns with types like ArrayType and MapType.
\n\nAfterwards, the flatten_schema transformation can be applied, if desired, for\nexample to explode the map (as we explode a StructType) or to explode a\nStructType inside the array.\nWe recommend always specifying the columns to explode, rather than\nexploding all columns.
\n\nArgs:\n explode_arrays: whether you want to explode array columns (True)\n or not (False). Default: False.\n array_cols_to_explode: array columns which you want to explode.\n If not specified, all array columns are exploded.\n Default: None.\n explode_maps: whether you want to explode map columns (True)\n or not (False). Default: False.\n map_cols_to_explode: map columns which you want to explode.\n If not specified, all map columns are exploded.\n Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "Execute Spark SQL expressions to create the specified columns.
\n\nThis function uses the Spark expr function:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/\npyspark.sql.functions.expr.html
\n\nArgs:\n cols_and_exprs: dict with columns and respective expressions to compute\n (Spark SQL expressions).
\n\nReturns:\n A function to be called in .transform() spark function.
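\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the columns \"quantity\" and \"price\" are hypothetical):\n    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers\n    # computes new columns from Spark SQL expressions\n    enriched_df = df.transform(ColumnReshapers.with_expressions({\"total\": \"quantity * price\", \"load_date\": \"current_date()\"}))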
\n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "Rename specific columns into the designated name.
\n\nArgs:\n cols: dict with columns and respective target names.\n escape_col_names: whether to escape column names (e.g., /BIC/COL1) or not.\n If True, it creates a column with the new name and drops the old one.\n If False, it uses the native withColumnRenamed Spark function. Default: True.
\n\nReturns:\n Function to be called in .transform() spark function.
\n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "Select all attributes from avro.
\n\nArgs:\n schema: the schema string.\n key_col: the name of the key column.\n value_col: the name of the value column.\n options: extra options (e.g., mode: \"PERMISSIVE\").\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.
\n\nReturns:\n Function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "Select all attributes from avro using a schema registry.
\n\nArgs:\n schema_registry: the url to the schema registry.\n value_schema: the name of the value schema entry in the schema registry.\n value_col: the name of the value column.\n key_schema: the name of the key schema entry in the schema\n registry. Default: None.\n key_col: the name of the key column.\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.
\n\nReturns:\n Function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "Convert a json string into a json column (struct).
\n\nThe new json column can be added to the existing columns (default) or it can\nreplace all the others, being the only one to output. The new column gets the\nsame name as the original one suffixed with '_json'.
\n\nArgs:\n input_col: name of the input column containing the json string.\n schema_path: path to the StructType schema (spark schema).\n schema: dict with the StructType schema (spark schema).\n json_options: options to parse the json value.\n drop_all_cols: whether to drop all the input columns or not.\n Defaults to False.
\n\nReturns:\n A function to be called in .transform() spark function.
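\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df, the column \"payload\" and the schema dict, given in Spark's StructType json format, are hypothetical):\n    from lakehouse_engine.transformers.column_reshapers import ColumnReshapers\n    schema = {\"type\": \"struct\", \"fields\": [{\"name\": \"id\", \"type\": \"string\", \"nullable\": True, \"metadata\": {}}]}\n    # adds a struct column named \"payload_json\" next to the existing columns\n    parsed_df = df.transform(ColumnReshapers.from_json(input_col=\"payload\", schema=schema))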
\n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "Convert dataframe columns into a json value.
\n\nArgs:\n in_cols: name(s) of the input column(s).\n Example values:\n \"*\" - all\n columns; \"my_col\" - one column named \"my_col\";\n \"my_col1, my_col2\" - two columns.\n out_col: name of the output column.\n json_options: options to parse the json value.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "Condensers module.
\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "Class containing all the functions to condensate data for later merges.
\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "Condense Change Data Capture (CDC) based on record_mode strategy.
\n\nThis CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.
\n\nArgs:\n business_key: The business key (logical primary key) of the data.\n ranking_key_desc: In this type of CDC condensation the data needs to be\n ordered descendingly in a certain way, using columns specified in this\n parameter.\n ranking_key_asc: In this type of CDC condensation the data needs to be\n ordered ascendingly in a certain way, using columns specified in\n this parameter.\n record_mode_col: Name of the record mode input_col.\n valid_record_modes: Depending on the context, not all record modes may be\n considered for condensation. Use this parameter to skip those.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "Condense data based on a simple group by + take latest mechanism.
\n\nArgs:\n group_key: list of column names to use in the group by.\n ranking_key: the data needs to be ordered descendingly using columns\n specified in this parameter.\n descending: if the ranking considers descending order or not. Defaults to\n True.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "Custom transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "Class representing a CustomTransformers.
\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "Execute a custom transformation provided by the user.
\n\nThis transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.
\n\nAttention!!! Please bear in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame, because that is\nhow Spark's .transform method is able to chain the transformations.\nExample:\n def my_custom_logic(df: DataFrame) -> DataFrame:\n     ...
\n\nArgs:\n custom_transformer: custom transformer function. A python function with all\n required pyspark logic provided by the user.
\n\nReturns:\n Callable: the same function provided as parameter, in order to be called\n later in the TransformerFactory.
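\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the column \"status\" are hypothetical):\n    from pyspark.sql import DataFrame\n    from pyspark.sql.functions import col\n    from lakehouse_engine.transformers.custom_transformers import CustomTransformers\n    def my_custom_logic(df: DataFrame) -> DataFrame:\n        # any user-provided pyspark logic, as long as it receives and returns a DataFrame\n        return df.filter(col(\"status\") == \"active\")\n    transformed_df = df.transform(CustomTransformers.custom_transformation(my_custom_logic))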
\n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "Module with data masking transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "Class containing data masking transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "Mask specific columns using an hashing approach.
\n\nArgs:\n cols: list of column names to mask.\n approach: hashing approach. Defaults to 'SHA'. There's \"MURMUR3\" as well.\n num_bits: number of bits of the SHA approach. Only applies to SHA approach.\n suffix: suffix to apply to new column name. Defaults to \"_hash\".\n Note: you can pass an empty suffix to have the original column replaced.
\n\nReturns:\n A function to be called in .transform() spark function.
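\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the columns are hypothetical):\n    from lakehouse_engine.transformers.data_maskers import DataMaskers\n    # adds \"customer_email_hash\" and \"customer_name_hash\" columns using the default SHA approach\n    masked_df = df.transform(DataMaskers.hash_masker(cols=[\"customer_email\", \"customer_name\"]))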
\n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "Drop specific columns.
\n\nArgs:\n cols: list of column names to drop.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "Module containing date transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "Class with set of transformers to transform dates in several forms.
\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "Add column with current date.
\n\nThe current date comes from the driver as a constant, not from every executor.
\n\nArgs:\n output_col: name of the output column.
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "Convert multiple string columns with a source format into dates.
\n\nArgs:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., YYYY-MM-dd). Check here:\n https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
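\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the column \"order_date\" with values like \"2023-10-18\" are hypothetical):\n    from lakehouse_engine.transformers.date_transformers import DateTransformers\n    dated_df = df.transform(DateTransformers.convert_to_date(cols=[\"order_date\"], source_format=\"yyyy-MM-dd\"))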
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "Convert multiple string columns with a source format into timestamps.
\n\nArgs:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS). Check\n here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "Convert multiple date/timestamp columns into strings with the target format.
\n\nArgs:\n cols: list of names of the string columns to convert.\n target_format: strings target format (e.g., YYYY-MM-dd). Check here:\n https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "Create day/month/week/quarter/year hierarchy for the provided date columns.
\n\nUses Spark's extract function.
\n\nArgs:\n cols: list of names of the date columns to create the hierarchy.\n formats: dict with the correspondence between the hierarchy and the format\n to apply.\n Example: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n }\n Check here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html
\n\nReturns:\n A function to be executed in the .transform() spark function.
\n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "Module for all the transformers exceptions.
\n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "Exception for when a user provides wrong arguments to a transformer.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "Exception for when a user requests a transformer not supported in streaming.
\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "Module containing the filters transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "Class containing the filters transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "Incrementally Filter a certain dataframe given an increment logic.
\n\nThis logic can either be an increment value or an increment dataframe from\nwhich to get the latest value. By default the operator for the filtering process\nis greater or equal, to cover cases where we receive late arriving data not covered\nin a previous load. You can change greater_or_equal to false to use greater,\nwhen you trust the source will never output more data with the increment after\nyou have loaded the data (e.g., you will never load data while the source is still\ndumping data, which may cause you to get an incomplete picture of the last\narrived data).
\n\nArgs:\n input_col: input column name.\n increment_value: value used to filter the data, considering the\n provided input_col.\n increment_df: a dataframe to get the increment value from.\n You either specify this or the increment_value (this takes precedence).\n This is a good approach to get the latest value from a given dataframe\n that was read, and apply that value as a filter here. In this way you can\n perform incremental loads based on the last value of a given dataframe\n (e.g., table or file based). Can be used together with the\n get_max_value transformer to accomplish these incremental based loads.\n See our append load feature tests to see how to provide an acon for\n incremental loads, taking advantage of the scenario explained here.\n increment_col: name of the column from which to get the increment\n value (when using the increment_df approach). This assumes there's\n only one row in the increment_df, which is why it is a good idea to use it\n together with the get_max_value transformer. Defaults to \"latest\"\n because that's the default output column name provided by the\n get_max_value transformer.\n greater_or_equal: whether filtering should also include the\n increment value or not (useful for scenarios where you are performing\n increment loads but still want to include data considering the increment\n value, and not only values greater than that increment... examples may\n include scenarios where you already loaded data including those values,\n but the source produced more data containing those values).\n Defaults to False.
\n\nReturns:\n A function to be called in .transform() spark function.
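\n\nExample (a minimal illustrative sketch, not taken from the library docs; new_df, bronze_df and the column \"updated_at\" are hypothetical):\n    from lakehouse_engine.transformers.aggregators import Aggregators\n    from lakehouse_engine.transformers.filters import Filters\n    # single-row dataframe with the max \"updated_at\" already loaded, in a column named \"latest\"\n    latest_df = bronze_df.transform(Aggregators.get_max_value(\"updated_at\"))\n    # keep only the records of new_df with \"updated_at\" >= the latest loaded value\n    incremental_df = new_df.transform(Filters.incremental_filter(input_col=\"updated_at\", increment_df=latest_df, greater_or_equal=True))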
\n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "Filter a dataframe based on an expression.
\n\nArgs:\n exp: filter expression.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "Filter a dataframe's columns based on a list of SQL expressions.
\n\nArgs:\n exp: column filter expressions.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "Drop duplicate rows using spark function dropDuplicates().
\n\nThis transformer can be used with or without arguments.\nThe provided argument needs to be a list of columns.\nFor example: [\u201cName\u201d,\u201dVAT\u201d] will drop duplicate records within\n\"Name\" and \"VAT\" columns.\nIf the transformer is used without providing any columns list or providing\nan empty list, such as [] the result will be the same as using\nthe distinct() pyspark function. If the watermark dict is present it will\nensure that the drop operation will apply to rows within the watermark timeline\nwindow.
\n\nArgs:\n cols: column names.\n watermarker: properties to apply watermarker to the transformer.
\n\nReturns:\n A function to be called in .transform() spark function.
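\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the columns \"Name\" and \"VAT\" are hypothetical):\n    from lakehouse_engine.transformers.filters import Filters\n    deduped_df = df.transform(Filters.drop_duplicate_rows(cols=[\"Name\", \"VAT\"]))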
\n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "Module with join transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "Class containing join transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "Join two dataframes based on specified type and columns.
\n\nSome stream to stream joins are only possible if you apply Watermark, so this\nmethod also provides a parameter to enable watermarking specification.
\n\nArgs:\n left_df_alias: alias of the first dataframe.\n join_with: right dataframe.\n right_df_alias: alias of the second dataframe.\n join_condition: condition to join dataframes.\n join_type: type of join. Defaults to inner.\n Available values: inner, cross, outer, full, full outer,\n left, left outer, right, right outer, semi,\n left semi, anti, and left anti.\n broadcast_join: whether to perform a broadcast join or not.\n select_cols: list of columns to select at the end.\n watermarker: properties to apply watermarking.
\n\nReturns:\n A function to be called in .transform() spark function.
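\n\nExample (a minimal illustrative sketch, not taken from the library docs; orders_df, customers_df and the column \"customer_id\" are hypothetical):\n    from lakehouse_engine.transformers.joiners import Joiners\n    # left join orders with customers using the default aliases \"a\" and \"b\"\n    joined_df = orders_df.transform(Joiners.join(join_with=customers_df, join_condition=\"a.customer_id = b.customer_id\", join_type=\"left\"))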
\n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "Module with null handlers transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "Class containing null handler transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "Replace nulls in a dataframe.
\n\nArgs:\n replace_on_nums: whether to replace nulls on numeric columns.\n Applies to ints, longs and floats.\n default_num_value: default integer value to use as replacement.\n replace_on_strings: whether to replace nulls on string columns.\n default_string_value: default string value to use as replacement.\n subset_cols: list of columns in which to replace nulls. If not\n provided, all nulls in all columns will be replaced as specified.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "Optimizers module.
\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "Class containing all the functions that can provide optimizations.
\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "Caches the current dataframe.
\n\nThe default storage level used is MEMORY_AND_DISK.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "Caches the current dataframe with a specific StorageLevel.
\n\nArgs:\n storage_level: the type of StorageLevel, as default MEMORY_AND_DISK_DESER.\n More options here: https://spark.apache.org/docs/latest/api/python/\n reference/api/pyspark.StorageLevel.html
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "Removes the dataframe from the disk and memory.
\n\nArgs:\n blocking: whether to block until all the data blocks are\n removed from disk/memory or run asynchronously.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "Regex transformers module.
\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "Class containing all regex functions.
\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "Get the result of applying a regex to an input column (via regexp_extract).
\n\nArgs:\n input_col: name of the input column.\n output_col: name of the output column.\n regex: regular expression.\n drop_input_col: whether to drop input_col or not.\n idx: index to return.
\n\nReturns:\n A function to be executed in the .transform() spark function.
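\n\nExample (a minimal illustrative sketch, not taken from the library docs; the dataframe df and the column \"file_name\" with values like \"sales_20231018.csv\" are hypothetical):\n    from lakehouse_engine.transformers.regex_transformers import RegexTransformers\n    # extracts the 8-digit date portion of \"file_name\" into \"file_date\"\n    df_with_date = df.transform(RegexTransformers.with_regex_value(input_col=\"file_name\", output_col=\"file_date\", regex=\"_([0-9]{8})\", idx=1))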
\n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "Module with repartitioners transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "Class containing repartitioners transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "Coalesce a dataframe into n partitions.
\n\nArgs:\n num_partitions: num of partitions to coalesce.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "Repartition a dataframe into n partitions.
\n\nIf num_partitions is provided repartitioning happens based on the provided\nnumber, otherwise it happens based on the values of the provided cols (columns).
\n\nArgs:\n num_partitions: num of partitions to repartition.\n cols: list of columns to use for repartitioning.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "Module with the factory pattern to return transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "TransformerFactory class following the factory pattern.
\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "Get a transformer following the factory pattern.
\n\nArgs:\n spec: transformer specification (individual transformation... not to be\n confused with list of all transformations).\n data: ordered dict of dataframes to be transformed. Needed when a\n transformer requires more than one dataframe as input.
\n\nReturns:\n Transformer function to be executed in .transform() spark function.
\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "Module with union transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "Class containing union transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "Union dataframes, resolving columns by position (not by name).
\n\nArgs:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "Union dataframes, resolving columns by name (not by position).
\n\nArgs:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.\n allow_missing_columns: allow the union of DataFrames with different\n schemas.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "Watermarker module.
\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "Class containing all watermarker transformers.
\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "Get the dataframe with watermarker defined.
\n\nArgs:\n watermarker_column: name of the input column to be considered for\n the watermarking. Note: it must be a timestamp.\n watermarker_time: time window to define the watermark value.
\n\nReturns:\n A function to be executed on other transformers.
\n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "Utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "Config utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "Module to read configurations.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "Config utilities class.
\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "Get acon based on a filesystem path or on a dict.
\n\nArgs:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).
\n\nReturns:\n Dict representation of an acon.
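\n\nExample (a minimal illustrative sketch, not taken from the library docs; the acon path is hypothetical):\n    from lakehouse_engine.utils.configs.config_utils import ConfigUtils\n    acon = ConfigUtils.get_acon(acon_path=\"s3://my-bucket/acons/my_algorithm.json\")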
\n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "Get Lakehouse Engine configurations.
\n\nReturns:\n A dictionary with the engine configurations.
\n", "signature": "() -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "Read an acon (algorithm configuration) file.
\n\nArgs:\n path: path to the acon file.
\n\nReturns:\n The acon file content as a dict.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "Read a DDL file in Spark SQL format from a cloud object storage system.
\n\nArgs:\n path: path to the SQL file.
\n\nReturns:\n Content of the SQL file.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "Utilities for databricks operations.
\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "Databricks utilities class.
\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "Get db utils on databricks.
\n\nArgs:\n spark: spark session.
\n\nReturns:\n Dbutils from databricks.
\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "Get notebook context from running acon.
\n\nReturns:\n Dict containing databricks notebook context.
\n", "signature": "() -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "Utilities to be used by custom expectations.
\n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "Validates the test results of the custom expectations.
\n\nIf you need to make additional validations on your custom expectation\nand/or require additional fields to be returned you can add them before\ncalling this function. The partial_success and partial_result\noptional parameters can be used to pass the result of additional\nvalidations and add more information to the result key of the\nreturned dict respectively.
\n\nArgs:\n expectation: Expectation to validate.\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n partial_success: Result of validations done before calling this method.\n partial_result: Extra fields to be returned to the user.
\n\nReturns:\n The result of the validation.
\n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "Extraction utilities package.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "Utilities module for JDBC extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "Standardize the types of extractions we can have from a JDBC source.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "\n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "\n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "Configurations available for an Extraction from a JDBC source.
\n\nThese configurations cover:\n user: username to connect to JDBC source.\n password: password to connect to JDBC source (always use secrets,\n don't use text passwords in your code).\n url: url to connect to JDBC source.\n dbtable: database.table to extract data from.\n calc_upper_bound_schema: custom schema used for the upper bound calculation.\n changelog_table: table of type changelog from which to extract data,\n when the extraction type is delta.\n partition_column: column used to split the extraction.\n latest_timestamp_data_location: data location (e.g., s3) containing the data\n to get the latest timestamp already loaded into bronze.\n latest_timestamp_data_format: the format of the dataset in\n latest_timestamp_data_location. Default: delta.\n extraction_type: type of extraction (delta or init). Default: \"delta\".\n driver: JDBC driver name. Default: \"com.sap.db.jdbc.Driver\".\n num_partitions: number of Spark partitions to split the extraction.\n lower_bound: lower bound to decide the partition stride.\n upper_bound: upper bound to decide the partition stride. If\n calculate_upper_bound is True, then upperBound will be\n derived by our upper bound optimizer, using the partition column.\n default_upper_bound: the value to use as default upper bound in case\n the result of the upper bound calculation is None. Default: \"1\".\n fetch_size: how many rows to fetch per round trip. Default: \"100000\".\n compress: enable network compression. Default: True.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.\n min_timestamp: min timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the location provided.\n In case this one is provided it has precedence and the calculation\n is not done.\n max_timestamp: max timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the table having information\n about the extraction requests, their timestamps and their status.\n In case this one is provided it has precedence and the calculation\n is not done.\n generate_predicates: whether to generate predicates automatically or not.\n Default: False.\n predicates: list containing all values to partition (if generate_predicates\n is used, the manual values provided are ignored). Default: None.\n predicates_add_null: whether to consider null on predicates list.\n Default: True.\n extraction_timestamp: the timestamp of the extraction. Default: current time\n following the format \"%Y%m%d%H%M%S\".\n max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\n from the table holding the extraction requests information.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant JDBC sources.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "Construct JDBCExtractionUtils.
\n\nArgs:\n jdbc_extraction: JDBC Extraction configurations. Can be of type:\n JDBCExtraction, SAPB4Extraction or SAPBWExtraction.
\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "Helper to get additional Spark Options initially passed.
\n\nIf people provide additional Spark options, not covered by the util function\narguments (get_spark_jdbc_options), we need to consider them.\nThus, we update the options retrieved by the utils, by checking if there is\nany Spark option initially provided that is not yet considered in the retrieved\noptions or function arguments and if the value for the key is not None.\nIf these conditions are filled, we add the options and return the complete dict.
\n\nArgs:\n input_spec: the input specification.\n options: dict with Spark options.\n ignore_options: list of options to be ignored by the process.\n Spark read has two different approaches to parallelize\n reading process, one of them is using upper/lower bound,\n another one is using predicates, those process can't be\n executed at the same time, you must choose one of them.\n By choosing predicates you can't pass lower and upper bound,\n also can't pass number of partitions and partition column\n otherwise spark will interpret the execution partitioned by\n upper and lower bound and will expect to fill all variables.\n To avoid fill all predicates hardcoded at the acon, there is\n a feature that automatically generates all predicates for init\n or delta load based on input partition column, but at the end\n of the process, partition column can't be passed to the options,\n because we are choosing predicates execution, that is why to\n generate predicates we need to pass some options to ignore.
\n\nReturns:\n a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).
\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "Get the predicates list, based on a predicates query.
\n\nArgs:\n predicates_query: query to use as the basis to get the distinct values for\n a specified column, based on which predicates are generated.
\n\nReturns:\n List containing the predicates to use to split the extraction from\n JDBC sources.
\n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "Get the Spark options to extract data from a JDBC source.
\n\nReturns:\n The Spark jdbc args dictionary, including the query to submit\n and also options args dictionary.
\n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "Get an optimal upperBound to properly split a Spark JDBC extraction.
\n\nReturns:\n Either an int, date or timestamp to serve as upperBound Spark JDBC option.
\n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "Utilities module for SAP B4 extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "Standardise the types of ADSOs we can have for Extractions from SAP B4.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "\n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "\n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "\n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "Configurations available for an Extraction from SAP B4.
\n\nIt inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\nThese configurations cover:\n latest_timestamp_input_col: the column containing the request timestamps\n in the dataset in latest_timestamp_data_location. Default: REQTSN.\n request_status_tbl: the name of the SAP B4 table having information\n about the extraction requests. Composed of database.table.\n Default: SAPHANADB.RSPMREQUEST.\n request_col_name: name of the column having the request timestamp to join\n with the request status table. Default: REQUEST_TSN.\n data_target: the data target to extract from. Used in the join operation with\n the request status table.\n act_req_join_condition: the join condition with the activation table\n can be changed using this property.\n Default: 'tbl.reqtsn = req.request_col_name'.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog table) or not.\n extra_cols_req_status_tbl: columns to be added from request status table.\n It needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\n req.col2 as column_two\".\n request_status_tbl_filter: filter to use for filtering the request status table,\n influencing the calculation of the max timestamps and the delta extractions.\n adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from SAP B4.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "Construct SAPB4ExtractionUtils.
\n\nArgs:\n sap_b4_extraction: SAP B4 Extraction configurations.
\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "Get the data_target from the data_target option or derive it.
\n\nBy definition data_target is the same for the table and changelog table and\nis the same string ignoring everything before / and the first and last\ncharacter after /. E.g. for a dbtable /BIC/abtable12, the data_target\nwould be btable1.
\n\nArgs:\n input_spec_opt: options from the input_spec.
\n\nReturns:\n A string with the data_target.
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "Utilities module for SAP BW extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "Configurations available for an Extraction from SAP BW.
\n\nIt inherits from SAPBWExtraction configurations, so it can use\nand/or overwrite those configurations.
\n\nThese configurations cover:\n latest_timestamp_input_col: the column containing the actrequest timestamp\n in the dataset in latest_timestamp_data_location. Default:\n \"actrequest_timestamp\".\n act_request_table: the name of the SAP BW activation requests table.\n Composed of database.table. Default: SAPPHA.RSODSACTREQ.\n request_col_name: name of the column having the request to join\n with the activation request table. Default: actrequest.\n act_req_join_condition: the join condition into activation table\n can be changed using this property.\n Default: 'changelog_tbl.request = act_req.request_col_name'.\n odsobject: name of BW Object, used for joining with the activation request\n table to get the max actrequest_timestamp to consider while filtering\n the changelog table.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog) table or not. Default: True.\n extra_cols_act_request: list of columns to be added from act request table.\n It needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\n as column_one, act_req.col2 as column_two\".\n get_timestamp_from_act_request: whether to get init timestamp\n from act request table or assume current/given timestamp.\n sap_bw_schema: sap bw schema. Default: SAPPHA.\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant JDBC sources.
\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "Construct SAPBWExtractionUtils.
\n\nArgs:\n sap_bw_extraction: SAP BW Extraction configurations.
\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "Get the changelog table, given an odsobject.
\n\nReturns:\n String to use as changelog_table.
\n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "Get the odsobject based on the provided options.
\n\nThe table name may also include the db name, so we need to split it.\nMoreover, people may need to specify the odsobject explicitly if\nit is different from the dbtable.
\n\nArgs:\n input_spec_opt: options from the input_spec.
\n\nReturns:\n A string with the odsobject.
\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "Utilities module for SFTP extraction processes.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "Formats of algorithm input.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "\n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "Standardize the types of filters we can have from a SFTP source.
\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "\n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "Utils for managing data extraction from particularly relevant SFTP sources.
\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "Get a list of files to be extracted from SFTP.
\n\nThe arguments (options_args) to list files are:\ndate_time_gt(str):\n Filter the files greater than the given datetime,\n formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\ndate_time_lt(str):\n Filter the files lower than the given datetime,\n formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\nearliest_file(bool):\n Filter the earliest dated file in the directory.\nfile_name_contains(str):\n Filter the files whose names match the given pattern.\nlatest_file(bool):\n Filter the most recent dated file in the directory.\nsub_dir(bool):\n When true, the engine will search for files in subdirectories\n of the remote_path.\n It will consider one level below the remote_path.\n When sub_dir is used with the latest_file/earliest_file argument,\n the engine will retrieve the latest_file/earliest_file\n for each subdirectory.
\n\nArgs:\n sftp: the SFTP client object.\n remote_path: path of files to be filtered.\n options_args: options from the acon.
\n\nReturns:\n A set containing the file names to be passed to Spark.
\n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "Get the SFTP client.
\n\nThe SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.
\n\nArgs:\n options_args: dictionary containing SFTP connection parameters.\n The Paramiko arguments expected to connect are:\n \"hostname\": the server to connect to.\n \"port\": the server port to connect to.\n \"username\": the username to authenticate as.\n \"password\": used for password authentication.\n \"pkey\": optional - a private key to use for authentication.\n \"passphrase\": optional - used for decrypting private keys.\n \"key_filename\": optional - the filename, or list of filenames,\n of optional private key(s) and/or certs to try for authentication.\n \"timeout\": an optional timeout (in seconds) for the TCP connect.\n \"allow_agent\": optional - set to False to disable\n connecting to the SSH agent.\n \"look_for_keys\": optional - set to False to disable searching\n for discoverable private key files in ~/.ssh/.\n \"compress\": optional - set to True to turn on compression.\n \"sock\": optional - an open socket or socket-like object\n to use for communication to the target host.\n \"gss_auth\": optional - True if you want to use GSS-API authentication.\n \"gss_kex\": optional - Perform GSS-API Key Exchange and\n user authentication.\n \"gss_deleg_creds\": optional - Delegate GSS-API client\n credentials or not.\n \"gss_host\": optional - The target's name in the Kerberos database.\n \"gss_trust_dns\": optional - Indicates whether or\n not the DNS is trusted to securely canonicalize the name of the\n host being connected to (default True).\n \"banner_timeout\": an optional timeout (in seconds)\n to wait for the SSH banner to be presented.\n \"auth_timeout\": an optional timeout (in seconds)\n to wait for an authentication response.\n \"disabled_algorithms\": an optional dict passed directly to Transport\n and its keyword argument of the same name.\n \"transport_factory\": an optional callable which is handed a subset of\n the constructor arguments (primarily those related to the socket,\n GSS functionality, and algorithm selection) and generates a\n Transport instance to be used by this client.\n Defaults to Transport.__init__.
\n\nThe parameter to specify the private key is expected to be in RSA format.\nAttempting a connection with a blank host key is not allowed\nunless the argument \"add_auto_policy\" is explicitly set to True.
\n\nReturns:\n sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.
\n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "Validate the file extension based on the format definitions.
\n\nArgs:\n files_format: a string containing the file extension.
\n\nReturns:\n The string validated and formatted.
\n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "Validate the location. Add \"/\" in the case it does not exist.
\n\nArgs:\n location: file path.
\n\nReturns:\n The location validated.
\n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "Utilities for file name based operations.
\n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "Function to retrieve list of file names in a folder.
\n\nThis function filters by file type and removes the file extension from the\nnames it returns.
\n\nArgs:\n path: path to the folder to list files\n file_type: type of the file to include in list\n exclude_regex: regex of file names to exclude
\n\nReturns:\n A list of file names without file type.
\n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "Module to configure project logging.
\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "Logging filter to hide sensitive data from being shown in the logs.
\n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "Hide sensitive information from being shown in the logs.
\n\nBased on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).
\n\nArgs:\n record: the LogRecord event being logged.
\n\nReturns:\n The transformed record to be logged.
\n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "Handle the logging of the lakehouse engine project.
\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "Construct a LoggingHandler instance.
\n\nArgs:\n class_name: name of the class to be indicated in the logs.
\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "Get the _logger instance variable.
\n\nReturns
\n\n\n\n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "the logger object.
\nUtilities to facilitate dataframe schema management.
\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "Schema utils that help retrieve and manage schemas of dataframes.
\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "Get a spark schema from a file (spark StructType json file) in a file system.
\n\nArgs:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema struct type.
\n", "signature": "(file_path: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "Get a dict with the spark schema from a file in a file system.
\n\nArgs:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema in a dict.
\n", "signature": "(file_path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "Get a spark schema from a dict.
\n\nArgs:\n struct_type: dict containing a spark schema structure. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html
\n\nReturns:\n Spark schema struct type.
\n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "Get a spark schema from a table.
\n\nArgs:\n table: table name from which to inherit the schema.
\n\nReturns:\n Spark schema struct type.
\n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "Get a spark schema from an input specification.
\n\nThis covers scenarios where the schema is provided as part of the input\nspecification of the algorithm. The schema can come from the table specified in\nthe input specification (enforce_schema_from_table) or from the dict with the\nspark schema also provided there.
\n\nArgs:\n input_spec: input specification.
\n\nReturns:\n spark schema struct type.
\n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "Recursive method to flatten the schema of the dataframe.
\n\nArgs:\n schema: schema to be flattened.\n prefix: prefix of the struct to get the value for. Only relevant\n for the internal recursive logic.\n level: level of the depth in the schema being flattened. Only relevant\n for the internal recursive logic.\n max_level: level until which you want to flatten the schema. Default: None.\n shorten_names: whether to shorten the names of the prefixes of the fields\n being flattened or not. Default: False.\n alias: whether to define an alias for the columns being flattened or\n not. Default: True.\n num_chars: number of characters to consider when shortening the names of\n the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten. Default: None.
\n\nReturns:\n A function to be called in .transform() spark function.
\n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "Utilities to interact with storage systems.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "Module for abstract representation of a storage system holding files.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "Abstract file storage class.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n File payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "Module for common file storage functions.
\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "Class for common file storage functions.
\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "Read a json file.
\n\nThe file should be in a supported file system (e.g., s3 or local filesystem -\nfor local tests only).
\n\nArgs:\n path: path to the json file.
\n\nReturns:\n Dict with json file content.
\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "Module to represent a local file storage system.
\n"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "Class to represent a local file storage system.
\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n file payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> <class 'TextIO'>:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "Module to represent a s3 file storage system.
\n"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "Class to represent a s3 file storage system.
\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "Get the payload of a config file.
\n\nArgs:\n url: url of the file.
\n\nReturns:\n File payload/content.
\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.
\n\nArgs:\n url: url of the file.\n content: content to write into the file.
\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}]; // mirrored in build-search-index.js (part 1) // Also split on html tags. this is a cheap heuristic, but good enough.