diff --git a/lakehouse_engine/core/exec_env.html b/lakehouse_engine/core/exec_env.html
index 0298aef..a415453 100644
--- a/lakehouse_engine/core/exec_env.html
+++ b/lakehouse_engine/core/exec_env.html
@@ -65,88 +65,95 @@

 1"""Module to take care of creating a singleton of the execution environment class."""
  2import os
  3
- 4from pyspark import SparkConf
- 5from pyspark.sql import SparkSession
- 6
- 7from lakehouse_engine.utils.logging_handler import LoggingHandler
+ 4from pyspark.sql import SparkSession
+ 5
+ 6from lakehouse_engine.utils.logging_handler import LoggingHandler
+ 7
  8
- 9
-10class ExecEnv(object):
-11    """Represents the basic resources regarding the engine execution environment.
-12
-13    Currently, it is solely used to encapsulate the logic to get a Spark session.
-14    """
-15
-16    SESSION: SparkSession
-17    _LOGGER = LoggingHandler(__name__).get_logger()
-18    DEFAULT_AWS_REGION = "eu-west-1"
-19
-20    @classmethod
-21    def get_or_create(
-22        cls,
-23        session: SparkSession = None,
-24        enable_hive_support: bool = True,
-25        app_name: str = None,
-26        config: dict = None,
-27    ) -> None:
-28        """Get or create an execution environment session (currently Spark).
-29
-30        It instantiates a singleton session that can be accessed anywhere from the
-31        lakehouse engine.
-32
-33        Args:
-34            session: spark session.
-35            enable_hive_support: whether to enable hive support or not.
-36            app_name: application name.
-37            config: extra spark configs to supply to the spark session.
-38        """
-39        default_config = {
-40            "spark.databricks.delta.optimizeWrite.enabled": True,
-41            "spark.sql.adaptive.enabled": True,
-42            "spark.databricks.delta.merge.enableLowShuffle": True,
-43        }
-44        cls._LOGGER.info(
-45            f"Using the following default configs you may want to override them for "
-46            f"your job: {default_config}"
-47        )
-48        final_config: dict = {**default_config, **(config if config else {})}
-49        cls._LOGGER.info(f"Final config is: {final_config}")
-50
-51        if session:
-52            cls.SESSION = session
-53        else:
-54            session_builder = SparkSession.builder.appName(app_name)
-55            if config:
-56                session_builder = session_builder.config(
-57                    conf=SparkConf().setAll(final_config.items())  # type: ignore
-58                )
-59            if enable_hive_support:
-60                session_builder = session_builder.enableHiveSupport()
-61            cls.SESSION = session_builder.getOrCreate()
-62
-63            cls._set_environment_variables(final_config.get("os_env_vars"))
-64
-65    @classmethod
-66    def _set_environment_variables(cls, os_env_vars: dict = None) -> None:
-67        """Set environment variables at OS level.
-68
-69        By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
-70        beneficial to avoid getBucketLocation permission problems.
+ 9class ExecEnv(object):
+10    """Represents the basic resources regarding the engine execution environment.
+11
+12    Currently, it is solely used to encapsulate the logic to get a Spark session.
+13    """
+14
+15    SESSION: SparkSession
+16    _LOGGER = LoggingHandler(__name__).get_logger()
+17    DEFAULT_AWS_REGION = "eu-west-1"
+18
+19    @classmethod
+20    def get_or_create(
+21        cls,
+22        session: SparkSession = None,
+23        enable_hive_support: bool = True,
+24        app_name: str = None,
+25        config: dict = None,
+26    ) -> None:
+27        """Get or create an execution environment session (currently Spark).
+28
+29        It instantiates a singleton session that can be accessed anywhere from the
+30        lakehouse engine.
+31
+32        Args:
+33            session: spark session.
+34            enable_hive_support: whether to enable hive support or not.
+35            app_name: application name.
+36            config: extra spark configs to supply to the spark session.
+37        """
+38        default_config = {
+39            "spark.databricks.delta.optimizeWrite.enabled": True,
+40            "spark.sql.adaptive.enabled": True,
+41            "spark.databricks.delta.merge.enableLowShuffle": True,
+42        }
+43        cls._LOGGER.info(
+44            f"Using the following default configs you may want to override them for "
+45            f"your job: {default_config}"
+46        )
+47        final_config: dict = {**default_config, **(config if config else {})}
+48        cls._LOGGER.info(f"Final config is: {final_config}")
+49
+50        if session:
+51            cls.SESSION = session
+52        else:
+53            # with an active session we reuse its app name instead of requiring one
+54            if SparkSession.getActiveSession():
+55                app_name = SparkSession.getActiveSession().sparkContext.appName
+56                cls._LOGGER.info(f"Detected active session: {app_name}")
+57            elif not SparkSession.getActiveSession() and not app_name:
+58                cls._LOGGER.info("No active session or app name detected")
+59                app_name = "lakehouse_engine"
+60            # still go through the builder so that the provided configs are applied
+61            session_builder = SparkSession.builder.appName(app_name)
+62            if config:
+63                for k, v in final_config.items():
+64                    session_builder.config(k, v)
+65
+66            if enable_hive_support:
+67                session_builder = session_builder.enableHiveSupport()
+68            cls.SESSION = session_builder.getOrCreate()
+69
+70            cls._set_environment_variables(final_config.get("os_env_vars"))
 71
-72        Args:
-73            os_env_vars: this parameter can be used to pass the environment variables to
-74            be defined.
-75        """
-76        if os_env_vars is None:
-77            os_env_vars = {}
+72    @classmethod
+73    def _set_environment_variables(cls, os_env_vars: dict = None) -> None:
+74        """Set environment variables at OS level.
+75
+76        By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
+77        beneficial to avoid getBucketLocation permission problems.
 78
-79        for env_var in os_env_vars.items():
-80            os.environ[env_var[0]] = env_var[1]
-81
-82        if "AWS_DEFAULT_REGION" not in os_env_vars:
-83            os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get(
-84                "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION
-85            )
+79        Args:
+80            os_env_vars: this parameter can be used to pass the environment variables to
+81            be defined.
+82        """
+83        if os_env_vars is None:
+84            os_env_vars = {}
+85
+86        for env_var in os_env_vars.items():
+87            os.environ[env_var[0]] = env_var[1]
+88
+89        if "AWS_DEFAULT_REGION" not in os_env_vars:
+90            os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get(
+91                "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION
+92            )
 
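For orientation, here is a minimal sketch of how the refactored ExecEnv.get_or_create is typically called; the config keys and values below are illustrative only, not defaults mandated by the engine:

    from lakehouse_engine.core.exec_env import ExecEnv

    # Reuses an already active Spark session when one exists (e.g. on Databricks);
    # otherwise builds a new session named "lakehouse_engine".
    ExecEnv.get_or_create(
        config={
            "spark.sql.shuffle.partitions": "400",   # illustrative Spark override
            "os_env_vars": {"MY_FLAG": "1"},         # forwarded to os.environ
        }
    )
    spark = ExecEnv.SESSION  # singleton session shared across the engine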
@@ -162,82 +169,90 @@

-11class ExecEnv(object):
-12    """Represents the basic resources regarding the engine execution environment.
-13
-14    Currently, it is solely used to encapsulate the logic to get a Spark session.
-15    """
-16
-17    SESSION: SparkSession
-18    _LOGGER = LoggingHandler(__name__).get_logger()
-19    DEFAULT_AWS_REGION = "eu-west-1"
-20
-21    @classmethod
-22    def get_or_create(
-23        cls,
-24        session: SparkSession = None,
-25        enable_hive_support: bool = True,
-26        app_name: str = None,
-27        config: dict = None,
-28    ) -> None:
-29        """Get or create an execution environment session (currently Spark).
-30
-31        It instantiates a singleton session that can be accessed anywhere from the
-32        lakehouse engine.
-33
-34        Args:
-35            session: spark session.
-36            enable_hive_support: whether to enable hive support or not.
-37            app_name: application name.
-38            config: extra spark configs to supply to the spark session.
-39        """
-40        default_config = {
-41            "spark.databricks.delta.optimizeWrite.enabled": True,
-42            "spark.sql.adaptive.enabled": True,
-43            "spark.databricks.delta.merge.enableLowShuffle": True,
-44        }
-45        cls._LOGGER.info(
-46            f"Using the following default configs you may want to override them for "
-47            f"your job: {default_config}"
-48        )
-49        final_config: dict = {**default_config, **(config if config else {})}
-50        cls._LOGGER.info(f"Final config is: {final_config}")
-51
-52        if session:
-53            cls.SESSION = session
-54        else:
-55            session_builder = SparkSession.builder.appName(app_name)
-56            if config:
-57                session_builder = session_builder.config(
-58                    conf=SparkConf().setAll(final_config.items())  # type: ignore
-59                )
-60            if enable_hive_support:
-61                session_builder = session_builder.enableHiveSupport()
-62            cls.SESSION = session_builder.getOrCreate()
-63
-64            cls._set_environment_variables(final_config.get("os_env_vars"))
-65
-66    @classmethod
-67    def _set_environment_variables(cls, os_env_vars: dict = None) -> None:
-68        """Set environment variables at OS level.
-69
-70        By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
-71        beneficial to avoid getBucketLocation permission problems.
+            10class ExecEnv(object):
+11    """Represents the basic resources regarding the engine execution environment.
+12
+13    Currently, it is solely used to encapsulate the logic to get a Spark session.
+14    """
+15
+16    SESSION: SparkSession
+17    _LOGGER = LoggingHandler(__name__).get_logger()
+18    DEFAULT_AWS_REGION = "eu-west-1"
+19
+20    @classmethod
+21    def get_or_create(
+22        cls,
+23        session: SparkSession = None,
+24        enable_hive_support: bool = True,
+25        app_name: str = None,
+26        config: dict = None,
+27    ) -> None:
+28        """Get or create an execution environment session (currently Spark).
+29
+30        It instantiates a singleton session that can be accessed anywhere from the
+31        lakehouse engine.
+32
+33        Args:
+34            session: spark session.
+35            enable_hive_support: whether to enable hive support or not.
+36            app_name: application name.
+37            config: extra spark configs to supply to the spark session.
+38        """
+39        default_config = {
+40            "spark.databricks.delta.optimizeWrite.enabled": True,
+41            "spark.sql.adaptive.enabled": True,
+42            "spark.databricks.delta.merge.enableLowShuffle": True,
+43        }
+44        cls._LOGGER.info(
+45            f"Using the following default configs you may want to override them for "
+46            f"your job: {default_config}"
+47        )
+48        final_config: dict = {**default_config, **(config if config else {})}
+49        cls._LOGGER.info(f"Final config is: {final_config}")
+50
+51        if session:
+52            cls.SESSION = session
+53        else:
+54            # with an active session we reuse its app name instead of requiring one
+55            if SparkSession.getActiveSession():
+56                app_name = SparkSession.getActiveSession().sparkContext.appName
+57                cls._LOGGER.info(f"Detected active session: {app_name}")
+58            elif not SparkSession.getActiveSession() and not app_name:
+59                cls._LOGGER.info("No active session or app name detected")
+60                app_name = "lakehouse_engine"
+61            # still go through the builder so that the provided configs are applied
+62            session_builder = SparkSession.builder.appName(app_name)
+63            if config:
+64                for k, v in final_config.items():
+65                    session_builder.config(k, v)
+66
+67            if enable_hive_support:
+68                session_builder = session_builder.enableHiveSupport()
+69            cls.SESSION = session_builder.getOrCreate()
+70
+71            cls._set_environment_variables(final_config.get("os_env_vars"))
 72
-73        Args:
-74            os_env_vars: this parameter can be used to pass the environment variables to
-75            be defined.
-76        """
-77        if os_env_vars is None:
-78            os_env_vars = {}
+73    @classmethod
+74    def _set_environment_variables(cls, os_env_vars: dict = None) -> None:
+75        """Set environment variables at OS level.
+76
+77        By default, we are setting the AWS_DEFAULT_REGION as we have identified this is
+78        beneficial to avoid getBucketLocation permission problems.
 79
-80        for env_var in os_env_vars.items():
-81            os.environ[env_var[0]] = env_var[1]
-82
-83        if "AWS_DEFAULT_REGION" not in os_env_vars:
-84            os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get(
-85                "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION
-86            )
+80        Args:
+81            os_env_vars: this parameter can be used to pass the environment variables to
+82            be defined.
+83        """
+84        if os_env_vars is None:
+85            os_env_vars = {}
+86
+87        for env_var in os_env_vars.items():
+88            os.environ[env_var[0]] = env_var[1]
+89
+90        if "AWS_DEFAULT_REGION" not in os_env_vars:
+91            os.environ["AWS_DEFAULT_REGION"] = cls.SESSION.sparkContext.getConf().get(
+92                "spark.databricks.clusterUsageTags.region", cls.DEFAULT_AWS_REGION
+93            )
 
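The behavioral core of this change is that extra configs are now applied key by key on the session builder instead of being bundled into a SparkConf object. A rough before/after sketch, assuming a plain PySpark environment:

    from pyspark.sql import SparkSession

    final_config = {"spark.sql.adaptive.enabled": True}

    # removed approach: wrap everything in a SparkConf
    # from pyspark import SparkConf
    # builder = SparkSession.builder.config(conf=SparkConf().setAll(final_config.items()))

    # added approach: set each key directly on the builder
    builder = SparkSession.builder.appName("lakehouse_engine")
    for key, value in final_config.items():
        builder.config(key, value)
    session = builder.getOrCreate()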
@@ -259,50 +274,58 @@

-21    @classmethod
-22    def get_or_create(
-23        cls,
-24        session: SparkSession = None,
-25        enable_hive_support: bool = True,
-26        app_name: str = None,
-27        config: dict = None,
-28    ) -> None:
-29        """Get or create an execution environment session (currently Spark).
-30
-31        It instantiates a singleton session that can be accessed anywhere from the
-32        lakehouse engine.
-33
-34        Args:
-35            session: spark session.
-36            enable_hive_support: whether to enable hive support or not.
-37            app_name: application name.
-38            config: extra spark configs to supply to the spark session.
-39        """
-40        default_config = {
-41            "spark.databricks.delta.optimizeWrite.enabled": True,
-42            "spark.sql.adaptive.enabled": True,
-43            "spark.databricks.delta.merge.enableLowShuffle": True,
-44        }
-45        cls._LOGGER.info(
-46            f"Using the following default configs you may want to override them for "
-47            f"your job: {default_config}"
-48        )
-49        final_config: dict = {**default_config, **(config if config else {})}
-50        cls._LOGGER.info(f"Final config is: {final_config}")
-51
-52        if session:
-53            cls.SESSION = session
-54        else:
-55            session_builder = SparkSession.builder.appName(app_name)
-56            if config:
-57                session_builder = session_builder.config(
-58                    conf=SparkConf().setAll(final_config.items())  # type: ignore
-59                )
-60            if enable_hive_support:
-61                session_builder = session_builder.enableHiveSupport()
-62            cls.SESSION = session_builder.getOrCreate()
-63
-64            cls._set_environment_variables(final_config.get("os_env_vars"))
+            20    @classmethod
+21    def get_or_create(
+22        cls,
+23        session: SparkSession = None,
+24        enable_hive_support: bool = True,
+25        app_name: str = None,
+26        config: dict = None,
+27    ) -> None:
+28        """Get or create an execution environment session (currently Spark).
+29
+30        It instantiates a singleton session that can be accessed anywhere from the
+31        lakehouse engine.
+32
+33        Args:
+34            session: spark session.
+35            enable_hive_support: whether to enable hive support or not.
+36            app_name: application name.
+37            config: extra spark configs to supply to the spark session.
+38        """
+39        default_config = {
+40            "spark.databricks.delta.optimizeWrite.enabled": True,
+41            "spark.sql.adaptive.enabled": True,
+42            "spark.databricks.delta.merge.enableLowShuffle": True,
+43        }
+44        cls._LOGGER.info(
+45            f"Using the following default configs you may want to override them for "
+46            f"your job: {default_config}"
+47        )
+48        final_config: dict = {**default_config, **(config if config else {})}
+49        cls._LOGGER.info(f"Final config is: {final_config}")
+50
+51        if session:
+52            cls.SESSION = session
+53        else:
+54            # with an active session we reuse its app name instead of requiring one
+55            if SparkSession.getActiveSession():
+56                app_name = SparkSession.getActiveSession().sparkContext.appName
+57                cls._LOGGER.info(f"Detected active session: {app_name}")
+58            elif not SparkSession.getActiveSession() and not app_name:
+59                cls._LOGGER.info("No active session or app name detected")
+60                app_name = "lakehouse_engine"
+61            # still go through the builder so that the provided configs are applied
+62            session_builder = SparkSession.builder.appName(app_name)
+63            if config:
+64                for k, v in final_config.items():
+65                    session_builder.config(k, v)
+66
+67            if enable_hive_support:
+68                session_builder = session_builder.enableHiveSupport()
+69            cls.SESSION = session_builder.getOrCreate()
+70
+71            cls._set_environment_variables(final_config.get("os_env_vars"))
 
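_set_environment_variables is functionally unchanged in this diff and resolves AWS_DEFAULT_REGION with a clear precedence: an explicit value in os_env_vars wins, then the Databricks cluster region tag, then the eu-west-1 default. A small sketch of passing the variables through get_or_create; the region value is an example only:

    import os

    from lakehouse_engine.core.exec_env import ExecEnv

    ExecEnv.get_or_create(
        config={"os_env_vars": {"AWS_DEFAULT_REGION": "eu-central-1"}}  # example value
    )
    # Without the explicit entry above, the value of
    # spark.databricks.clusterUsageTags.region is used, falling back to "eu-west-1".
    print(os.environ["AWS_DEFAULT_REGION"])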
diff --git a/lakehouse_engine/core/file_manager.html b/lakehouse_engine/core/file_manager.html
index 34af7db..8ae2cd2 100644
--- a/lakehouse_engine/core/file_manager.html
+++ b/lakehouse_engine/core/file_manager.html
@@ -97,7 +97,7 @@

  1"""File manager module."""
   2import time
-  3from typing import Any, Optional
+  3from typing import Any, Optional, Tuple
   4
   5import boto3
   6
@@ -111,516 +111,609 @@ 

14from lakehouse_engine.utils.logging_handler import LoggingHandler 15 16 - 17def _dry_run(bucket: str, object_paths: list) -> dict: - 18 """Build the dry run request return format. + 17def _process_directory_path(path: str) -> str: + 18 """Add '/' to the end of the path of a directory. 19 20 Args: - 21 bucket: name of bucket to perform operation. - 22 object_paths: paths of object to list. - 23 - 24 Returns: - 25 A dict with a list of objects that would be copied/deleted. - 26 """ - 27 response = {} + 21 path: directory to be processed + 22 + 23 Returns: + 24 Directory path stripped and with '/' at the end. + 25 """ + 26 path = path.strip() + 27 return path if path[-1] == "/" else path + "/" 28 - 29 for path in object_paths: - 30 path = path.strip() - 31 res = _list_objects_recursively(bucket=bucket, path=path) + 29 + 30def _dry_run(bucket: str, object_paths: list) -> dict: + 31 """Build the dry run request return format. 32 - 33 if res: - 34 response[path] = res - 35 else: - 36 response[path] = ["No such key"] - 37 - 38 return response - 39 - 40 - 41def _list_objects_recursively(bucket: str, path: str) -> list: - 42 """Recursively list all objects given a prefix in s3. - 43 - 44 Args: - 45 bucket: name of bucket to perform the list. - 46 path: path to be used as a prefix. + 33 Args: + 34 bucket: name of bucket to perform operation. + 35 object_paths: paths of object to list. + 36 + 37 Returns: + 38 A dict with a list of objects that would be copied/deleted. + 39 """ + 40 response = {} + 41 + 42 for path in object_paths: + 43 if _check_directory(bucket, path): + 44 path = _process_directory_path(path) + 45 + 46 res = _list_objects_recursively(bucket=bucket, path=path) 47 - 48 Returns: - 49 A list of object names fetched recursively. - 50 """ - 51 object_list = [] - 52 more_objects = True - 53 pagination = "" + 48 if res: + 49 response[path] = res + 50 else: + 51 response[path] = ["No such key"] + 52 + 53 return response 54 - 55 s3 = boto3.client("s3") - 56 - 57 while more_objects: - 58 if not pagination: - 59 list_response = s3.list_objects_v2(Bucket=bucket, Prefix=path) - 60 else: - 61 list_response = s3.list_objects_v2( - 62 Bucket=bucket, - 63 Prefix=path, - 64 ContinuationToken=pagination, - 65 ) - 66 - 67 if FileManagerAPIKeys.CONTENTS.value in list_response: - 68 for obj in list_response[FileManagerAPIKeys.CONTENTS.value]: - 69 object_list.append(obj[FileManagerAPIKeys.KEY.value]) + 55 + 56def _list_objects( + 57 s3_client: Any, bucket: str, path: str, paginator: str = "" + 58) -> Tuple[list, str]: + 59 """List 1000 objects in a bucket given a prefix and paginator in s3. + 60 + 61 Args: + 62 bucket: name of bucket to perform the list. + 63 path: path to be used as a prefix. + 64 paginator: paginator token to be used. + 65 + 66 Returns: + 67 A list of object names. 
+ 68 """ + 69 object_list = [] 70 - 71 if FileManagerAPIKeys.CONTINUATION.value in list_response: - 72 pagination = list_response[FileManagerAPIKeys.CONTINUATION.value] - 73 else: - 74 more_objects = False - 75 - 76 return object_list - 77 - 78 - 79class FileManager(object): - 80 """Set of actions to manipulate files in several ways.""" - 81 - 82 _logger = LoggingHandler(__name__).get_logger() + 71 if not paginator: + 72 list_response = s3_client.list_objects_v2(Bucket=bucket, Prefix=path) + 73 else: + 74 list_response = s3_client.list_objects_v2( + 75 Bucket=bucket, + 76 Prefix=path, + 77 ContinuationToken=paginator, + 78 ) + 79 + 80 if FileManagerAPIKeys.CONTENTS.value in list_response: + 81 for obj in list_response[FileManagerAPIKeys.CONTENTS.value]: + 82 object_list.append(obj[FileManagerAPIKeys.KEY.value]) 83 - 84 def __init__(self, configs: dict): - 85 """Construct FileManager algorithm instances. - 86 - 87 Args: - 88 configs: configurations for the FileManager algorithm. - 89 """ - 90 self.configs = configs - 91 self.function = self.configs["function"] - 92 - 93 def get_function(self) -> None: - 94 """Get a specific function to execute.""" - 95 available_functions = { - 96 "delete_objects": self.delete_objects, - 97 "copy_objects": self.copy_objects, - 98 "request_restore": self.request_restore, - 99 "check_restore_status": self.check_restore_status, -100 "request_restore_to_destination_and_wait": ( -101 self.request_restore_to_destination_and_wait -102 ), -103 } -104 -105 self._logger.info("Function being executed: {}".format(self.function)) -106 if self.function in available_functions.keys(): -107 func = available_functions[self.function] -108 func() -109 else: -110 raise NotImplementedError( -111 f"The requested function {self.function} is not implemented." -112 ) -113 -114 def delete_objects(self) -> None: -115 """Delete objects and 'directories' in s3. -116 -117 If dry_run is set to True the function will print a dict with all the -118 paths that would be deleted based on the given keys. -119 """ -120 bucket = self.configs["bucket"] -121 objects_paths = self.configs["object_paths"] -122 dry_run = self.configs["dry_run"] -123 -124 s3 = boto3.client("s3") + 84 if FileManagerAPIKeys.CONTINUATION.value in list_response: + 85 pagination = list_response[FileManagerAPIKeys.CONTINUATION.value] + 86 else: + 87 pagination = "" + 88 + 89 return object_list, pagination + 90 + 91 + 92def _list_objects_recursively(bucket: str, path: str) -> list: + 93 """Recursively list all objects given a prefix in s3. + 94 + 95 Args: + 96 bucket: name of bucket to perform the list. + 97 path: path to be used as a prefix. + 98 + 99 Returns: +100 A list of object names fetched recursively. +101 """ +102 object_list = [] +103 more_objects = True +104 paginator = "" +105 +106 s3 = boto3.client("s3") +107 +108 while more_objects: +109 temp_list, paginator = _list_objects(s3, bucket, path, paginator) +110 +111 object_list.extend(temp_list) +112 +113 if not paginator: +114 more_objects = False +115 +116 return object_list +117 +118 +119def _check_directory(bucket: str, path: str) -> bool: +120 """Checks if the object is a 'directory' in s3. +121 +122 Args: +123 bucket: name of bucket to perform the check. +124 path: path to be used as a prefix. 
125 -126 if dry_run: -127 response = _dry_run(bucket=bucket, object_paths=objects_paths) -128 -129 self._logger.info("Paths that would be deleted:") -130 else: -131 objects_to_delete = [] -132 for path in objects_paths: -133 for obj in _list_objects_recursively(bucket=bucket, path=path): -134 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) -135 -136 response = s3.delete_objects( -137 Bucket=bucket, -138 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, -139 ) -140 -141 self._logger.info(response) -142 -143 def copy_objects(self) -> None: -144 """Copies objects and 'directories' in s3.""" -145 source_bucket = self.configs["bucket"] -146 source_object = self.configs["source_object"] -147 destination_bucket = self.configs["destination_bucket"] -148 destination_object = self.configs["destination_object"] -149 dry_run = self.configs["dry_run"] -150 -151 FileManager._copy_objects( -152 source_bucket=source_bucket, -153 source_object=source_object, -154 destination_bucket=destination_bucket, -155 destination_object=destination_object, -156 dry_run=dry_run, -157 ) -158 -159 def request_restore(self) -> None: -160 """Request the restore of archived data.""" -161 source_bucket = self.configs["bucket"] -162 source_object = self.configs["source_object"] -163 restore_expiration = self.configs["restore_expiration"] -164 retrieval_tier = self.configs["retrieval_tier"] -165 dry_run = self.configs["dry_run"] -166 -167 ArchiveFileManager.request_restore( -168 source_bucket, -169 source_object, -170 restore_expiration, -171 retrieval_tier, -172 dry_run, -173 ) -174 -175 def check_restore_status(self) -> None: -176 """Check the restore status of archived data.""" -177 source_bucket = self.configs["bucket"] -178 source_object = self.configs["source_object"] -179 -180 restore_status = ArchiveFileManager.check_restore_status( -181 source_bucket, source_object -182 ) +126 Returns: +127 If path represents a 'directory'. +128 """ +129 s3 = boto3.client("s3") +130 objects, _ = _list_objects(s3, bucket, path) +131 return len(objects) > 1 +132 +133 +134class FileManager(object): +135 """Set of actions to manipulate files in several ways.""" +136 +137 _logger = LoggingHandler(__name__).get_logger() +138 +139 def __init__(self, configs: dict): +140 """Construct FileManager algorithm instances. +141 +142 Args: +143 configs: configurations for the FileManager algorithm. +144 """ +145 self.configs = configs +146 self.function = self.configs["function"] +147 +148 def get_function(self) -> None: +149 """Get a specific function to execute.""" +150 available_functions = { +151 "delete_objects": self.delete_objects, +152 "copy_objects": self.copy_objects, +153 "request_restore": self.request_restore, +154 "check_restore_status": self.check_restore_status, +155 "request_restore_to_destination_and_wait": ( +156 self.request_restore_to_destination_and_wait +157 ), +158 } +159 +160 self._logger.info("Function being executed: {}".format(self.function)) +161 if self.function in available_functions.keys(): +162 func = available_functions[self.function] +163 func() +164 else: +165 raise NotImplementedError( +166 f"The requested function {self.function} is not implemented." +167 ) +168 +169 def _delete_objects(self, bucket: str, objects_paths: list) -> None: +170 """Delete objects recursively in s3. +171 +172 Params: +173 bucket: name of bucket to perform the delete operation. +174 objects_paths: objects to be deleted. 
+175 """ +176 s3 = boto3.client("s3") +177 +178 for path in objects_paths: +179 if _check_directory(bucket, path): +180 path = _process_directory_path(path) +181 else: +182 path = path.strip() 183 -184 self._logger.info( -185 f""" -186 Restore status: -187 - Not Started: {restore_status.get('not_started_objects')} -188 - Ongoing: {restore_status.get('ongoing_objects')} -189 - Restored: {restore_status.get('restored_objects')} -190 Total objects in this restore process: {restore_status.get('total_objects')} -191 """ -192 ) -193 -194 def request_restore_to_destination_and_wait(self) -> None: -195 """Request and wait for the restore to complete, polling the restore status. -196 -197 After the restore is done, copy the restored files to destination -198 """ -199 source_bucket = self.configs["bucket"] -200 source_object = self.configs["source_object"] -201 destination_bucket = self.configs["destination_bucket"] -202 destination_object = self.configs["destination_object"] -203 restore_expiration = self.configs["restore_expiration"] -204 retrieval_tier = self.configs["retrieval_tier"] -205 dry_run = self.configs["dry_run"] -206 -207 ArchiveFileManager.request_restore_and_wait( -208 source_bucket=source_bucket, -209 source_object=source_object, -210 restore_expiration=restore_expiration, -211 retrieval_tier=retrieval_tier, -212 dry_run=dry_run, -213 ) +184 more_objects = True +185 paginator = "" +186 objects_to_delete = [] +187 +188 while more_objects: +189 objects_found, paginator = _list_objects( +190 s3_client=s3, bucket=bucket, path=path, paginator=paginator +191 ) +192 for obj in objects_found: +193 objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj}) +194 +195 if not paginator: +196 more_objects = False +197 +198 response = s3.delete_objects( +199 Bucket=bucket, +200 Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete}, +201 ) +202 self._logger.info(response) +203 objects_to_delete = [] +204 +205 def delete_objects(self) -> None: +206 """Delete objects and 'directories' in s3. +207 +208 If dry_run is set to True the function will print a dict with all the +209 paths that would be deleted based on the given keys. +210 """ +211 bucket = self.configs["bucket"] +212 objects_paths = self.configs["object_paths"] +213 dry_run = self.configs["dry_run"] 214 -215 FileManager._logger.info( -216 f"Restoration complete for {source_bucket} and {source_object}" -217 ) -218 FileManager._logger.info( -219 f"Starting to copy data from {source_bucket}/{source_object} to " -220 f"{destination_bucket}/{destination_object}" -221 ) -222 FileManager._copy_objects( -223 source_bucket=source_bucket, -224 source_object=source_object, -225 destination_bucket=destination_bucket, -226 destination_object=destination_object, -227 dry_run=dry_run, -228 ) -229 FileManager._logger.info( -230 f"Finished copying data, data should be available on {destination_bucket}/" -231 f"{destination_object}" -232 ) -233 -234 @staticmethod -235 def _copy_objects( -236 source_bucket: str, -237 source_object: str, -238 destination_bucket: str, -239 destination_object: str, -240 dry_run: bool, -241 ) -> None: -242 """Copies objects and 'directories' in s3. -243 -244 Args: -245 source_bucket: name of bucket to perform the copy. -246 source_object: object/folder to be copied. -247 destination_bucket: name of the target bucket to copy. -248 destination_object: target object/folder to copy. -249 dry_run: if dry_run is set to True the function will print a dict with -250 all the paths that would be deleted based on the given keys. 
-251 """ -252 s3 = boto3.client("s3") -253 -254 if dry_run: -255 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -256 -257 FileManager._logger.info("Paths that would be copied:") -258 FileManager._logger.info(response) -259 else: -260 copy_object = _list_objects_recursively( -261 bucket=source_bucket, path=source_object -262 ) +215 if dry_run: +216 response = _dry_run(bucket=bucket, object_paths=objects_paths) +217 +218 self._logger.info("Paths that would be deleted:") +219 self._logger.info(response) +220 else: +221 self._delete_objects(bucket, objects_paths) +222 +223 def copy_objects(self) -> None: +224 """Copies objects and 'directories' in s3.""" +225 source_bucket = self.configs["bucket"] +226 source_object = self.configs["source_object"] +227 destination_bucket = self.configs["destination_bucket"] +228 destination_object = self.configs["destination_object"] +229 dry_run = self.configs["dry_run"] +230 +231 FileManager._copy_objects( +232 source_bucket=source_bucket, +233 source_object=source_object, +234 destination_bucket=destination_bucket, +235 destination_object=destination_object, +236 dry_run=dry_run, +237 ) +238 +239 def request_restore(self) -> None: +240 """Request the restore of archived data.""" +241 source_bucket = self.configs["bucket"] +242 source_object = self.configs["source_object"] +243 restore_expiration = self.configs["restore_expiration"] +244 retrieval_tier = self.configs["retrieval_tier"] +245 dry_run = self.configs["dry_run"] +246 +247 ArchiveFileManager.request_restore( +248 source_bucket, +249 source_object, +250 restore_expiration, +251 retrieval_tier, +252 dry_run, +253 ) +254 +255 def check_restore_status(self) -> None: +256 """Check the restore status of archived data.""" +257 source_bucket = self.configs["bucket"] +258 source_object = self.configs["source_object"] +259 +260 restore_status = ArchiveFileManager.check_restore_status( +261 source_bucket, source_object +262 ) 263 -264 if len(copy_object) == 1: -265 FileManager._logger.info(f"Copying obj: {source_object}") -266 -267 response = s3.copy_object( -268 Bucket=destination_bucket, -269 CopySource={ -270 FileManagerAPIKeys.BUCKET.value: source_bucket, -271 FileManagerAPIKeys.KEY.value: source_object, -272 }, -273 Key=f"""{destination_object}/{copy_object[0].split("/")[-1]}""", -274 ) -275 FileManager._logger.info(response) -276 else: -277 for obj in copy_object: -278 FileManager._logger.info(f"Copying obj: {obj}") -279 -280 final_path = obj.replace(source_object, "") -281 -282 response = s3.copy_object( -283 Bucket=destination_bucket, -284 CopySource={ -285 FileManagerAPIKeys.BUCKET.value: source_bucket, -286 FileManagerAPIKeys.KEY.value: obj, -287 }, -288 Key=f"{destination_object}{final_path}", -289 ) -290 FileManager._logger.info(response) -291 -292 -293class ArchiveFileManager(object): -294 """Set of actions to restore archives.""" -295 -296 _logger = LoggingHandler(__name__).get_logger() -297 -298 @staticmethod -299 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: -300 """Get the archived object if it's an object. -301 -302 Args: -303 bucket: name of bucket to check get the object. -304 object_key: object to get. -305 -306 Returns: -307 S3 Object if it's an archived object, otherwise None. 
-308 """ -309 s3 = boto3.resource("s3") -310 object_to_restore = s3.Object(bucket, object_key) -311 -312 if ( -313 object_to_restore.storage_class is not None -314 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS -315 ): -316 return object_to_restore -317 else: -318 return None -319 -320 @staticmethod -321 def _check_object_restore_status( -322 bucket: str, object_key: str -323 ) -> Optional[RestoreStatus]: -324 """Check the restore status of the archive. -325 -326 Args: -327 bucket: name of bucket to check the restore status. -328 object_key: object to check the restore status. -329 -330 Returns: -331 The restore status represented by an enum, possible values are: -332 NOT_STARTED, ONGOING or RESTORED -333 """ -334 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -335 -336 if archived_object is None: -337 status = None -338 elif archived_object.restore is None: -339 status = RestoreStatus.NOT_STARTED -340 elif 'ongoing-request="true"' in archived_object.restore: -341 status = RestoreStatus.ONGOING -342 else: -343 status = RestoreStatus.RESTORED +264 self._logger.info( +265 f""" +266 Restore status: +267 - Not Started: {restore_status.get('not_started_objects')} +268 - Ongoing: {restore_status.get('ongoing_objects')} +269 - Restored: {restore_status.get('restored_objects')} +270 Total objects in this restore process: {restore_status.get('total_objects')} +271 """ +272 ) +273 +274 def request_restore_to_destination_and_wait(self) -> None: +275 """Request and wait for the restore to complete, polling the restore status. +276 +277 After the restore is done, copy the restored files to destination +278 """ +279 source_bucket = self.configs["bucket"] +280 source_object = self.configs["source_object"] +281 destination_bucket = self.configs["destination_bucket"] +282 destination_object = self.configs["destination_object"] +283 restore_expiration = self.configs["restore_expiration"] +284 retrieval_tier = self.configs["retrieval_tier"] +285 dry_run = self.configs["dry_run"] +286 +287 ArchiveFileManager.request_restore_and_wait( +288 source_bucket=source_bucket, +289 source_object=source_object, +290 restore_expiration=restore_expiration, +291 retrieval_tier=retrieval_tier, +292 dry_run=dry_run, +293 ) +294 +295 FileManager._logger.info( +296 f"Restoration complete for {source_bucket} and {source_object}" +297 ) +298 FileManager._logger.info( +299 f"Starting to copy data from {source_bucket}/{source_object} to " +300 f"{destination_bucket}/{destination_object}" +301 ) +302 FileManager._copy_objects( +303 source_bucket=source_bucket, +304 source_object=source_object, +305 destination_bucket=destination_bucket, +306 destination_object=destination_object, +307 dry_run=dry_run, +308 ) +309 FileManager._logger.info( +310 f"Finished copying data, data should be available on {destination_bucket}/" +311 f"{destination_object}" +312 ) +313 +314 @staticmethod +315 def _copy_objects( +316 source_bucket: str, +317 source_object: str, +318 destination_bucket: str, +319 destination_object: str, +320 dry_run: bool, +321 ) -> None: +322 """Copies objects and 'directories' in s3. +323 +324 Args: +325 source_bucket: name of bucket to perform the copy. +326 source_object: object/folder to be copied. +327 destination_bucket: name of the target bucket to copy. +328 destination_object: target object/folder to copy. +329 dry_run: if dry_run is set to True the function will print a dict with +330 all the paths that would be deleted based on the given keys. 
+331 """ +332 s3 = boto3.client("s3") +333 +334 if dry_run: +335 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +336 +337 FileManager._logger.info("Paths that would be copied:") +338 FileManager._logger.info(response) +339 else: +340 original_object_name = source_object.split("/")[-1] +341 +342 if _check_directory(source_bucket, source_object): +343 source_object = _process_directory_path(source_object) 344 -345 return status -346 -347 @staticmethod -348 def check_restore_status(source_bucket: str, source_object: str) -> dict: -349 """Check the restore status of archived data. -350 -351 Args: -352 source_bucket: name of bucket to check the restore status. -353 source_object: object to check the restore status. -354 -355 Returns: -356 A dict containing the amount of objects in each status. -357 """ -358 not_started_objects = 0 -359 ongoing_objects = 0 -360 restored_objects = 0 -361 total_objects = 0 -362 -363 objects_to_restore = _list_objects_recursively( -364 bucket=source_bucket, path=source_object -365 ) -366 -367 for obj in objects_to_restore: -368 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") -369 -370 restore_status = ArchiveFileManager._check_object_restore_status( -371 source_bucket, obj -372 ) -373 if not restore_status: -374 ArchiveFileManager._logger.warning( -375 f"Restore status not found for {source_bucket}/{obj}" -376 ) -377 else: -378 total_objects += 1 +345 copy_object = _list_objects_recursively( +346 bucket=source_bucket, path=source_object +347 ) +348 +349 for obj in copy_object: +350 FileManager._logger.info(f"Copying obj: {obj}") +351 +352 final_path = obj.replace(source_object, "") +353 +354 response = s3.copy_object( +355 Bucket=destination_bucket, +356 CopySource={ +357 FileManagerAPIKeys.BUCKET.value: source_bucket, +358 FileManagerAPIKeys.KEY.value: obj, +359 }, +360 Key=f"{destination_object}/{original_object_name}/{final_path}", +361 ) +362 FileManager._logger.info(response) +363 else: +364 FileManager._logger.info(f"Copying obj: {source_object}") +365 +366 response = s3.copy_object( +367 Bucket=destination_bucket, +368 CopySource={ +369 FileManagerAPIKeys.BUCKET.value: source_bucket, +370 FileManagerAPIKeys.KEY.value: source_object, +371 }, +372 Key=f"""{destination_object}/{original_object_name}""", +373 ) +374 FileManager._logger.info(response) +375 +376 +377class ArchiveFileManager(object): +378 """Set of actions to restore archives.""" 379 -380 if RestoreStatus.NOT_STARTED == restore_status: -381 not_started_objects += 1 -382 elif RestoreStatus.ONGOING == restore_status: -383 ongoing_objects += 1 -384 else: -385 restored_objects += 1 -386 -387 ArchiveFileManager._logger.info( -388 f"{obj} restore status is {restore_status.value}" -389 ) -390 -391 return { -392 "total_objects": total_objects, -393 "not_started_objects": not_started_objects, -394 "ongoing_objects": ongoing_objects, -395 "restored_objects": restored_objects, -396 } -397 -398 @staticmethod -399 def _request_restore_object( -400 bucket: str, object_key: str, expiration: int, retrieval_tier: str -401 ) -> None: -402 """Request a restore of the archive. +380 _logger = LoggingHandler(__name__).get_logger() +381 +382 @staticmethod +383 def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]: +384 """Get the archived object if it's an object. +385 +386 Args: +387 bucket: name of bucket to check get the object. +388 object_key: object to get. +389 +390 Returns: +391 S3 Object if it's an archived object, otherwise None. 
+392 """ +393 s3 = boto3.resource("s3") +394 object_to_restore = s3.Object(bucket, object_key) +395 +396 if ( +397 object_to_restore.storage_class is not None +398 and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS +399 ): +400 return object_to_restore +401 else: +402 return None 403 -404 Args: -405 bucket: name of bucket to perform the restore. -406 object_key: object to be restored. -407 expiration: restore expiration. -408 retrieval_tier: type of restore, possible values are: -409 Bulk, Standard or Expedited. -410 """ -411 if not RestoreType.exists(retrieval_tier): -412 raise RestoreTypeNotFoundException( -413 f"Restore type {retrieval_tier} not supported." -414 ) -415 -416 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) -417 -418 if archived_object and archived_object.restore is None: -419 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") -420 archived_object.restore_object( -421 RestoreRequest={ -422 "Days": expiration, -423 "GlacierJobParameters": {"Tier": retrieval_tier}, -424 } -425 ) +404 @staticmethod +405 def _check_object_restore_status( +406 bucket: str, object_key: str +407 ) -> Optional[RestoreStatus]: +408 """Check the restore status of the archive. +409 +410 Args: +411 bucket: name of bucket to check the restore status. +412 object_key: object to check the restore status. +413 +414 Returns: +415 The restore status represented by an enum, possible values are: +416 NOT_STARTED, ONGOING or RESTORED +417 """ +418 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +419 +420 if archived_object is None: +421 status = None +422 elif archived_object.restore is None: +423 status = RestoreStatus.NOT_STARTED +424 elif 'ongoing-request="true"' in archived_object.restore: +425 status = RestoreStatus.ONGOING 426 else: -427 ArchiveFileManager._logger.info( -428 f"Restore request for {bucket}/{object_key} not performed." -429 ) +427 status = RestoreStatus.RESTORED +428 +429 return status 430 431 @staticmethod -432 def request_restore( -433 source_bucket: str, -434 source_object: str, -435 restore_expiration: int, -436 retrieval_tier: str, -437 dry_run: bool, -438 ) -> None: -439 """Request the restore of archived data. -440 -441 Args: -442 source_bucket: name of bucket to perform the restore. -443 source_object: object to be restored. -444 restore_expiration: restore expiration in days. -445 retrieval_tier: type of restore, possible values are: -446 Bulk, Standard or Expedited. -447 dry_run: if dry_run is set to True the function will print a dict with -448 all the paths that would be deleted based on the given keys. -449 """ -450 if dry_run: -451 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) -452 -453 ArchiveFileManager._logger.info("Paths that would be restored:") -454 ArchiveFileManager._logger.info(response) -455 else: -456 objects_to_restore = _list_objects_recursively( -457 bucket=source_bucket, path=source_object -458 ) -459 -460 for obj in objects_to_restore: -461 ArchiveFileManager._request_restore_object( -462 source_bucket, -463 obj, -464 restore_expiration, -465 retrieval_tier, -466 ) -467 -468 @staticmethod -469 def request_restore_and_wait( -470 source_bucket: str, -471 source_object: str, -472 restore_expiration: int, -473 retrieval_tier: str, -474 dry_run: bool, -475 ) -> None: -476 """Request and wait for the restore to complete, polling the restore status. 
+432 def check_restore_status(source_bucket: str, source_object: str) -> dict: +433 """Check the restore status of archived data. +434 +435 Args: +436 source_bucket: name of bucket to check the restore status. +437 source_object: object to check the restore status. +438 +439 Returns: +440 A dict containing the amount of objects in each status. +441 """ +442 not_started_objects = 0 +443 ongoing_objects = 0 +444 restored_objects = 0 +445 total_objects = 0 +446 +447 if _check_directory(source_bucket, source_object): +448 source_object = _process_directory_path(source_object) +449 +450 objects_to_restore = _list_objects_recursively( +451 bucket=source_bucket, path=source_object +452 ) +453 +454 for obj in objects_to_restore: +455 ArchiveFileManager._logger.info(f"Checking restore status for: {obj}") +456 +457 restore_status = ArchiveFileManager._check_object_restore_status( +458 source_bucket, obj +459 ) +460 if not restore_status: +461 ArchiveFileManager._logger.warning( +462 f"Restore status not found for {source_bucket}/{obj}" +463 ) +464 else: +465 total_objects += 1 +466 +467 if RestoreStatus.NOT_STARTED == restore_status: +468 not_started_objects += 1 +469 elif RestoreStatus.ONGOING == restore_status: +470 ongoing_objects += 1 +471 else: +472 restored_objects += 1 +473 +474 ArchiveFileManager._logger.info( +475 f"{obj} restore status is {restore_status.value}" +476 ) 477 -478 Args: -479 source_bucket: name of bucket to perform the restore. -480 source_object: object to be restored. -481 restore_expiration: restore expiration in days. -482 retrieval_tier: type of restore, possible values are: -483 Bulk, Standard or Expedited. -484 dry_run: if dry_run is set to True the function will print a dict with -485 all the paths that would be deleted based on the given keys. -486 """ -487 if retrieval_tier != RestoreType.EXPEDITED.value: -488 ArchiveFileManager._logger.error( -489 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -490 "kind of restore should be used just with `Expedited` retrieval tier " -491 "to save cluster costs." -492 ) -493 raise ValueError( -494 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " -495 "kind of restore should be used just with `Expedited` retrieval tier " -496 "to save cluster costs." -497 ) -498 -499 ArchiveFileManager.request_restore( -500 source_bucket=source_bucket, -501 source_object=source_object, -502 restore_expiration=restore_expiration, -503 retrieval_tier=retrieval_tier, -504 dry_run=dry_run, -505 ) -506 restore_status = ArchiveFileManager.check_restore_status( -507 source_bucket, source_object -508 ) -509 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") -510 -511 if not dry_run: -512 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") -513 wait_time = 300 -514 while restore_status.get("total_objects") > restore_status.get( -515 "restored_objects" -516 ): -517 ArchiveFileManager._logger.info( -518 "Not all objects have been restored yet, checking the status again " -519 f"in {wait_time} seconds." 
-520 ) -521 time.sleep(wait_time) -522 wait_time = 30 -523 restore_status = ArchiveFileManager.check_restore_status( -524 source_bucket, source_object -525 ) -526 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +478 return { +479 "total_objects": total_objects, +480 "not_started_objects": not_started_objects, +481 "ongoing_objects": ongoing_objects, +482 "restored_objects": restored_objects, +483 } +484 +485 @staticmethod +486 def _request_restore_object( +487 bucket: str, object_key: str, expiration: int, retrieval_tier: str +488 ) -> None: +489 """Request a restore of the archive. +490 +491 Args: +492 bucket: name of bucket to perform the restore. +493 object_key: object to be restored. +494 expiration: restore expiration. +495 retrieval_tier: type of restore, possible values are: +496 Bulk, Standard or Expedited. +497 """ +498 if not RestoreType.exists(retrieval_tier): +499 raise RestoreTypeNotFoundException( +500 f"Restore type {retrieval_tier} not supported." +501 ) +502 +503 if _check_directory(bucket, object_key): +504 object_key = _process_directory_path(object_key) +505 +506 archived_object = ArchiveFileManager._get_archived_object(bucket, object_key) +507 +508 if archived_object and archived_object.restore is None: +509 ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.") +510 archived_object.restore_object( +511 RestoreRequest={ +512 "Days": expiration, +513 "GlacierJobParameters": {"Tier": retrieval_tier}, +514 } +515 ) +516 else: +517 ArchiveFileManager._logger.info( +518 f"Restore request for {bucket}/{object_key} not performed." +519 ) +520 +521 @staticmethod +522 def request_restore( +523 source_bucket: str, +524 source_object: str, +525 restore_expiration: int, +526 retrieval_tier: str, +527 dry_run: bool, +528 ) -> None: +529 """Request the restore of archived data. +530 +531 Args: +532 source_bucket: name of bucket to perform the restore. +533 source_object: object to be restored. +534 restore_expiration: restore expiration in days. +535 retrieval_tier: type of restore, possible values are: +536 Bulk, Standard or Expedited. +537 dry_run: if dry_run is set to True the function will print a dict with +538 all the paths that would be deleted based on the given keys. +539 """ +540 if _check_directory(source_bucket, source_object): +541 source_object = _process_directory_path(source_object) +542 +543 if dry_run: +544 response = _dry_run(bucket=source_bucket, object_paths=[source_object]) +545 +546 ArchiveFileManager._logger.info("Paths that would be restored:") +547 ArchiveFileManager._logger.info(response) +548 else: +549 objects_to_restore = _list_objects_recursively( +550 bucket=source_bucket, path=source_object +551 ) +552 +553 for obj in objects_to_restore: +554 ArchiveFileManager._request_restore_object( +555 source_bucket, +556 obj, +557 restore_expiration, +558 retrieval_tier, +559 ) +560 +561 @staticmethod +562 def request_restore_and_wait( +563 source_bucket: str, +564 source_object: str, +565 restore_expiration: int, +566 retrieval_tier: str, +567 dry_run: bool, +568 ) -> None: +569 """Request and wait for the restore to complete, polling the restore status. +570 +571 Args: +572 source_bucket: name of bucket to perform the restore. +573 source_object: object to be restored. +574 restore_expiration: restore expiration in days. +575 retrieval_tier: type of restore, possible values are: +576 Bulk, Standard or Expedited. 
+577 dry_run: if dry_run is set to True the function will print a dict with +578 all the paths that would be deleted based on the given keys. +579 """ +580 if retrieval_tier != RestoreType.EXPEDITED.value: +581 ArchiveFileManager._logger.error( +582 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +583 "kind of restore should be used just with `Expedited` retrieval tier " +584 "to save cluster costs." +585 ) +586 raise ValueError( +587 f"Retrieval Tier {retrieval_tier} not allowed on this operation! This " +588 "kind of restore should be used just with `Expedited` retrieval tier " +589 "to save cluster costs." +590 ) +591 +592 ArchiveFileManager.request_restore( +593 source_bucket=source_bucket, +594 source_object=source_object, +595 restore_expiration=restore_expiration, +596 retrieval_tier=retrieval_tier, +597 dry_run=dry_run, +598 ) +599 restore_status = ArchiveFileManager.check_restore_status( +600 source_bucket, source_object +601 ) +602 ArchiveFileManager._logger.info(f"Restore status: {restore_status}") +603 +604 if not dry_run: +605 ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.") +606 wait_time = 300 +607 while restore_status.get("total_objects") > restore_status.get( +608 "restored_objects" +609 ): +610 ArchiveFileManager._logger.info( +611 "Not all objects have been restored yet, checking the status again " +612 f"in {wait_time} seconds." +613 ) +614 time.sleep(wait_time) +615 wait_time = 30 +616 restore_status = ArchiveFileManager.check_restore_status( +617 source_bucket, source_object +618 ) +619 ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
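Among the helpers introduced above, _process_directory_path only normalizes a prefix (strip plus a trailing slash) so that recursive listings, copies, and deletes stay scoped to the intended 'directory'. A quick illustration of the intended behavior:

    from lakehouse_engine.core.file_manager import _process_directory_path

    _process_directory_path("  raw/sales ")   # -> "raw/sales/"
    _process_directory_path("raw/sales/")     # -> "raw/sales/" (already normalized)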

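The FileManager entry point itself keeps the same shape: callers pass a configuration dict and dispatch through get_function. A minimal dry-run sketch, with the bucket and prefix names invented for illustration:

    from lakehouse_engine.core.file_manager import FileManager

    FileManager(
        {
            "function": "delete_objects",    # dispatched by get_function()
            "bucket": "my-example-bucket",   # illustrative bucket name
            "object_paths": ["raw/sales"],   # now detected and handled as a 'directory'
            "dry_run": True,                 # only logs the paths that would be deleted
        }
    ).get_function()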
@@ -636,218 +729,247 @@

- 80class FileManager(object):
- 81    """Set of actions to manipulate files in several ways."""
- 82
- 83    _logger = LoggingHandler(__name__).get_logger()
- 84
- 85    def __init__(self, configs: dict):
- 86        """Construct FileManager algorithm instances.
- 87
- 88        Args:
- 89            configs: configurations for the FileManager algorithm.
- 90        """
- 91        self.configs = configs
- 92        self.function = self.configs["function"]
- 93
- 94    def get_function(self) -> None:
- 95        """Get a specific function to execute."""
- 96        available_functions = {
- 97            "delete_objects": self.delete_objects,
- 98            "copy_objects": self.copy_objects,
- 99            "request_restore": self.request_restore,
-100            "check_restore_status": self.check_restore_status,
-101            "request_restore_to_destination_and_wait": (
-102                self.request_restore_to_destination_and_wait
-103            ),
-104        }
-105
-106        self._logger.info("Function being executed: {}".format(self.function))
-107        if self.function in available_functions.keys():
-108            func = available_functions[self.function]
-109            func()
-110        else:
-111            raise NotImplementedError(
-112                f"The requested function {self.function} is not implemented."
-113            )
-114
-115    def delete_objects(self) -> None:
-116        """Delete objects and 'directories' in s3.
-117
-118        If dry_run is set to True the function will print a dict with all the
-119        paths that would be deleted based on the given keys.
-120        """
-121        bucket = self.configs["bucket"]
-122        objects_paths = self.configs["object_paths"]
-123        dry_run = self.configs["dry_run"]
-124
-125        s3 = boto3.client("s3")
-126
-127        if dry_run:
-128            response = _dry_run(bucket=bucket, object_paths=objects_paths)
-129
-130            self._logger.info("Paths that would be deleted:")
-131        else:
-132            objects_to_delete = []
-133            for path in objects_paths:
-134                for obj in _list_objects_recursively(bucket=bucket, path=path):
-135                    objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj})
-136
-137            response = s3.delete_objects(
-138                Bucket=bucket,
-139                Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete},
-140            )
-141
-142        self._logger.info(response)
-143
-144    def copy_objects(self) -> None:
-145        """Copies objects and 'directories' in s3."""
-146        source_bucket = self.configs["bucket"]
-147        source_object = self.configs["source_object"]
-148        destination_bucket = self.configs["destination_bucket"]
-149        destination_object = self.configs["destination_object"]
-150        dry_run = self.configs["dry_run"]
-151
-152        FileManager._copy_objects(
-153            source_bucket=source_bucket,
-154            source_object=source_object,
-155            destination_bucket=destination_bucket,
-156            destination_object=destination_object,
-157            dry_run=dry_run,
-158        )
-159
-160    def request_restore(self) -> None:
-161        """Request the restore of archived data."""
-162        source_bucket = self.configs["bucket"]
-163        source_object = self.configs["source_object"]
-164        restore_expiration = self.configs["restore_expiration"]
-165        retrieval_tier = self.configs["retrieval_tier"]
-166        dry_run = self.configs["dry_run"]
-167
-168        ArchiveFileManager.request_restore(
-169            source_bucket,
-170            source_object,
-171            restore_expiration,
-172            retrieval_tier,
-173            dry_run,
-174        )
-175
-176    def check_restore_status(self) -> None:
-177        """Check the restore status of archived data."""
-178        source_bucket = self.configs["bucket"]
-179        source_object = self.configs["source_object"]
-180
-181        restore_status = ArchiveFileManager.check_restore_status(
-182            source_bucket, source_object
-183        )
+            
135class FileManager(object):
+136    """Set of actions to manipulate files in several ways."""
+137
+138    _logger = LoggingHandler(__name__).get_logger()
+139
+140    def __init__(self, configs: dict):
+141        """Construct FileManager algorithm instances.
+142
+143        Args:
+144            configs: configurations for the FileManager algorithm.
+145        """
+146        self.configs = configs
+147        self.function = self.configs["function"]
+148
+149    def get_function(self) -> None:
+150        """Get a specific function to execute."""
+151        available_functions = {
+152            "delete_objects": self.delete_objects,
+153            "copy_objects": self.copy_objects,
+154            "request_restore": self.request_restore,
+155            "check_restore_status": self.check_restore_status,
+156            "request_restore_to_destination_and_wait": (
+157                self.request_restore_to_destination_and_wait
+158            ),
+159        }
+160
+161        self._logger.info("Function being executed: {}".format(self.function))
+162        if self.function in available_functions.keys():
+163            func = available_functions[self.function]
+164            func()
+165        else:
+166            raise NotImplementedError(
+167                f"The requested function {self.function} is not implemented."
+168            )
+169
+170    def _delete_objects(self, bucket: str, objects_paths: list) -> None:
+171        """Delete objects recursively in s3.
+172
+173        Params:
+174            bucket: name of bucket to perform the delete operation.
+175            objects_paths: objects to be deleted.
+176        """
+177        s3 = boto3.client("s3")
+178
+179        for path in objects_paths:
+180            if _check_directory(bucket, path):
+181                path = _process_directory_path(path)
+182            else:
+183                path = path.strip()
 184
-185        self._logger.info(
-186            f"""
-187            Restore status:
-188            - Not Started: {restore_status.get('not_started_objects')}
-189            - Ongoing: {restore_status.get('ongoing_objects')}
-190            - Restored: {restore_status.get('restored_objects')}
-191            Total objects in this restore process: {restore_status.get('total_objects')}
-192            """
-193        )
-194
-195    def request_restore_to_destination_and_wait(self) -> None:
-196        """Request and wait for the restore to complete, polling the restore status.
-197
-198        After the restore is done, copy the restored files to destination
-199        """
-200        source_bucket = self.configs["bucket"]
-201        source_object = self.configs["source_object"]
-202        destination_bucket = self.configs["destination_bucket"]
-203        destination_object = self.configs["destination_object"]
-204        restore_expiration = self.configs["restore_expiration"]
-205        retrieval_tier = self.configs["retrieval_tier"]
-206        dry_run = self.configs["dry_run"]
-207
-208        ArchiveFileManager.request_restore_and_wait(
-209            source_bucket=source_bucket,
-210            source_object=source_object,
-211            restore_expiration=restore_expiration,
-212            retrieval_tier=retrieval_tier,
-213            dry_run=dry_run,
-214        )
+185            more_objects = True
+186            paginator = ""
+187            objects_to_delete = []
+188
+189            while more_objects:
+190                objects_found, paginator = _list_objects(
+191                    s3_client=s3, bucket=bucket, path=path, paginator=paginator
+192                )
+193                for obj in objects_found:
+194                    objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj})
+195
+196                if not paginator:
+197                    more_objects = False
+198
+199                response = s3.delete_objects(
+200                    Bucket=bucket,
+201                    Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete},
+202                )
+203                self._logger.info(response)
+204                objects_to_delete = []
+205
+206    def delete_objects(self) -> None:
+207        """Delete objects and 'directories' in s3.
+208
+209        If dry_run is set to True the function will print a dict with all the
+210        paths that would be deleted based on the given keys.
+211        """
+212        bucket = self.configs["bucket"]
+213        objects_paths = self.configs["object_paths"]
+214        dry_run = self.configs["dry_run"]
 215
-216        FileManager._logger.info(
-217            f"Restoration complete for {source_bucket} and {source_object}"
-218        )
-219        FileManager._logger.info(
-220            f"Starting to copy data from {source_bucket}/{source_object} to "
-221            f"{destination_bucket}/{destination_object}"
-222        )
-223        FileManager._copy_objects(
-224            source_bucket=source_bucket,
-225            source_object=source_object,
-226            destination_bucket=destination_bucket,
-227            destination_object=destination_object,
-228            dry_run=dry_run,
-229        )
-230        FileManager._logger.info(
-231            f"Finished copying data, data should be available on {destination_bucket}/"
-232            f"{destination_object}"
-233        )
-234
-235    @staticmethod
-236    def _copy_objects(
-237        source_bucket: str,
-238        source_object: str,
-239        destination_bucket: str,
-240        destination_object: str,
-241        dry_run: bool,
-242    ) -> None:
-243        """Copies objects and 'directories' in s3.
-244
-245        Args:
-246            source_bucket: name of bucket to perform the copy.
-247            source_object: object/folder to be copied.
-248            destination_bucket: name of the target bucket to copy.
-249            destination_object: target object/folder to copy.
-250            dry_run: if dry_run is set to True the function will print a dict with
-251                all the paths that would be deleted based on the given keys.
-252        """
-253        s3 = boto3.client("s3")
-254
-255        if dry_run:
-256            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
-257
-258            FileManager._logger.info("Paths that would be copied:")
-259            FileManager._logger.info(response)
-260        else:
-261            copy_object = _list_objects_recursively(
-262                bucket=source_bucket, path=source_object
-263            )
+216        if dry_run:
+217            response = _dry_run(bucket=bucket, object_paths=objects_paths)
+218
+219            self._logger.info("Paths that would be deleted:")
+220            self._logger.info(response)
+221        else:
+222            self._delete_objects(bucket, objects_paths)
+223
+224    def copy_objects(self) -> None:
+225        """Copies objects and 'directories' in s3."""
+226        source_bucket = self.configs["bucket"]
+227        source_object = self.configs["source_object"]
+228        destination_bucket = self.configs["destination_bucket"]
+229        destination_object = self.configs["destination_object"]
+230        dry_run = self.configs["dry_run"]
+231
+232        FileManager._copy_objects(
+233            source_bucket=source_bucket,
+234            source_object=source_object,
+235            destination_bucket=destination_bucket,
+236            destination_object=destination_object,
+237            dry_run=dry_run,
+238        )
+239
+240    def request_restore(self) -> None:
+241        """Request the restore of archived data."""
+242        source_bucket = self.configs["bucket"]
+243        source_object = self.configs["source_object"]
+244        restore_expiration = self.configs["restore_expiration"]
+245        retrieval_tier = self.configs["retrieval_tier"]
+246        dry_run = self.configs["dry_run"]
+247
+248        ArchiveFileManager.request_restore(
+249            source_bucket,
+250            source_object,
+251            restore_expiration,
+252            retrieval_tier,
+253            dry_run,
+254        )
+255
+256    def check_restore_status(self) -> None:
+257        """Check the restore status of archived data."""
+258        source_bucket = self.configs["bucket"]
+259        source_object = self.configs["source_object"]
+260
+261        restore_status = ArchiveFileManager.check_restore_status(
+262            source_bucket, source_object
+263        )
 264
-265            if len(copy_object) == 1:
-266                FileManager._logger.info(f"Copying obj: {source_object}")
-267
-268                response = s3.copy_object(
-269                    Bucket=destination_bucket,
-270                    CopySource={
-271                        FileManagerAPIKeys.BUCKET.value: source_bucket,
-272                        FileManagerAPIKeys.KEY.value: source_object,
-273                    },
-274                    Key=f"""{destination_object}/{copy_object[0].split("/")[-1]}""",
-275                )
-276                FileManager._logger.info(response)
-277            else:
-278                for obj in copy_object:
-279                    FileManager._logger.info(f"Copying obj: {obj}")
-280
-281                    final_path = obj.replace(source_object, "")
-282
-283                    response = s3.copy_object(
-284                        Bucket=destination_bucket,
-285                        CopySource={
-286                            FileManagerAPIKeys.BUCKET.value: source_bucket,
-287                            FileManagerAPIKeys.KEY.value: obj,
-288                        },
-289                        Key=f"{destination_object}{final_path}",
-290                    )
-291                    FileManager._logger.info(response)
+265        self._logger.info(
+266            f"""
+267            Restore status:
+268            - Not Started: {restore_status.get('not_started_objects')}
+269            - Ongoing: {restore_status.get('ongoing_objects')}
+270            - Restored: {restore_status.get('restored_objects')}
+271            Total objects in this restore process: {restore_status.get('total_objects')}
+272            """
+273        )
+274
+275    def request_restore_to_destination_and_wait(self) -> None:
+276        """Request and wait for the restore to complete, polling the restore status.
+277
+278        After the restore is done, copy the restored files to destination
+279        """
+280        source_bucket = self.configs["bucket"]
+281        source_object = self.configs["source_object"]
+282        destination_bucket = self.configs["destination_bucket"]
+283        destination_object = self.configs["destination_object"]
+284        restore_expiration = self.configs["restore_expiration"]
+285        retrieval_tier = self.configs["retrieval_tier"]
+286        dry_run = self.configs["dry_run"]
+287
+288        ArchiveFileManager.request_restore_and_wait(
+289            source_bucket=source_bucket,
+290            source_object=source_object,
+291            restore_expiration=restore_expiration,
+292            retrieval_tier=retrieval_tier,
+293            dry_run=dry_run,
+294        )
+295
+296        FileManager._logger.info(
+297            f"Restoration complete for {source_bucket} and {source_object}"
+298        )
+299        FileManager._logger.info(
+300            f"Starting to copy data from {source_bucket}/{source_object} to "
+301            f"{destination_bucket}/{destination_object}"
+302        )
+303        FileManager._copy_objects(
+304            source_bucket=source_bucket,
+305            source_object=source_object,
+306            destination_bucket=destination_bucket,
+307            destination_object=destination_object,
+308            dry_run=dry_run,
+309        )
+310        FileManager._logger.info(
+311            f"Finished copying data, data should be available on {destination_bucket}/"
+312            f"{destination_object}"
+313        )
+314
+315    @staticmethod
+316    def _copy_objects(
+317        source_bucket: str,
+318        source_object: str,
+319        destination_bucket: str,
+320        destination_object: str,
+321        dry_run: bool,
+322    ) -> None:
+323        """Copies objects and 'directories' in s3.
+324
+325        Args:
+326            source_bucket: name of bucket to perform the copy.
+327            source_object: object/folder to be copied.
+328            destination_bucket: name of the target bucket to copy.
+329            destination_object: target object/folder to copy.
+330            dry_run: if dry_run is set to True the function will print a dict with
+331                all the paths that would be deleted based on the given keys.
+332        """
+333        s3 = boto3.client("s3")
+334
+335        if dry_run:
+336            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
+337
+338            FileManager._logger.info("Paths that would be copied:")
+339            FileManager._logger.info(response)
+340        else:
+341            original_object_name = source_object.split("/")[-1]
+342
+343            if _check_directory(source_bucket, source_object):
+344                source_object = _process_directory_path(source_object)
+345
+346                copy_object = _list_objects_recursively(
+347                    bucket=source_bucket, path=source_object
+348                )
+349
+350                for obj in copy_object:
+351                    FileManager._logger.info(f"Copying obj: {obj}")
+352
+353                    final_path = obj.replace(source_object, "")
+354
+355                    response = s3.copy_object(
+356                        Bucket=destination_bucket,
+357                        CopySource={
+358                            FileManagerAPIKeys.BUCKET.value: source_bucket,
+359                            FileManagerAPIKeys.KEY.value: obj,
+360                        },
+361                        Key=f"{destination_object}/{original_object_name}/{final_path}",
+362                    )
+363                    FileManager._logger.info(response)
+364            else:
+365                FileManager._logger.info(f"Copying obj: {source_object}")
+366
+367                response = s3.copy_object(
+368                    Bucket=destination_bucket,
+369                    CopySource={
+370                        FileManagerAPIKeys.BUCKET.value: source_bucket,
+371                        FileManagerAPIKeys.KEY.value: source_object,
+372                    },
+373                    Key=f"""{destination_object}/{original_object_name}""",
+374                )
+375                FileManager._logger.info(response)
 
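The reworked _delete_objects deletes keys in batches while it pages through the listing, instead of accumulating every key up front. A rough stand-alone equivalent of that pattern using plain boto3 pagination (bucket and prefix are placeholders; this is a sketch of the idea, not the engine's private helpers):

    import boto3

    def delete_prefix_in_batches(bucket: str, prefix: str) -> None:
        """Delete every object under a prefix, one listing page (<= 1000 keys) at a time."""
        s3 = boto3.client("s3")
        paginator = s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=bucket, Prefix=prefix):
            batch = [{"Key": obj["Key"]} for obj in page.get("Contents", [])]
            if batch:
                # Issue the delete per page instead of building one huge request.
                response = s3.delete_objects(Bucket=bucket, Delete={"Objects": batch})
                print(response)

    delete_prefix_in_batches("my-bucket", "tmp/scratch/")  # placeholder bucket/prefix
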
@@ -865,14 +987,14 @@

-
85    def __init__(self, configs: dict):
-86        """Construct FileManager algorithm instances.
-87
-88        Args:
-89            configs: configurations for the FileManager algorithm.
-90        """
-91        self.configs = configs
-92        self.function = self.configs["function"]
+            
140    def __init__(self, configs: dict):
+141        """Construct FileManager algorithm instances.
+142
+143        Args:
+144            configs: configurations for the FileManager algorithm.
+145        """
+146        self.configs = configs
+147        self.function = self.configs["function"]
 
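FileManager is driven entirely by its configs dict; the only key read at construction time is "function", and the remaining keys are read later by whichever action runs. A minimal construction might look like this (import path and values are assumptions):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "delete_objects",   # the key read at construction time
        "bucket": "my-bucket",          # placeholder values read later by the action itself
        "object_paths": ["tmp/scratch"],
        "dry_run": True,
    }
    file_manager = FileManager(configs)
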
@@ -895,26 +1017,26 @@

-
 94    def get_function(self) -> None:
- 95        """Get a specific function to execute."""
- 96        available_functions = {
- 97            "delete_objects": self.delete_objects,
- 98            "copy_objects": self.copy_objects,
- 99            "request_restore": self.request_restore,
-100            "check_restore_status": self.check_restore_status,
-101            "request_restore_to_destination_and_wait": (
-102                self.request_restore_to_destination_and_wait
-103            ),
-104        }
-105
-106        self._logger.info("Function being executed: {}".format(self.function))
-107        if self.function in available_functions.keys():
-108            func = available_functions[self.function]
-109            func()
-110        else:
-111            raise NotImplementedError(
-112                f"The requested function {self.function} is not implemented."
-113            )
+            
149    def get_function(self) -> None:
+150        """Get a specific function to execute."""
+151        available_functions = {
+152            "delete_objects": self.delete_objects,
+153            "copy_objects": self.copy_objects,
+154            "request_restore": self.request_restore,
+155            "check_restore_status": self.check_restore_status,
+156            "request_restore_to_destination_and_wait": (
+157                self.request_restore_to_destination_and_wait
+158            ),
+159        }
+160
+161        self._logger.info("Function being executed: {}".format(self.function))
+162        if self.function in available_functions.keys():
+163            func = available_functions[self.function]
+164            func()
+165        else:
+166            raise NotImplementedError(
+167                f"The requested function {self.function} is not implemented."
+168            )
 
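get_function is a plain dispatch table: the configured name is looked up in available_functions and invoked with no arguments, and unknown names raise NotImplementedError. For example (hypothetical configs, import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    # Dispatches to delete_objects() (placeholder values).
    FileManager(
        {
            "function": "delete_objects",
            "bucket": "my-bucket",
            "object_paths": ["tmp/scratch"],
            "dry_run": True,
        }
    ).get_function()

    # An unknown name is rejected:
    FileManager({"function": "rename_objects"}).get_function()
    # NotImplementedError: The requested function rename_objects is not implemented.
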
@@ -934,34 +1056,23 @@

-
115    def delete_objects(self) -> None:
-116        """Delete objects and 'directories' in s3.
-117
-118        If dry_run is set to True the function will print a dict with all the
-119        paths that would be deleted based on the given keys.
-120        """
-121        bucket = self.configs["bucket"]
-122        objects_paths = self.configs["object_paths"]
-123        dry_run = self.configs["dry_run"]
-124
-125        s3 = boto3.client("s3")
-126
-127        if dry_run:
-128            response = _dry_run(bucket=bucket, object_paths=objects_paths)
-129
-130            self._logger.info("Paths that would be deleted:")
-131        else:
-132            objects_to_delete = []
-133            for path in objects_paths:
-134                for obj in _list_objects_recursively(bucket=bucket, path=path):
-135                    objects_to_delete.append({FileManagerAPIKeys.KEY.value: obj})
-136
-137            response = s3.delete_objects(
-138                Bucket=bucket,
-139                Delete={FileManagerAPIKeys.OBJECTS.value: objects_to_delete},
-140            )
-141
-142        self._logger.info(response)
+            
206    def delete_objects(self) -> None:
+207        """Delete objects and 'directories' in s3.
+208
+209        If dry_run is set to True the function will print a dict with all the
+210        paths that would be deleted based on the given keys.
+211        """
+212        bucket = self.configs["bucket"]
+213        objects_paths = self.configs["object_paths"]
+214        dry_run = self.configs["dry_run"]
+215
+216        if dry_run:
+217            response = _dry_run(bucket=bucket, object_paths=objects_paths)
+218
+219            self._logger.info("Paths that would be deleted:")
+220            self._logger.info(response)
+221        else:
+222            self._delete_objects(bucket, objects_paths)
 
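Since deletes are irreversible, a reasonable workflow is to run the same configuration twice: first with dry_run set to True to log the keys that would be removed, then with dry_run set to False so the batched delete in _delete_objects actually runs. A sketch with placeholder values (import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "delete_objects",
        "bucket": "my-bucket",                                     # placeholder values
        "object_paths": ["landing/2021", "landing/2022/file.csv"],
        "dry_run": True,   # only logs "Paths that would be deleted:"
    }
    FileManager(configs).get_function()

    configs["dry_run"] = False   # now the batched delete actually runs
    FileManager(configs).get_function()
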
@@ -984,21 +1095,21 @@

-
144    def copy_objects(self) -> None:
-145        """Copies objects and 'directories' in s3."""
-146        source_bucket = self.configs["bucket"]
-147        source_object = self.configs["source_object"]
-148        destination_bucket = self.configs["destination_bucket"]
-149        destination_object = self.configs["destination_object"]
-150        dry_run = self.configs["dry_run"]
-151
-152        FileManager._copy_objects(
-153            source_bucket=source_bucket,
-154            source_object=source_object,
-155            destination_bucket=destination_bucket,
-156            destination_object=destination_object,
-157            dry_run=dry_run,
-158        )
+            
224    def copy_objects(self) -> None:
+225        """Copies objects and 'directories' in s3."""
+226        source_bucket = self.configs["bucket"]
+227        source_object = self.configs["source_object"]
+228        destination_bucket = self.configs["destination_bucket"]
+229        destination_object = self.configs["destination_object"]
+230        dry_run = self.configs["dry_run"]
+231
+232        FileManager._copy_objects(
+233            source_bucket=source_bucket,
+234            source_object=source_object,
+235            destination_bucket=destination_bucket,
+236            destination_object=destination_object,
+237            dry_run=dry_run,
+238        )
 
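copy_objects reads five keys from the configs and delegates to _copy_objects; when the source is a 'directory', its contents are copied recursively and, following the key layout shown above, should land under destination_object/<source folder name>/. A hedged example with placeholder buckets (import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "copy_objects",
        "bucket": "source-bucket",                # placeholder values
        "source_object": "bronze/sales",          # a 'directory' is copied recursively
        "destination_bucket": "backup-bucket",
        "destination_object": "backups/2023",     # keys land under backups/2023/sales/...
        "dry_run": False,
    }
    FileManager(configs).get_function()
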
@@ -1018,21 +1129,21 @@

-
160    def request_restore(self) -> None:
-161        """Request the restore of archived data."""
-162        source_bucket = self.configs["bucket"]
-163        source_object = self.configs["source_object"]
-164        restore_expiration = self.configs["restore_expiration"]
-165        retrieval_tier = self.configs["retrieval_tier"]
-166        dry_run = self.configs["dry_run"]
-167
-168        ArchiveFileManager.request_restore(
-169            source_bucket,
-170            source_object,
-171            restore_expiration,
-172            retrieval_tier,
-173            dry_run,
-174        )
+            
240    def request_restore(self) -> None:
+241        """Request the restore of archived data."""
+242        source_bucket = self.configs["bucket"]
+243        source_object = self.configs["source_object"]
+244        restore_expiration = self.configs["restore_expiration"]
+245        retrieval_tier = self.configs["retrieval_tier"]
+246        dry_run = self.configs["dry_run"]
+247
+248        ArchiveFileManager.request_restore(
+249            source_bucket,
+250            source_object,
+251            restore_expiration,
+252            retrieval_tier,
+253            dry_run,
+254        )
 
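request_restore simply forwards the configured values to ArchiveFileManager.request_restore; restore_expiration is expressed in days and retrieval_tier accepts Bulk, Standard or Expedited. For instance (placeholder values, import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "request_restore",
        "bucket": "archive-bucket",    # placeholder values
        "source_object": "cold/2019",
        "restore_expiration": 5,       # days the restored copies remain readable
        "retrieval_tier": "Bulk",      # Bulk, Standard or Expedited
        "dry_run": False,
    }
    FileManager(configs).get_function()
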
@@ -1052,24 +1163,24 @@

-
176    def check_restore_status(self) -> None:
-177        """Check the restore status of archived data."""
-178        source_bucket = self.configs["bucket"]
-179        source_object = self.configs["source_object"]
-180
-181        restore_status = ArchiveFileManager.check_restore_status(
-182            source_bucket, source_object
-183        )
-184
-185        self._logger.info(
-186            f"""
-187            Restore status:
-188            - Not Started: {restore_status.get('not_started_objects')}
-189            - Ongoing: {restore_status.get('ongoing_objects')}
-190            - Restored: {restore_status.get('restored_objects')}
-191            Total objects in this restore process: {restore_status.get('total_objects')}
-192            """
-193        )
+            
256    def check_restore_status(self) -> None:
+257        """Check the restore status of archived data."""
+258        source_bucket = self.configs["bucket"]
+259        source_object = self.configs["source_object"]
+260
+261        restore_status = ArchiveFileManager.check_restore_status(
+262            source_bucket, source_object
+263        )
+264
+265        self._logger.info(
+266            f"""
+267            Restore status:
+268            - Not Started: {restore_status.get('not_started_objects')}
+269            - Ongoing: {restore_status.get('ongoing_objects')}
+270            - Restored: {restore_status.get('restored_objects')}
+271            Total objects in this restore process: {restore_status.get('total_objects')}
+272            """
+273        )
 
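check_restore_status logs a summary built from the dict returned by ArchiveFileManager.check_restore_status. A possible invocation and the kind of output it produces (values are illustrative, import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "check_restore_status",
        "bucket": "archive-bucket",   # placeholder values
        "source_object": "cold/2019",
    }
    FileManager(configs).get_function()
    # Logs a block similar to:
    #   Restore status:
    #   - Not Started: 0
    #   - Ongoing: 3
    #   - Restored: 7
    #   Total objects in this restore process: 10
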
@@ -1089,45 +1200,45 @@

-
195    def request_restore_to_destination_and_wait(self) -> None:
-196        """Request and wait for the restore to complete, polling the restore status.
-197
-198        After the restore is done, copy the restored files to destination
-199        """
-200        source_bucket = self.configs["bucket"]
-201        source_object = self.configs["source_object"]
-202        destination_bucket = self.configs["destination_bucket"]
-203        destination_object = self.configs["destination_object"]
-204        restore_expiration = self.configs["restore_expiration"]
-205        retrieval_tier = self.configs["retrieval_tier"]
-206        dry_run = self.configs["dry_run"]
-207
-208        ArchiveFileManager.request_restore_and_wait(
-209            source_bucket=source_bucket,
-210            source_object=source_object,
-211            restore_expiration=restore_expiration,
-212            retrieval_tier=retrieval_tier,
-213            dry_run=dry_run,
-214        )
-215
-216        FileManager._logger.info(
-217            f"Restoration complete for {source_bucket} and {source_object}"
-218        )
-219        FileManager._logger.info(
-220            f"Starting to copy data from {source_bucket}/{source_object} to "
-221            f"{destination_bucket}/{destination_object}"
-222        )
-223        FileManager._copy_objects(
-224            source_bucket=source_bucket,
-225            source_object=source_object,
-226            destination_bucket=destination_bucket,
-227            destination_object=destination_object,
-228            dry_run=dry_run,
-229        )
-230        FileManager._logger.info(
-231            f"Finished copying data, data should be available on {destination_bucket}/"
-232            f"{destination_object}"
-233        )
+            
275    def request_restore_to_destination_and_wait(self) -> None:
+276        """Request and wait for the restore to complete, polling the restore status.
+277
+278        After the restore is done, copy the restored files to destination
+279        """
+280        source_bucket = self.configs["bucket"]
+281        source_object = self.configs["source_object"]
+282        destination_bucket = self.configs["destination_bucket"]
+283        destination_object = self.configs["destination_object"]
+284        restore_expiration = self.configs["restore_expiration"]
+285        retrieval_tier = self.configs["retrieval_tier"]
+286        dry_run = self.configs["dry_run"]
+287
+288        ArchiveFileManager.request_restore_and_wait(
+289            source_bucket=source_bucket,
+290            source_object=source_object,
+291            restore_expiration=restore_expiration,
+292            retrieval_tier=retrieval_tier,
+293            dry_run=dry_run,
+294        )
+295
+296        FileManager._logger.info(
+297            f"Restoration complete for {source_bucket} and {source_object}"
+298        )
+299        FileManager._logger.info(
+300            f"Starting to copy data from {source_bucket}/{source_object} to "
+301            f"{destination_bucket}/{destination_object}"
+302        )
+303        FileManager._copy_objects(
+304            source_bucket=source_bucket,
+305            source_object=source_object,
+306            destination_bucket=destination_bucket,
+307            destination_object=destination_object,
+308            dry_run=dry_run,
+309        )
+310        FileManager._logger.info(
+311            f"Finished copying data, data should be available on {destination_bucket}/"
+312            f"{destination_object}"
+313        )
 
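request_restore_to_destination_and_wait chains a restore request, the polling loop and a final copy; because of the tier guard it only accepts the Expedited tier. A sketch with placeholder values (import path assumed):

    from lakehouse_engine.core.file_manager import FileManager  # assumed module path

    configs = {
        "function": "request_restore_to_destination_and_wait",
        "bucket": "archive-bucket",             # placeholder values
        "source_object": "cold/2019",
        "destination_bucket": "working-bucket",
        "destination_object": "restored/2019",
        "restore_expiration": 2,
        "retrieval_tier": "Expedited",          # the only tier this flow accepts
        "dry_run": False,
    }
    FileManager(configs).get_function()
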
@@ -1150,240 +1261,249 @@

-
294class ArchiveFileManager(object):
-295    """Set of actions to restore archives."""
-296
-297    _logger = LoggingHandler(__name__).get_logger()
-298
-299    @staticmethod
-300    def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]:
-301        """Get the archived object if it's an object.
-302
-303        Args:
-304            bucket: name of bucket to check get the object.
-305            object_key: object to get.
-306
-307        Returns:
-308            S3 Object if it's an archived object, otherwise None.
-309        """
-310        s3 = boto3.resource("s3")
-311        object_to_restore = s3.Object(bucket, object_key)
-312
-313        if (
-314            object_to_restore.storage_class is not None
-315            and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS
-316        ):
-317            return object_to_restore
-318        else:
-319            return None
-320
-321    @staticmethod
-322    def _check_object_restore_status(
-323        bucket: str, object_key: str
-324    ) -> Optional[RestoreStatus]:
-325        """Check the restore status of the archive.
-326
-327        Args:
-328            bucket: name of bucket to check the restore status.
-329            object_key: object to check the restore status.
-330
-331        Returns:
-332            The restore status represented by an enum, possible values are:
-333                NOT_STARTED, ONGOING or RESTORED
-334        """
-335        archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
-336
-337        if archived_object is None:
-338            status = None
-339        elif archived_object.restore is None:
-340            status = RestoreStatus.NOT_STARTED
-341        elif 'ongoing-request="true"' in archived_object.restore:
-342            status = RestoreStatus.ONGOING
-343        else:
-344            status = RestoreStatus.RESTORED
-345
-346        return status
-347
-348    @staticmethod
-349    def check_restore_status(source_bucket: str, source_object: str) -> dict:
-350        """Check the restore status of archived data.
-351
-352        Args:
-353            source_bucket: name of bucket to check the restore status.
-354            source_object: object to check the restore status.
-355
-356        Returns:
-357            A dict containing the amount of objects in each status.
-358        """
-359        not_started_objects = 0
-360        ongoing_objects = 0
-361        restored_objects = 0
-362        total_objects = 0
-363
-364        objects_to_restore = _list_objects_recursively(
-365            bucket=source_bucket, path=source_object
-366        )
-367
-368        for obj in objects_to_restore:
-369            ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")
-370
-371            restore_status = ArchiveFileManager._check_object_restore_status(
-372                source_bucket, obj
-373            )
-374            if not restore_status:
-375                ArchiveFileManager._logger.warning(
-376                    f"Restore status not found for {source_bucket}/{obj}"
-377                )
-378            else:
-379                total_objects += 1
+            
378class ArchiveFileManager(object):
+379    """Set of actions to restore archives."""
 380
-381                if RestoreStatus.NOT_STARTED == restore_status:
-382                    not_started_objects += 1
-383                elif RestoreStatus.ONGOING == restore_status:
-384                    ongoing_objects += 1
-385                else:
-386                    restored_objects += 1
-387
-388                ArchiveFileManager._logger.info(
-389                    f"{obj} restore status is {restore_status.value}"
-390                )
-391
-392        return {
-393            "total_objects": total_objects,
-394            "not_started_objects": not_started_objects,
-395            "ongoing_objects": ongoing_objects,
-396            "restored_objects": restored_objects,
-397        }
-398
-399    @staticmethod
-400    def _request_restore_object(
-401        bucket: str, object_key: str, expiration: int, retrieval_tier: str
-402    ) -> None:
-403        """Request a restore of the archive.
+381    _logger = LoggingHandler(__name__).get_logger()
+382
+383    @staticmethod
+384    def _get_archived_object(bucket: str, object_key: str) -> Optional[Any]:
+385        """Get the archived object if it's an object.
+386
+387        Args:
+388            bucket: name of bucket to check get the object.
+389            object_key: object to get.
+390
+391        Returns:
+392            S3 Object if it's an archived object, otherwise None.
+393        """
+394        s3 = boto3.resource("s3")
+395        object_to_restore = s3.Object(bucket, object_key)
+396
+397        if (
+398            object_to_restore.storage_class is not None
+399            and object_to_restore.storage_class in ARCHIVE_STORAGE_CLASS
+400        ):
+401            return object_to_restore
+402        else:
+403            return None
 404
-405        Args:
-406            bucket: name of bucket to perform the restore.
-407            object_key: object to be restored.
-408            expiration: restore expiration.
-409            retrieval_tier: type of restore, possible values are:
-410                Bulk, Standard or Expedited.
-411        """
-412        if not RestoreType.exists(retrieval_tier):
-413            raise RestoreTypeNotFoundException(
-414                f"Restore type {retrieval_tier} not supported."
-415            )
-416
-417        archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
-418
-419        if archived_object and archived_object.restore is None:
-420            ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.")
-421            archived_object.restore_object(
-422                RestoreRequest={
-423                    "Days": expiration,
-424                    "GlacierJobParameters": {"Tier": retrieval_tier},
-425                }
-426            )
+405    @staticmethod
+406    def _check_object_restore_status(
+407        bucket: str, object_key: str
+408    ) -> Optional[RestoreStatus]:
+409        """Check the restore status of the archive.
+410
+411        Args:
+412            bucket: name of bucket to check the restore status.
+413            object_key: object to check the restore status.
+414
+415        Returns:
+416            The restore status represented by an enum, possible values are:
+417                NOT_STARTED, ONGOING or RESTORED
+418        """
+419        archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
+420
+421        if archived_object is None:
+422            status = None
+423        elif archived_object.restore is None:
+424            status = RestoreStatus.NOT_STARTED
+425        elif 'ongoing-request="true"' in archived_object.restore:
+426            status = RestoreStatus.ONGOING
 427        else:
-428            ArchiveFileManager._logger.info(
-429                f"Restore request for {bucket}/{object_key} not performed."
-430            )
+428            status = RestoreStatus.RESTORED
+429
+430        return status
 431
 432    @staticmethod
-433    def request_restore(
-434        source_bucket: str,
-435        source_object: str,
-436        restore_expiration: int,
-437        retrieval_tier: str,
-438        dry_run: bool,
-439    ) -> None:
-440        """Request the restore of archived data.
-441
-442        Args:
-443            source_bucket: name of bucket to perform the restore.
-444            source_object: object to be restored.
-445            restore_expiration: restore expiration in days.
-446            retrieval_tier: type of restore, possible values are:
-447                Bulk, Standard or Expedited.
-448            dry_run: if dry_run is set to True the function will print a dict with
-449                all the paths that would be deleted based on the given keys.
-450        """
-451        if dry_run:
-452            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
-453
-454            ArchiveFileManager._logger.info("Paths that would be restored:")
-455            ArchiveFileManager._logger.info(response)
-456        else:
-457            objects_to_restore = _list_objects_recursively(
-458                bucket=source_bucket, path=source_object
-459            )
-460
-461            for obj in objects_to_restore:
-462                ArchiveFileManager._request_restore_object(
-463                    source_bucket,
-464                    obj,
-465                    restore_expiration,
-466                    retrieval_tier,
-467                )
-468
-469    @staticmethod
-470    def request_restore_and_wait(
-471        source_bucket: str,
-472        source_object: str,
-473        restore_expiration: int,
-474        retrieval_tier: str,
-475        dry_run: bool,
-476    ) -> None:
-477        """Request and wait for the restore to complete, polling the restore status.
+433    def check_restore_status(source_bucket: str, source_object: str) -> dict:
+434        """Check the restore status of archived data.
+435
+436        Args:
+437            source_bucket: name of bucket to check the restore status.
+438            source_object: object to check the restore status.
+439
+440        Returns:
+441            A dict containing the amount of objects in each status.
+442        """
+443        not_started_objects = 0
+444        ongoing_objects = 0
+445        restored_objects = 0
+446        total_objects = 0
+447
+448        if _check_directory(source_bucket, source_object):
+449            source_object = _process_directory_path(source_object)
+450
+451        objects_to_restore = _list_objects_recursively(
+452            bucket=source_bucket, path=source_object
+453        )
+454
+455        for obj in objects_to_restore:
+456            ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")
+457
+458            restore_status = ArchiveFileManager._check_object_restore_status(
+459                source_bucket, obj
+460            )
+461            if not restore_status:
+462                ArchiveFileManager._logger.warning(
+463                    f"Restore status not found for {source_bucket}/{obj}"
+464                )
+465            else:
+466                total_objects += 1
+467
+468                if RestoreStatus.NOT_STARTED == restore_status:
+469                    not_started_objects += 1
+470                elif RestoreStatus.ONGOING == restore_status:
+471                    ongoing_objects += 1
+472                else:
+473                    restored_objects += 1
+474
+475                ArchiveFileManager._logger.info(
+476                    f"{obj} restore status is {restore_status.value}"
+477                )
 478
-479        Args:
-480            source_bucket: name of bucket to perform the restore.
-481            source_object: object to be restored.
-482            restore_expiration: restore expiration in days.
-483            retrieval_tier: type of restore, possible values are:
-484                Bulk, Standard or Expedited.
-485            dry_run: if dry_run is set to True the function will print a dict with
-486                all the paths that would be deleted based on the given keys.
-487        """
-488        if retrieval_tier != RestoreType.EXPEDITED.value:
-489            ArchiveFileManager._logger.error(
-490                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
-491                "kind of restore should be used just with `Expedited` retrieval tier "
-492                "to save cluster costs."
-493            )
-494            raise ValueError(
-495                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
-496                "kind of restore should be used just with `Expedited` retrieval tier "
-497                "to save cluster costs."
-498            )
-499
-500        ArchiveFileManager.request_restore(
-501            source_bucket=source_bucket,
-502            source_object=source_object,
-503            restore_expiration=restore_expiration,
-504            retrieval_tier=retrieval_tier,
-505            dry_run=dry_run,
-506        )
-507        restore_status = ArchiveFileManager.check_restore_status(
-508            source_bucket, source_object
-509        )
-510        ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
-511
-512        if not dry_run:
-513            ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.")
-514            wait_time = 300
-515            while restore_status.get("total_objects") > restore_status.get(
-516                "restored_objects"
-517            ):
-518                ArchiveFileManager._logger.info(
-519                    "Not all objects have been restored yet, checking the status again "
-520                    f"in {wait_time} seconds."
-521                )
-522                time.sleep(wait_time)
-523                wait_time = 30
-524                restore_status = ArchiveFileManager.check_restore_status(
-525                    source_bucket, source_object
-526                )
-527                ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
+479        return {
+480            "total_objects": total_objects,
+481            "not_started_objects": not_started_objects,
+482            "ongoing_objects": ongoing_objects,
+483            "restored_objects": restored_objects,
+484        }
+485
+486    @staticmethod
+487    def _request_restore_object(
+488        bucket: str, object_key: str, expiration: int, retrieval_tier: str
+489    ) -> None:
+490        """Request a restore of the archive.
+491
+492        Args:
+493            bucket: name of bucket to perform the restore.
+494            object_key: object to be restored.
+495            expiration: restore expiration.
+496            retrieval_tier: type of restore, possible values are:
+497                Bulk, Standard or Expedited.
+498        """
+499        if not RestoreType.exists(retrieval_tier):
+500            raise RestoreTypeNotFoundException(
+501                f"Restore type {retrieval_tier} not supported."
+502            )
+503
+504        if _check_directory(bucket, object_key):
+505            object_key = _process_directory_path(object_key)
+506
+507        archived_object = ArchiveFileManager._get_archived_object(bucket, object_key)
+508
+509        if archived_object and archived_object.restore is None:
+510            ArchiveFileManager._logger.info(f"Restoring archive {bucket}/{object_key}.")
+511            archived_object.restore_object(
+512                RestoreRequest={
+513                    "Days": expiration,
+514                    "GlacierJobParameters": {"Tier": retrieval_tier},
+515                }
+516            )
+517        else:
+518            ArchiveFileManager._logger.info(
+519                f"Restore request for {bucket}/{object_key} not performed."
+520            )
+521
+522    @staticmethod
+523    def request_restore(
+524        source_bucket: str,
+525        source_object: str,
+526        restore_expiration: int,
+527        retrieval_tier: str,
+528        dry_run: bool,
+529    ) -> None:
+530        """Request the restore of archived data.
+531
+532        Args:
+533            source_bucket: name of bucket to perform the restore.
+534            source_object: object to be restored.
+535            restore_expiration: restore expiration in days.
+536            retrieval_tier: type of restore, possible values are:
+537                Bulk, Standard or Expedited.
+538            dry_run: if dry_run is set to True the function will print a dict with
+539                all the paths that would be deleted based on the given keys.
+540        """
+541        if _check_directory(source_bucket, source_object):
+542            source_object = _process_directory_path(source_object)
+543
+544        if dry_run:
+545            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
+546
+547            ArchiveFileManager._logger.info("Paths that would be restored:")
+548            ArchiveFileManager._logger.info(response)
+549        else:
+550            objects_to_restore = _list_objects_recursively(
+551                bucket=source_bucket, path=source_object
+552            )
+553
+554            for obj in objects_to_restore:
+555                ArchiveFileManager._request_restore_object(
+556                    source_bucket,
+557                    obj,
+558                    restore_expiration,
+559                    retrieval_tier,
+560                )
+561
+562    @staticmethod
+563    def request_restore_and_wait(
+564        source_bucket: str,
+565        source_object: str,
+566        restore_expiration: int,
+567        retrieval_tier: str,
+568        dry_run: bool,
+569    ) -> None:
+570        """Request and wait for the restore to complete, polling the restore status.
+571
+572        Args:
+573            source_bucket: name of bucket to perform the restore.
+574            source_object: object to be restored.
+575            restore_expiration: restore expiration in days.
+576            retrieval_tier: type of restore, possible values are:
+577                Bulk, Standard or Expedited.
+578            dry_run: if dry_run is set to True the function will print a dict with
+579                all the paths that would be deleted based on the given keys.
+580        """
+581        if retrieval_tier != RestoreType.EXPEDITED.value:
+582            ArchiveFileManager._logger.error(
+583                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
+584                "kind of restore should be used just with `Expedited` retrieval tier "
+585                "to save cluster costs."
+586            )
+587            raise ValueError(
+588                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
+589                "kind of restore should be used just with `Expedited` retrieval tier "
+590                "to save cluster costs."
+591            )
+592
+593        ArchiveFileManager.request_restore(
+594            source_bucket=source_bucket,
+595            source_object=source_object,
+596            restore_expiration=restore_expiration,
+597            retrieval_tier=retrieval_tier,
+598            dry_run=dry_run,
+599        )
+600        restore_status = ArchiveFileManager.check_restore_status(
+601            source_bucket, source_object
+602        )
+603        ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
+604
+605        if not dry_run:
+606            ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.")
+607            wait_time = 300
+608            while restore_status.get("total_objects") > restore_status.get(
+609                "restored_objects"
+610            ):
+611                ArchiveFileManager._logger.info(
+612                    "Not all objects have been restored yet, checking the status again "
+613                    f"in {wait_time} seconds."
+614                )
+615                time.sleep(wait_time)
+616                wait_time = 30
+617                restore_status = ArchiveFileManager.check_restore_status(
+618                    source_bucket, source_object
+619                )
+620                ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
 
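Both _get_archived_object and _check_object_restore_status lean on two attributes of the boto3 S3 Object resource: storage_class and restore. Inspecting them directly shows the raw values the status enum is derived from (bucket and key are placeholders):

    import boto3

    # Placeholder bucket/key; archived objects report e.g. "GLACIER" or "DEEP_ARCHIVE".
    obj = boto3.resource("s3").Object("archive-bucket", "cold/2019/part-00000.parquet")
    print(obj.storage_class)  # None for STANDARD objects, hence the `is not None` check
    print(obj.restore)        # None, 'ongoing-request="true"', or a string with an expiry date
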
@@ -1403,56 +1523,59 @@

-
348    @staticmethod
-349    def check_restore_status(source_bucket: str, source_object: str) -> dict:
-350        """Check the restore status of archived data.
-351
-352        Args:
-353            source_bucket: name of bucket to check the restore status.
-354            source_object: object to check the restore status.
-355
-356        Returns:
-357            A dict containing the amount of objects in each status.
-358        """
-359        not_started_objects = 0
-360        ongoing_objects = 0
-361        restored_objects = 0
-362        total_objects = 0
-363
-364        objects_to_restore = _list_objects_recursively(
-365            bucket=source_bucket, path=source_object
-366        )
-367
-368        for obj in objects_to_restore:
-369            ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")
-370
-371            restore_status = ArchiveFileManager._check_object_restore_status(
-372                source_bucket, obj
-373            )
-374            if not restore_status:
-375                ArchiveFileManager._logger.warning(
-376                    f"Restore status not found for {source_bucket}/{obj}"
-377                )
-378            else:
-379                total_objects += 1
-380
-381                if RestoreStatus.NOT_STARTED == restore_status:
-382                    not_started_objects += 1
-383                elif RestoreStatus.ONGOING == restore_status:
-384                    ongoing_objects += 1
-385                else:
-386                    restored_objects += 1
-387
-388                ArchiveFileManager._logger.info(
-389                    f"{obj} restore status is {restore_status.value}"
-390                )
-391
-392        return {
-393            "total_objects": total_objects,
-394            "not_started_objects": not_started_objects,
-395            "ongoing_objects": ongoing_objects,
-396            "restored_objects": restored_objects,
-397        }
+            
432    @staticmethod
+433    def check_restore_status(source_bucket: str, source_object: str) -> dict:
+434        """Check the restore status of archived data.
+435
+436        Args:
+437            source_bucket: name of bucket to check the restore status.
+438            source_object: object to check the restore status.
+439
+440        Returns:
+441            A dict containing the amount of objects in each status.
+442        """
+443        not_started_objects = 0
+444        ongoing_objects = 0
+445        restored_objects = 0
+446        total_objects = 0
+447
+448        if _check_directory(source_bucket, source_object):
+449            source_object = _process_directory_path(source_object)
+450
+451        objects_to_restore = _list_objects_recursively(
+452            bucket=source_bucket, path=source_object
+453        )
+454
+455        for obj in objects_to_restore:
+456            ArchiveFileManager._logger.info(f"Checking restore status for: {obj}")
+457
+458            restore_status = ArchiveFileManager._check_object_restore_status(
+459                source_bucket, obj
+460            )
+461            if not restore_status:
+462                ArchiveFileManager._logger.warning(
+463                    f"Restore status not found for {source_bucket}/{obj}"
+464                )
+465            else:
+466                total_objects += 1
+467
+468                if RestoreStatus.NOT_STARTED == restore_status:
+469                    not_started_objects += 1
+470                elif RestoreStatus.ONGOING == restore_status:
+471                    ongoing_objects += 1
+472                else:
+473                    restored_objects += 1
+474
+475                ArchiveFileManager._logger.info(
+476                    f"{obj} restore status is {restore_status.value}"
+477                )
+478
+479        return {
+480            "total_objects": total_objects,
+481            "not_started_objects": not_started_objects,
+482            "ongoing_objects": ongoing_objects,
+483            "restored_objects": restored_objects,
+484        }
 
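The dict returned by check_restore_status can also be consumed programmatically, for example to report progress (placeholder bucket and prefix, import path assumed):

    from lakehouse_engine.core.file_manager import ArchiveFileManager  # assumed module path

    status = ArchiveFileManager.check_restore_status("archive-bucket", "cold/2019")  # placeholders
    total = status["total_objects"]
    restored = status["restored_objects"]
    if total:
        print(f"{restored}/{total} objects restored ({100 * restored / total:.0f}%)")
    else:
        print("No archived objects found under this prefix.")
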
@@ -1480,42 +1603,45 @@

-
432    @staticmethod
-433    def request_restore(
-434        source_bucket: str,
-435        source_object: str,
-436        restore_expiration: int,
-437        retrieval_tier: str,
-438        dry_run: bool,
-439    ) -> None:
-440        """Request the restore of archived data.
-441
-442        Args:
-443            source_bucket: name of bucket to perform the restore.
-444            source_object: object to be restored.
-445            restore_expiration: restore expiration in days.
-446            retrieval_tier: type of restore, possible values are:
-447                Bulk, Standard or Expedited.
-448            dry_run: if dry_run is set to True the function will print a dict with
-449                all the paths that would be deleted based on the given keys.
-450        """
-451        if dry_run:
-452            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
-453
-454            ArchiveFileManager._logger.info("Paths that would be restored:")
-455            ArchiveFileManager._logger.info(response)
-456        else:
-457            objects_to_restore = _list_objects_recursively(
-458                bucket=source_bucket, path=source_object
-459            )
-460
-461            for obj in objects_to_restore:
-462                ArchiveFileManager._request_restore_object(
-463                    source_bucket,
-464                    obj,
-465                    restore_expiration,
-466                    retrieval_tier,
-467                )
+            
522    @staticmethod
+523    def request_restore(
+524        source_bucket: str,
+525        source_object: str,
+526        restore_expiration: int,
+527        retrieval_tier: str,
+528        dry_run: bool,
+529    ) -> None:
+530        """Request the restore of archived data.
+531
+532        Args:
+533            source_bucket: name of bucket to perform the restore.
+534            source_object: object to be restored.
+535            restore_expiration: restore expiration in days.
+536            retrieval_tier: type of restore, possible values are:
+537                Bulk, Standard or Expedited.
+538            dry_run: if dry_run is set to True the function will print a dict with
+539                all the paths that would be deleted based on the given keys.
+540        """
+541        if _check_directory(source_bucket, source_object):
+542            source_object = _process_directory_path(source_object)
+543
+544        if dry_run:
+545            response = _dry_run(bucket=source_bucket, object_paths=[source_object])
+546
+547            ArchiveFileManager._logger.info("Paths that would be restored:")
+548            ArchiveFileManager._logger.info(response)
+549        else:
+550            objects_to_restore = _list_objects_recursively(
+551                bucket=source_bucket, path=source_object
+552            )
+553
+554            for obj in objects_to_restore:
+555                ArchiveFileManager._request_restore_object(
+556                    source_bucket,
+557                    obj,
+558                    restore_expiration,
+559                    retrieval_tier,
+560                )
 
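request_restore can also be called directly as a static method; a dry run first is a cheap way to confirm which keys would be restored (placeholder values, import path assumed):

    from lakehouse_engine.core.file_manager import ArchiveFileManager  # assumed module path

    ArchiveFileManager.request_restore(
        source_bucket="archive-bucket",   # placeholder bucket
        source_object="cold/2019",        # placeholder prefix
        restore_expiration=5,             # days the restored copies stay readable
        retrieval_tier="Standard",        # Bulk, Standard or Expedited
        dry_run=True,                     # log the keys first, then rerun with False
    )
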
@@ -1545,65 +1671,65 @@

-
469    @staticmethod
-470    def request_restore_and_wait(
-471        source_bucket: str,
-472        source_object: str,
-473        restore_expiration: int,
-474        retrieval_tier: str,
-475        dry_run: bool,
-476    ) -> None:
-477        """Request and wait for the restore to complete, polling the restore status.
-478
-479        Args:
-480            source_bucket: name of bucket to perform the restore.
-481            source_object: object to be restored.
-482            restore_expiration: restore expiration in days.
-483            retrieval_tier: type of restore, possible values are:
-484                Bulk, Standard or Expedited.
-485            dry_run: if dry_run is set to True the function will print a dict with
-486                all the paths that would be deleted based on the given keys.
-487        """
-488        if retrieval_tier != RestoreType.EXPEDITED.value:
-489            ArchiveFileManager._logger.error(
-490                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
-491                "kind of restore should be used just with `Expedited` retrieval tier "
-492                "to save cluster costs."
-493            )
-494            raise ValueError(
-495                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
-496                "kind of restore should be used just with `Expedited` retrieval tier "
-497                "to save cluster costs."
-498            )
-499
-500        ArchiveFileManager.request_restore(
-501            source_bucket=source_bucket,
-502            source_object=source_object,
-503            restore_expiration=restore_expiration,
-504            retrieval_tier=retrieval_tier,
-505            dry_run=dry_run,
-506        )
-507        restore_status = ArchiveFileManager.check_restore_status(
-508            source_bucket, source_object
-509        )
-510        ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
-511
-512        if not dry_run:
-513            ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.")
-514            wait_time = 300
-515            while restore_status.get("total_objects") > restore_status.get(
-516                "restored_objects"
-517            ):
-518                ArchiveFileManager._logger.info(
-519                    "Not all objects have been restored yet, checking the status again "
-520                    f"in {wait_time} seconds."
-521                )
-522                time.sleep(wait_time)
-523                wait_time = 30
-524                restore_status = ArchiveFileManager.check_restore_status(
-525                    source_bucket, source_object
-526                )
-527                ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
+            
562    @staticmethod
+563    def request_restore_and_wait(
+564        source_bucket: str,
+565        source_object: str,
+566        restore_expiration: int,
+567        retrieval_tier: str,
+568        dry_run: bool,
+569    ) -> None:
+570        """Request and wait for the restore to complete, polling the restore status.
+571
+572        Args:
+573            source_bucket: name of bucket to perform the restore.
+574            source_object: object to be restored.
+575            restore_expiration: restore expiration in days.
+576            retrieval_tier: type of restore, possible values are:
+577                Bulk, Standard or Expedited.
+578            dry_run: if dry_run is set to True the function will print a dict with
+579                all the paths that would be restored based on the given keys.
+580        """
+581        if retrieval_tier != RestoreType.EXPEDITED.value:
+582            ArchiveFileManager._logger.error(
+583                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
+584                "kind of restore should be used just with `Expedited` retrieval tier "
+585                "to save cluster costs."
+586            )
+587            raise ValueError(
+588                f"Retrieval Tier {retrieval_tier} not allowed on this operation! This "
+589                "kind of restore should be used just with `Expedited` retrieval tier "
+590                "to save cluster costs."
+591            )
+592
+593        ArchiveFileManager.request_restore(
+594            source_bucket=source_bucket,
+595            source_object=source_object,
+596            restore_expiration=restore_expiration,
+597            retrieval_tier=retrieval_tier,
+598            dry_run=dry_run,
+599        )
+600        restore_status = ArchiveFileManager.check_restore_status(
+601            source_bucket, source_object
+602        )
+603        ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
+604
+605        if not dry_run:
+606            ArchiveFileManager._logger.info("Checking the restore status in 5 minutes.")
+607            wait_time = 300
+608            while restore_status.get("total_objects") > restore_status.get(
+609                "restored_objects"
+610            ):
+611                ArchiveFileManager._logger.info(
+612                    "Not all objects have been restored yet, checking the status again "
+613                    f"in {wait_time} seconds."
+614                )
+615                time.sleep(wait_time)
+616                wait_time = 30
+617                restore_status = ArchiveFileManager.check_restore_status(
+618                    source_bucket, source_object
+619                )
+620                ArchiveFileManager._logger.info(f"Restore status: {restore_status}")
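A companion sketch for the blocking variant above (same assumed import path; values are illustrative). Note that any retrieval tier other than Expedited makes the method raise ValueError:

# Request an Expedited restore and poll check_restore_status until all objects are restored.
from lakehouse_engine.core.file_manager import ArchiveFileManager  # assumed import path

ArchiveFileManager.request_restore_and_wait(
    source_bucket="my-archive-bucket",
    source_object="archive/sales/2021/",
    restore_expiration=7,
    retrieval_tier="Expedited",   # the only tier accepted by this method
    dry_run=False,                # with dry_run=True the status polling is skipped
)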
 
diff --git a/lakehouse_engine/transformers/data_maskers.html b/lakehouse_engine/transformers/data_maskers.html index f60bde2..26960f6 100644 --- a/lakehouse_engine/transformers/data_maskers.html +++ b/lakehouse_engine/transformers/data_maskers.html @@ -111,32 +111,29 @@

 44                else:
 45                    raise WrongArgumentsException("Hashing approach is not supported.")
 46
-47                if suffix and suffix != "":
-48                    masked_df = masked_df.drop(col)
-49
-50            return masked_df
-51
-52        return inner
-53
-54    @classmethod
-55    def column_dropper(cls, cols: List[str]) -> Callable:
-56        """Drop specific columns.
+47            return masked_df
+48
+49        return inner
+50
+51    @classmethod
+52    def column_dropper(cls, cols: List[str]) -> Callable:
+53        """Drop specific columns.
+54
+55        Args:
+56            cols: list of column names to drop.
 57
-58        Args:
-59            cols: list of column names to drop.
-60
-61        Returns:
-62            A function to be called in .transform() spark function.
-63        """
-64
-65        def inner(df: DataFrame) -> DataFrame:
-66            drop_df = df
-67            for col in cols:
-68                drop_df = drop_df.drop(col)
-69
-70            return drop_df
-71
-72        return inner
+58        Returns:
+59            A function to be called in .transform() spark function.
+60        """
+61
+62        def inner(df: DataFrame) -> DataFrame:
+63            drop_df = df
+64            for col in cols:
+65                drop_df = drop_df.drop(col)
+66
+67            return drop_df
+68
+69        return inner

@@ -188,32 +185,29 @@

 45                else:
 46                    raise WrongArgumentsException("Hashing approach is not supported.")
 47
-48                if suffix and suffix != "":
-49                    masked_df = masked_df.drop(col)
-50
-51            return masked_df
-52
-53        return inner
-54
-55    @classmethod
-56    def column_dropper(cls, cols: List[str]) -> Callable:
-57        """Drop specific columns.
+48            return masked_df
+49
+50        return inner
+51
+52    @classmethod
+53    def column_dropper(cls, cols: List[str]) -> Callable:
+54        """Drop specific columns.
+55
+56        Args:
+57            cols: list of column names to drop.
 58
-59        Args:
-60            cols: list of column names to drop.
-61
-62        Returns:
-63            A function to be called in .transform() spark function.
-64        """
-65
-66        def inner(df: DataFrame) -> DataFrame:
-67            drop_df = df
-68            for col in cols:
-69                drop_df = drop_df.drop(col)
-70
-71            return drop_df
-72
-73        return inner
+59        Returns:
+60            A function to be called in .transform() spark function.
+61        """
+62
+63        def inner(df: DataFrame) -> DataFrame:
+64            drop_df = df
+65            for col in cols:
+66                drop_df = drop_df.drop(col)
+67
+68            return drop_df
+69
+70        return inner
@@ -264,12 +258,9 @@

 45                else:
 46                    raise WrongArgumentsException("Hashing approach is not supported.")
 47
-48                if suffix and suffix != "":
-49                    masked_df = masked_df.drop(col)
-50
-51            return masked_df
-52
-53        return inner
+48            return masked_df
+49
+50        return inner
@@ -321,25 +312,25 @@

-
55    @classmethod
-56    def column_dropper(cls, cols: List[str]) -> Callable:
-57        """Drop specific columns.
+            
52    @classmethod
+53    def column_dropper(cls, cols: List[str]) -> Callable:
+54        """Drop specific columns.
+55
+56        Args:
+57            cols: list of column names to drop.
 58
-59        Args:
-60            cols: list of column names to drop.
-61
-62        Returns:
-63            A function to be called in .transform() spark function.
-64        """
-65
-66        def inner(df: DataFrame) -> DataFrame:
-67            drop_df = df
-68            for col in cols:
-69                drop_df = drop_df.drop(col)
-70
-71            return drop_df
-72
-73        return inner
+59        Returns:
+60            A function to be called in .transform() spark function.
+61        """
+62
+63        def inner(df: DataFrame) -> DataFrame:
+64            drop_df = df
+65            for col in cols:
+66                drop_df = drop_df.drop(col)
+67
+68            return drop_df
+69
+70        return inner
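A usage sketch of the column_dropper transformer above (the DataMaskers class name is assumed from the data_maskers module; the dataframe and column names are illustrative):

# Drop a sensitive column through DataFrame.transform().
from pyspark.sql import SparkSession
from lakehouse_engine.transformers.data_maskers import DataMaskers  # class name assumed

spark = SparkSession.builder.appName("column-dropper-example").getOrCreate()
df = spark.createDataFrame([("a@example.com", 1), ("b@example.com", 2)], ["email", "value"])

# column_dropper returns a function to be called in .transform(), as its docstring states.
dropped_df = df.transform(DataMaskers.column_dropper(cols=["email"]))
dropped_df.show()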
 
diff --git a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html index 791bbc3..2980331 100644 --- a/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/jdbc_extraction_utils.html @@ -667,7 +667,7 @@
Inherited Members
- JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: Optional[str] = None)
+ JDBCExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: Optional[str] = None)
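Only user, password, url and dbtable lack defaults in the signature above; a minimal instantiation sketch with illustrative connection values (every other parameter keeps its documented default):

from lakehouse_engine.utils.extraction.jdbc_extraction_utils import JDBCExtraction

# Illustrative connection values only.
extraction = JDBCExtraction(
    user="db_user",
    password="db_password",
    url="jdbc:sap://my-sap-host:30015",
    dbtable="MY_SCHEMA.MY_TABLE",
)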
diff --git a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html index 8d456b6..1fc9f3e 100644 --- a/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/sap_b4_extraction_utils.html @@ -536,7 +536,7 @@
Inherited Members
- SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000')
+ SAPB4Extraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: str = 'REQTSN DECIMAL(23,0)', min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)', latest_timestamp_input_col: str = 'REQTSN', request_status_tbl: str = 'SAPHANADB.RSPMREQUEST', request_col_name: str = 'REQUEST_TSN', data_target: Optional[str] = None, act_req_join_condition: Optional[str] = None, include_changelog_tech_cols: Optional[bool] = None, extra_cols_req_status_tbl: Optional[str] = None, request_status_tbl_filter: Optional[str] = None, adso_type: Optional[str] = None, default_max_timestamp: str = '1970000000000000000000')
diff --git a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html index f0d1aba..1506a30 100644 --- a/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html +++ b/lakehouse_engine/utils/extraction/sap_bw_extraction_utils.html @@ -511,7 +511,7 @@

- SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231012165159', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000')
+ SAPBWExtraction( user: str, password: str, url: str, dbtable: str, calc_upper_bound_schema: Optional[str] = None, changelog_table: Optional[str] = None, partition_column: Optional[str] = None, latest_timestamp_data_location: Optional[str] = None, latest_timestamp_data_format: str = 'delta', extraction_type: str = 'delta', driver: str = 'com.sap.db.jdbc.Driver', num_partitions: Optional[int] = None, lower_bound: Union[int, float, str, NoneType] = None, upper_bound: Union[int, float, str, NoneType] = None, default_upper_bound: str = '1', fetch_size: str = '100000', compress: bool = True, custom_schema: Optional[str] = None, min_timestamp: Optional[str] = None, max_timestamp: Optional[str] = None, generate_predicates: bool = False, predicates: Optional[List] = None, predicates_add_null: bool = True, extraction_timestamp: str = '20231018182628', max_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)', latest_timestamp_input_col: str = 'actrequest_timestamp', act_request_table: str = 'SAPPHA.RSODSACTREQ', request_col_name: str = 'actrequest', act_req_join_condition: Optional[str] = None, odsobject: Optional[str] = None, include_changelog_tech_cols: bool = True, extra_cols_act_request: Optional[str] = None, get_timestamp_from_act_request: bool = False, sap_bw_schema: str = 'SAPPHA', default_max_timestamp: str = '197000000000000')
diff --git a/search.js b/search.js index fad226a..4de7dca 100644 --- a/search.js +++ b/search.js @@ -1,6 +1,6 @@ window.pdocSearch = (function(){ /** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */
Lakehouse engine package containing all the system subpackages.

\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "

Package containing all the lakehouse engine algorithms.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "

Module containing the Algorithm class.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "

Class to define the behavior of every algorithm based on ACONs.

\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "

Get data quality specification object from acon.

\n\n

Args:\n spec: data quality specifications.

\n\n

Returns:\n The DQSpec and the List of DQ Functions Specs.

\n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "

Module to define DataLoader class.

\n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "

Load data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.

\n\n

Since an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.

\n\n

As the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "

Construct DataLoader algorithm instances.

\n\n

A data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:

\n\n
- input specifications (mandatory): specify how to read data.\n- transform specifications (optional): specify how to transform data.\n- data quality specifications (optional): specify how to execute the data\n    quality process.\n- output specifications (mandatory): specify how to write data to the\n    target.\n- terminate specifications (optional): specify what to do after writing into\n    the target (e.g., optimizing target table, vacuum, compute stats, etc).\n
\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n

Returns:\n An ordered dict with all the dataframes that were read.

\n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "

Transform (optionally) the data that was read.

\n\n

If there isn't a transformation specification, this step will be skipped and the\noriginal dataframes that were read will be returned.\nTransformations can depend on the result of another transformation; however,\nkeep in mind that if we are using a streaming source and for some reason\nneed to enable micro batch processing, that result cannot be used as input to\nanother transformation. Micro batch processing in pyspark streaming is only\navailable in .write(), which means a transformation with micro batch needs\nto be the end of the process.

\n\n

Args:\n data: input dataframes in an ordered dict.

\n\n

Returns:\n Another ordered dict with the transformed dataframes, according to the\n transformation specification.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read and/or transformed.

\n\n

It supports multiple input dataframes, although just one is advisable.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n

Args:\n data: dataframes from previous steps of the algorithm that we wish to\n run the DQ process on.

\n\n

Returns:\n Another ordered dict with the validated dataframes.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "

Write the data that was read and transformed (if applicable).

\n\n

It supports writing multiple datasets. However, we only recommend writing one\ndataframe. This recommendation is based on easy debugging and reproducibility,\nsince if we start mixing several datasets being fueled by the same algorithm, it\nwould unleash an infinite sea of reproducibility issues plus tight coupling and\ndependencies between datasets. Having said that, there may be cases where\nwriting multiple datasets is desirable according to the use case requirements.\nUse it accordingly.

\n\n

Args:\n data: dataframes that were read and transformed (if applicable).

\n\n

Returns:\n Dataframes that were written.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "

Terminate the algorithm.

\n\n

Args:\n data: dataframes that were written.

\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "

Module to define Data Validator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "

Validate data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on isolating Data Quality Validations from loading,\napplying a set of data quality functions to a specific input dataset,\nwithout the need to define any output specification.\nYou can use any input specification compatible with the lakehouse engine\n(dataframe, table, files, etc).

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "

Construct DQValidator algorithm instances.

\n\n

A data quality validator needs the following specifications to work\nproperly:\n - input specification (mandatory): specify how and what data to\n read.\n - data quality specification (mandatory): specify how to execute\n the data quality process.\n - restore_prev_version (optional): specify if, having\n delta table/files as input, they should be restored to the\n previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n

Returns:\n Dataframe with data that was read.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read.

\n\n

It supports a single input dataframe.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n

Args:\n data: input dataframe on which to run the DQ process.

\n\n

Returns:\n Validated dataframe.

\n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "

Package defining all the algorithm custom exceptions.

\n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "

Exception for when the reconciliation process fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "

Exception for when no new data is available.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "

Exception for when a sensor with the same sensor id already exists.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "

Exception for when the restore type is not found.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "

Module containing the Reconciliator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "

Type of Reconciliation.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "

Transformers Available for the Reconciliation Algorithm.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "

\n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "

Class to define the behavior of an algorithm that checks if data reconciles.

\n\n

Checking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.

\n\n

The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).

\n\n

All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.

\n\n

Notes:\n - It is crucial that both the current and truth datasets have exactly the same\n structure.\n - You should not use 0 as yellow or red threshold, as the algorithm will verify\n if the difference between the truth and current values is bigger\n than or equal to those thresholds.\n - The reconciliation does not produce any negative values or percentages, as we\n use the absolute value of the differences. This means that the recon result\n will not indicate if it was the current values that were bigger or smaller\n than the truth values, or vice versa.

\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "

Get the source of truth (expected result) for the reconciliation process.

\n\n

Returns:\n DataFrame containing the source of truth.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "

Get the current results from the table that we are checking if it reconciles.

\n\n

Returns:\n DataFrame containing the current results.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "

Reconcile the current results against the truth dataset.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "

Module to define Sensor algorithm behavior.

\n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "

Class representing a sensor to check if the upstream has new data.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "

Construct Sensor instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "

Execute the sensor.

\n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "

This module receives a config file which is included in the wheel.

\n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "

Package with the core behaviour of the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "

Definitions of standard values and structures for core components.

\n"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n

Return:\n A list with all enum values.

\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "

Checks if the input format exists in the enum values.

\n\n

Args:\n input_format: format to check if exists.

\n\n

Return:\n If the input format exists in our enum.

\n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "

Formats of algorithm output.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "

Type of notifier available.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "

\n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationEmailServers", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationEmailServers", "kind": "class", "doc": "

Types of email server with special behaviour.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "

Parameters to be replaced in runtime.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "

Define the types of read operations.

\n\n

BATCH - read the data in batch mode (e.g., Spark batch).\nSTREAMING - read the data in streaming mode (e.g., Spark streaming).

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "

Different modes that control how we handle compliance with the provided schema.

\n\n

These read modes map to Spark's read modes at the moment.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "

Defaults used on the data quality process.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "

Types of write operations.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "

Specification of an algorithm input.

\n\n

This is closely aligned with the way the execution environment connects to the sources\n(e.g., spark sources).

\n\n

spec_id: spec_id of the input specification.\nread_type: ReadType type of read operation.\ndata_format: format of the input.\nsftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\n directory.\ndf_name: dataframe name.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies from where to read data in the specified format.\nenforce_schema_from_table: if we want to enforce the table schema or not, by\n providing a table name in the form of db.table.\nquery: sql query to execute and return the dataframe. Use it if you do not want to\n read from a file system nor from a table, but rather from a sql query instead.\nschema: dict representation of a schema of the input (e.g., Spark struct type\n schema).\nschema_path: path to a file with a representation of a schema of the input (e.g.,\n Spark struct type schema).\nwith_filepath: if we want to include the path of the file that is being read. Only\n works with the file reader (batch and streaming modes are supported).\noptions: dict with other relevant options according to the execution\n environment (e.g., spark) possible sources.\ncalculate_upper_bound: whether to calculate the upper bound to extract from SAP BW or not.\ncalc_upper_bound_schema: specific schema for the calculated upper_bound.\ngenerate_predicates: whether to generate predicates to extract from SAP BW or not.\npredicates_add_null: if we want to include is null on partition by predicates.
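As a minimal sketch, an input specification for a batch read of a delta table could be built as follows; the spec id and table name are illustrative:

    from lakehouse_engine.core.definitions import InputSpec, ReadType

    # Illustrative batch read of a delta table (spec_id and db_table are made up).
    input_spec = InputSpec(
        spec_id="sales_source",
        read_type=ReadType.BATCH.value,
        data_format="delta",
        db_table="my_database.sales",
    )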

\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "

Transformer Specification, i.e., a single transformation amongst many.

\n\n

function: name of the function (or callable function) to be executed.\nargs: (not applicable if using a callable function) dict with the arguments to pass\nto the function, pairing the name of each function parameter with its\nrespective value.
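A minimal sketch of a single transformer addressed by name; the function name and arguments below are illustrative and not a confirmed part of the engine's transformer catalogue:

    from lakehouse_engine.core.definitions import TransformerSpec

    # Illustrative transformer: function name and args are assumptions.
    transformer = TransformerSpec(
        function="repartition",
        args={"num_partitions": 10},
    )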

\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "

Transformation Specification.

\n\n

I.e., the specification that defines the many transformations to be done to the data\nthat was read.

\n\n

spec_id: id of the transform specification.\ninput_id: id of the corresponding input\nspecification.\ntransformers: list of transformers to execute.\nforce_streaming_foreach_batch_processing: sometimes, when using streaming, we want\n to force the transform to be executed in the foreachBatch function to ensure\n non-supported streaming operations can be properly executed.
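Building on the transformer sketch above, a transform specification wraps a list of transformers and ties them to an input spec id; the ids below are illustrative:

    from lakehouse_engine.core.definitions import TransformSpec, TransformerSpec

    transform_spec = TransformSpec(
        spec_id="sales_transformed",
        input_id="sales_source",
        transformers=[
            TransformerSpec(function="repartition", args={"num_partitions": 10}),
        ],
    )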

\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "

Available data quality tasks.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "

\n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQType.ASSISTANT", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.ASSISTANT", "kind": "variable", "doc": "

\n", "default_value": "<DQType.ASSISTANT: 'assistant'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "

Defines a data quality function specification.

\n\n

function - name of the data quality function (expectation) to execute.\nIt follows the great_expectations api https://greatexpectations.io/expectations/.\nargs - args of the function (expectation). Follows the same api as above.
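A minimal sketch, assuming a standard great_expectations expectation and an illustrative column name:

    from lakehouse_engine.core.definitions import DQFunctionSpec

    dq_function = DQFunctionSpec(
        function="expect_column_values_to_not_be_null",
        args={"column": "sales_id"},  # illustrative column
    )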

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "

Data quality overall specification.

\n\n
spec_id - id of the specification.\ninput_id - id of the input specification.\ndq_type - type of DQ process to execute (e.g. validator).\ndq_functions - list of function specifications to execute.\nunexpected_rows_pk - the list of columns composing the primary key of the\n    source data to identify the rows failing the DQ validations. Note: only one\n    of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to be provided. It\n    is mandatory to provide one of these arguments when using tag_source_data\n    as True. When tag_source_data is False, this is not mandatory, but still\n    recommended.\ntbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.\n    Note: only one of the tbl_to_derive_pk or unexpected_rows_pk arguments needs to\n    be provided. It is mandatory to provide one of these arguments when using\n    tag_source_data as True. When tag_source_data is False, this is not\n    mandatory, but still recommended.\ngx_result_format - great expectations result format. Default: \"COMPLETE\".\n
\n\n

tag_source_data - when set to True, this will ensure that the DQ process ends by\n tagging the source data with an additional column with information about the\n DQ results. This column makes it possible to identify if the DQ run was\n successful in general and, if not, it unlocks the insights to know which\n specific rows have made the DQ validations fail and why. Default: False.\n Note: it only works if result_sink_explode is True, gx_result_format is\n COMPLETE, fail_on_error is False (which is done automatically when\n you specify tag_source_data as True) and tbl_to_derive_pk or\n unexpected_rows_pk is configured.\n store_backend - which store_backend to use (e.g. s3 or file_system).\n local_fs_root_dir - path of the root directory. Note: only applicable for\n store_backend file_system.\n bucket - the bucket name to consider for the store_backend (store DQ artefacts).\n Note: only applicable for store_backend s3.\n data_docs_bucket - the bucket name for data docs only. When defined, it will\n supersede the bucket parameter.\n expectations_store_prefix - prefix where to store expectations' data. Note: only\n applicable for store_backend s3.\n validations_store_prefix - prefix where to store validations' data. Note: only\n applicable for store_backend s3.\n data_docs_prefix - prefix where to store data_docs' data. Note: only applicable\n for store_backend s3.\n checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\n applicable for store_backend s3.\n data_asset_name - name of the data asset to consider when configuring the great\n expectations' data source.\n expectation_suite_name - name to consider for great expectations' suite.\n assistant_options - additional options to pass to the DQ assistant processor.\n result_sink_db_table - db.table_name indicating the database and table in which\n to save the results of the DQ process.\n result_sink_location - file system location in which to save the results of the\n DQ process.\n result_sink_partitions - the list of partitions to consider.\n result_sink_format - format of the result table (e.g. delta, parquet, kafka...).\n result_sink_options - extra spark options for configuring the result sink.\n E.g., can be used to configure a Kafka sink if result_sink_format is kafka.\n result_sink_explode - flag to determine if the output table/location should have\n the columns exploded (as True) or not (as False). Default: True.\n result_sink_extra_columns - list of extra columns to be exploded (following\n the pattern \".*\") or columns to be selected. It is only used when\n result_sink_explode is set to True.\n source - name of the data source, to be easier to identify in analysis. If not\n specified, it is set to a default value. This will only be used\n when result_sink_explode is set to True.\n fail_on_error - whether to fail the algorithm if the validations of your data in\n the DQ process failed.\n cache_df - whether to cache the dataframe before running the DQ process or not.\n critical_functions - functions that should not fail. When this argument is\n defined, fail_on_error is nullified.\n max_percentage_failure - percentage of failure that should be allowed.\n This argument has priority over both fail_on_error and critical_functions.
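A minimal sketch of a validator DQ specification, assuming an s3 store backend; the bucket, asset and suite names are illustrative:

    from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec, DQType

    dq_spec = DQSpec(
        spec_id="sales_dq",
        input_id="sales_source",
        dq_type=DQType.VALIDATOR.value,
        bucket="my-dq-bucket",                 # illustrative bucket
        data_asset_name="sales",               # illustrative asset name
        expectation_suite_name="sales_suite",  # illustrative suite name
        dq_functions=[
            DQFunctionSpec("expect_column_values_to_not_be_null", {"column": "sales_id"}),
        ],
    )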

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tassistant_options: Optional[dict] = None,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": "

Options for a merge operation.

\n\n

merge_predicate: predicate to apply to the merge operation so that we can check if a\n new record corresponds to a record already included in the historical data.\ninsert_only: indicates if the merge should only insert data (e.g., deduplicate\n scenarios).\ndelete_predicate: predicate to apply to the delete operation.\nupdate_predicate: predicate to apply to the update operation.\ninsert_predicate: predicate to apply to the insert operation.\nupdate_column_set: rules to apply to the update operation which allows to set the\n value for each column to be updated.\n (e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )\ninsert_column_set: rules to apply to the insert operation which allows to set the\n value for each column to be inserted.\n (e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
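A minimal sketch of merge options keyed on an id column; the predicates and column names are illustrative and reuse the current/new aliasing shown in the update_column_set example above:

    from lakehouse_engine.core.definitions import MergeOptions

    merge_opts = MergeOptions(
        merge_predicate="current.sales_id = new.sales_id",        # illustrative key
        update_predicate="new.updated_at > current.updated_at",   # illustrative columns
    )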

\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "

Specification of an algorithm output.

\n\n

This is closely aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).

\n\n

spec_id: id of the output specification.\ninput_id: id of the corresponding input specification.\nwrite_type: type of write operation.\ndata_format: format of the output. Defaults to DELTA.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies where to write data in the specified format.\npartitions: list of partition input_col names.\nmerge_opts: options to apply to the merge operation.\nstreaming_micro_batch_transformers: transformers to invoke for each streaming micro\n batch, before writing (i.e., in Spark's foreachBatch structured\n streaming function). Note: the lakehouse engine manages this for you, so\n you don't have to manually specify streaming transformations through this\n parameter. Supply them as regular transformers in the transform_specs sections\n of an ACON.\nstreaming_once: if the streaming query is to be executed just once, or not,\n generating just one micro batch.\nstreaming_processing_time: if the streaming query is to be kept alive, this indicates\n the processing time of each micro batch.\nstreaming_available_now: if set to True, set a trigger that processes all available\n data in multiple batches and then terminates the query.\n When using streaming, this is the default trigger that the lakehouse-engine will\n use, unless you configure a different one.\nstreaming_continuous: set a trigger that runs a continuous query with a given\n checkpoint interval.\nstreaming_await_termination: whether to wait (True) for the termination of the\n streaming query (e.g. timeout or exception) or not (False). Default: True.\nstreaming_await_termination_timeout: a timeout to set to the\n streaming_await_termination. Default: None.\nwith_batch_id: whether to include the streaming batch id in the final data, or not.\n It only takes effect in streaming mode.\noptions: dict with other relevant options according to the execution environment\n (e.g., spark) possible outputs. E.g., JDBC options, checkpoint location for\n streaming, etc.\nstreaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\n but for the DQ functions to be executed. Used internally by the lakehouse\n engine, so you don't have to supply DQ functions through this parameter. Use the\n dq_specs of the acon instead.
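A minimal sketch of an output specification writing the transformed data as delta in append mode; the location and partition column are illustrative:

    from lakehouse_engine.core.definitions import OutputSpec, WriteType

    output_spec = OutputSpec(
        spec_id="sales_bronze",
        input_id="sales_transformed",
        write_type=WriteType.APPEND.value,
        data_format="delta",
        location="s3://my-bucket/bronze/sales",  # illustrative location
        partitions=["order_date"],               # illustrative partition column
    )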

\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "

Terminator Specification.

\n\n

I.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.

\n\n

spec_id: id of the terminate specification.\nfunction: terminator function to execute.\nargs: arguments of the terminator function.\ninput_id: id of the corresponding output specification (Optional).
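A minimal sketch of a terminator; the function name and arguments here are purely illustrative and not a confirmed part of the engine's terminator catalogue:

    from lakehouse_engine.core.definitions import TerminatorSpec

    terminator_spec = TerminatorSpec(
        function="vacuum",                       # illustrative function name
        args={"db_table": "my_database.sales"},  # illustrative args
    )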

\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "

Reconciliator Specification.

\n\n

metrics: list of metrics in the form of:\n [{\n metric: name of the column present in both truth and current datasets,\n aggregation: sum, avg, max, min, ...,\n type: percentage or absolute,\n yellow: value,\n red: value\n }].\nrecon_type: reconciliation type (percentage or absolute). Percentage calculates\n the difference between truth and current results as a percentage ((x - y) / x), and\n absolute calculates the raw difference (x - y).\ntruth_input_spec: input specification of the truth data.\ncurrent_input_spec: input specification of the current results data.\ntruth_preprocess_query: additional query on top of the truth input data to\n preprocess the truth data before it gets fueled into the reconciliation process.\n Important note: you need to assume that the data out of\n the truth_input_spec is referenceable by a table called 'truth'.\ntruth_preprocess_query_args: optional dict having the functions/transformations to\n apply on top of the truth_preprocess_query and respective arguments. Note: cache\n is being applied on the Dataframe, by default. For turning the default behavior\n off, pass \"truth_preprocess_query_args\": [].\ncurrent_preprocess_query: additional query on top of the current results input data\n to preprocess the current results data before it gets fueled into the\n reconciliation process. Important note: you need to assume that the data out of\n the current_results_input_spec is referenceable by a table called 'current'.\ncurrent_preprocess_query_args: optional dict having the functions/transformations to\n apply on top of the current_preprocess_query and respective arguments. Note:\n cache is being applied on the Dataframe, by default. For turning the default\n behavior off, pass \"current_preprocess_query_args\": [].\nignore_empty_df: optional boolean to ignore the recon process if both the source and\n target dataframes are empty; in that case the recon exits with success (passed).
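An illustrative metrics entry following the structure described above; the threshold semantics noted in the comments are assumptions:

    # Illustrative metric for a percentage reconciliation of a summed column.
    metrics = [
        {
            "metric": "net_sales",  # column present in both truth and current
            "aggregation": "sum",
            "type": "percentage",
            "yellow": 0.01,         # assumed warning threshold
            "red": 0.05,            # assumed failure threshold
        }
    ]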

\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "

Data Quality Validator Specification.

\n\n

input_spec: input specification of the data to be checked/validated.\ndq_spec: data quality specification.\nrestore_prev_version: specify if, when having a delta table/files as input, it should\n be restored to the previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "

SQL definitions statements.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "

File Manager s3 api keys.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "

Sensor Specification.

\n\n

sensor_id: sensor id.\nassets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.\ncontrol_db_table_name: db.table to store sensor metadata.\ninput_spec: input specification of the source to be checked for new data.\npreprocess_query: SQL query to transform/filter the result from the\n upstream. Consider that we should refer to 'new_data' whenever\n we are referring to the input of the sensor. E.g.:\n \"SELECT dummy_col FROM new_data WHERE ...\"\ncheckpoint_location: optional location to store checkpoints to resume\n from. These checkpoints use the same strategy as Spark's checkpoints.\n For Spark readers that do not support checkpoints, use the\n preprocess_query parameter to form a SQL query to filter the result\n from the upstream accordingly.\nfail_on_empty_result: if the sensor should throw an error if there is no new\n data in the upstream. Default: True.
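A minimal sketch of a sensor specification watching a delta table; all ids, table names and locations are illustrative:

    from lakehouse_engine.core.definitions import InputSpec, ReadType, SensorSpec

    sensor_spec = SensorSpec(
        sensor_id="upstream_sales_sensor",
        assets=["sales_bronze"],
        control_db_table_name="controls.sensor_control",  # illustrative db.table
        input_spec=InputSpec(
            spec_id="upstream",
            read_type=ReadType.STREAMING.value,
            data_format="delta",
            db_table="my_database.sales",
        ),
        preprocess_query=None,
        checkpoint_location="s3://my-bucket/checkpoints/upstream_sales_sensor",
    )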

\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "

Create SensorSpec from acon.

\n\n

Args:\n acon: sensor ACON.

\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "

Status for a sensor.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "

Defaults used on consuming data from SAP Logchain.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "

Restore types.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n

Return:\n A list with all enum values.

\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "

Checks if the restore type exists in the enum values.

\n\n

Args:\n restore_type: restore type to check if exists.

\n\n

Return:\n If the restore type exists in our enum.

\n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "

Restore statuses.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "

Module to take care of creating a singleton of the execution environment class.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "

Represents the basic resources regarding the engine execution environment.

\n\n

Currently, it is solely used to encapsulate the logic to get a Spark session.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "

Get or create an execution environment session (currently Spark).

\n\n

It instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.

\n\n

Args:\n session: spark session.\n enable_hive_support: whether to enable hive support or not.\n app_name: application name.\n config: extra spark configs to supply to the spark session.
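A minimal usage sketch; the app name and Spark config key are illustrative, and accessing the singleton afterwards through the class attribute is an assumption:

    from lakehouse_engine.core.exec_env import ExecEnv

    ExecEnv.get_or_create(
        app_name="my_lakehouse_job",                     # illustrative name
        config={"spark.sql.shuffle.partitions": "400"},  # illustrative config
    )
    spark = ExecEnv.SESSION  # assumed attribute holding the singleton SparkSession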

\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "

Module representing an executable lakehouse engine component.

\n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "

Abstract class defining the behaviour of an executable component.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "

Define the executable component behaviour.

\n\n

E.g., the behaviour of an algorithm inheriting from this.

\n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "

File manager module.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "

Set of actions to manipulate files in several ways.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "

Construct FileManager algorithm instances.

\n\n

Args:\n configs: configurations for the FileManager algorithm.

\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.get_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "

Delete objects and 'directories' in s3.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "

Copies objects and 'directories' in s3.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n

After the restore is done, copy the restored files to the destination.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "

Set of actions to restore archives.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n\n

Args:\n source_bucket: name of bucket to check the restore status.\n source_object: object to check the restore status.

\n\n

Returns:\n A dict containing the amount of objects in each status.

\n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n\n

Args:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.

\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n

Args:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.

\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "

Module to define Sensor Manager classes.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "

Class to control the Sensor execution.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "

Check if sensor has acquired new data.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to control sensor runs.

\n\n

Returns:\n True if acquired new data, otherwise False

\n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "

Control sensor execution storing the execution data in a delta table.

\n\n

Args:\n sensor_spec: sensor spec containing all sensor\n information we need to update the control status.\n status: status of the sensor.\n upstream_key: upstream key (e.g., used to store an attribute\n name from the upstream so that new data can be detected\n automatically).\n upstream_value: upstream value (e.g., used to store the max\n attribute value from the upstream so that new data can be\n detected automatically).

\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "

Read data from delta table containing sensor status info.

\n\n

Args:\n sensor_id: sensor id. If this parameter is defined search occurs\n only considering this parameter. Otherwise, it considers sensor\n assets and checkpoint location.\n control_db_table_name: db.table to control sensor runs.\n assets: list of assets that are fueled by the pipeline\n where this sensor is.

\n\n

Return:\n Row containing the data for the provided sensor_id.

\n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "

Class to deal with Sensor Upstream data.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "

Generates a sensor preprocess query based on timestamp logic.

\n\n

Args:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder ?upstream_value so that\n it can be replaced by the upstream_value in the\n control_db_table_name for this specific sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information\n to control how to identify new data from the\n upstream (e.g., a time column in the upstream).\n upstream_value: value for custom sensor\n to identify new data from the upstream\n (e.g., the value of a time present in the upstream)\n If none we will set the default value.\n Note: This parameter is used just to override the\n default value -2147483647.\n upstream_table_name: value for custom sensor\n to query new data from the upstream.\n If none we will set the default value,\n our sensor_new_data view.
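A minimal usage sketch, assuming a timestamp column in the upstream and the ?upstream_value placeholder described above; the names are illustrative:

    from lakehouse_engine.core.sensor_manager import SensorUpstreamManager

    query = SensorUpstreamManager.generate_filter_exp_query(
        sensor_id="upstream_sales_sensor",
        filter_exp="load_ts > ?upstream_value",           # illustrative filter
        control_db_table_name="controls.sensor_control",  # illustrative db.table
    )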

\n\n

Return:\n The query string.

\n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "

Generates a query to be used for a sensor having other sensor as upstream.

\n\n

Args:\n sensor_id: sensor id.

\n\n

Return:\n The query string.

\n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "

Read new data from the upstream into the sensor 'new_data_df'.

\n\n

Args:\n sensor_spec: sensor spec containing all sensor information.

\n\n

Return:\n An empty dataframe if it doesn't have new data otherwise the new data

\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "

Get new data from upstream df if it's present.

\n\n

Args:\n new_data_df: DataFrame possibly containing new data.

\n\n

Return:\n Optional row, present if there is new data in the upstream,\n absent otherwise.

\n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n

Args:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the sap chain is already finished.\n status: status to consider the SAP chain as successfully finished\n (default: the green status 'G').\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.

\n\n

Return:\n The query string.

\n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "

Table manager module.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "

Set of actions to manipulate tables/views in several ways.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "

Construct TableManager algorithm instances.

\n\n

Args:\n configs: configurations for the TableManager algorithm.

\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "

Create a new table or view on metastore.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "

Create multiple tables or views on metastore.

\n\n

In this function the path to the ddl files can be separated by comma.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "

Compute table statistics.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "

Delete table function deletes table from metastore and erases all data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "

Delete view function deletes view from metastore and erases all data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "

Truncate function erases all data but keeps metadata.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "

Vacuum function erases older versions from Delta Lake tables or locations.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "

Describe function describes metadata from some table or view.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "

Optimize function optimizes the layout of Delta Lake data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "

Execute multiple statements in multiple sql files.

\n\n

In this function, the paths to the files are separated by commas.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "

Execute SQL commands separated by semicolons (;).

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "

Show Table Properties.

\n\n

Returns: a dataframe with the table properties.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "

Get the primary key of a particular table.

\n\n

Returns: the list of columns that are part of the primary key.

\n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "

Run the repair table command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "

Run the delete where command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "

Package to define data quality processes available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "kind": "module", "doc": "

Module containing the definition of a data assistant.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant", "kind": "class", "doc": "

Class containing the data assistant.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant.run_data_assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant.run_data_assistant", "kind": "function", "doc": "

Entrypoint to run the data assistant.

\n\n

Based on the data, it uses the GE Onboarding Data Assistant to generate expectations\nthat can be applied to the data. Then, it returns the generated expectations\nand, depending on your configuration, it can display plots of the metrics and\nexpectations, and also display or store the profiling of the data, so you can get\na better sense of it.

\n\n

Args:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n assistant_options: additional options to pass to the DQ assistant processor.\n data: the input dataframe for which the DQ is running.\n profile_file_name: file name for storing the profiling html file.

\n\n

Returns:\n The context with the expectation suite stored.

\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tassistant_options: dict,\tdata: pyspark.sql.dataframe.DataFrame,\tprofile_file_name: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "

Package containing custom DQ expectations available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "

Expectation to check if column 'a' is lower or equal than column 'b'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "

Asserts that column 'A' is lower or equal than column 'B'.

\n\n

Additionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "

Expect values in column A to be lower or equal than column B.

\n\n

Args:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.

\n\n

Keyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "

Expectation to check if column value is a date within a timeframe.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "

Asserts that column values are a date that isn't older than a given date.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "

Expect value in column to be date that is not older than a given time.

\n\n

Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.

\n\n

Args:\n column: Name of column to validate\n Note: Column must be of type Date, Timestamp or String (with Timestamp format).\n Format: yyyy-MM-ddTHH:mm:ss\n timeframe: dict with the definition of the timeframe.\n kwargs: dict with additional parameters.

\n\n

Keyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "

Expectation to check if column 'a' equals 'b', or 'c'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "

Expectation metric definition.

\n\n

This expectation asserts that column 'a' must equal column 'b' or column 'c'.\nIn addition, it is possible to validate that column 'b' or 'c' matches a regex.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "

MultiColumn Expectation.

\n\n

Expect column 'a' to be equal to 'b' when 'b' is\nnot empty; otherwise 'a' must be equal to 'c'.

\n\n

Args:\n column_list: The column names to evaluate.

\n\n

Keyword Args:\n ignore_row_if: default to \"never\".\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY.\n Default set to BASIC.\n include_config: If True, then include the expectation\n config as part of the result object.\n Default set to True.\n catch_exceptions: If True, then catch exceptions\n and include them as part of the result object.\n Default set to False.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "

Expectation to check if aggregated column satisfy the condition.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "

Expect agg of column to satisfy the condition specified.

\n\n

Args:\n template_dict: dict with the following keys:\n column (column to aggregate and check).\n group_column_list (group by column names to be listed).\n condition (how to validate the aggregated value eg: between,\n greater, lesser).\n max_value (maximum allowed value).\n min_value (minimum allowed value).\n agg_type (sum/count/max/min).

\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "

Validates that a configuration has been set.

\n\n

Args:\n configuration (Optional[ExpectationConfiguration]):\n An optional Expectation Configuration entry.

\n\n

Returns:\n None. Raises InvalidExpectationConfigurationError if the\n configuration is not valid.

\n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "

Module containing the class definition of the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "

Class for the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "

Run the specified data quality process on a dataframe.

\n\n

Based on the dq_specs, we apply the defined expectations on top of the dataframe\nto perform the necessary validations and then output the result of\nthe data quality process.

\n\n

Args:\n dq_spec: data quality specification.\n data: input dataframe to run the dq process on.

\n\n

Returns:\n The DataFrame containing the results of the DQ process.

\n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "

Package defining all the DQ custom exceptions.

\n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "

Exception for when the data quality validations fail.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "

Exception for when the checkpoint results parsing fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "

Module containing the definition of a data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "

Class containing the data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "

Get a validator according to the specification.

\n\n

We use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.

\n\n

Args:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: run time batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n dq_functions: a list of DQFunctionSpec to consider in the expectation suite.\n critical_functions: list of critical expectations in the expectation suite.

\n\n

Returns:\n The validator with the expectation suite stored.

\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "

Tags the source dataframe with a new column having the DQ results.

\n\n

Args:\n source_pk: the primary key of the source data.\n source_df: the source dataframe to be tagged with DQ results.\n results_df: dq results dataframe.

\n\n

Returns: a dataframe tagged with the DQ results.

\n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "

Contract of the lakehouse engine with all the available functions to be executed.

\n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "

Load data using the DataLoader algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "

Execute the Reconciliator algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "

Execute the DQValidator algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "

Manipulate tables/views using Table Manager algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "

Manipulate s3 files using File Manager algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "

Execute a sensor based on a Sensor Algorithm Configuration.

\n\n

A sensor is useful to check if an upstream system has new data.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously identified,\nhence updating the shifted sensor status.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA,\nbut there might be scenarios - still to be identified -\nwhere the sensor status can be updated from/to different statuses.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.

\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "

Generates a preprocess query to be used in a sensor configuration.

\n\n

Args:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholders ?default_upstream_key and\n ?default_upstream_value, so that they can be replaced by the\n respective values in the control_db_table_name for this specific\n sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information to control how to\n identify new data from the upstream (e.g., a time column in the\n upstream).\n upstream_value: the upstream value\n to identify new data from the upstream (e.g., the value of a time\n present in the upstream).\n upstream_table_name: value for the custom sensor\n to query new data from the upstream.\n If None, the default value is used:\n our sensor_new_data view.

\n\n

Returns:\n The query string.

\n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n

Args:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the SAP chain is already finished.\n status: status value to check for, indicating that the SAP chain\n has finished.\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.

\n\n

Returns:\n The query string.

\n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "

Send a notification using a notifier.

\n\n

Args:\n args: arguments for the notifier.

\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "

Input and Output package responsible for the behaviour of reading and writing.

\n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "

Package defining all the io custom exceptions.

\n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "

Exception for when the input of an incremental filter is not found.

\n\n

This may occur when tables are being loaded incrementally, taking the increment\ndefinition from a specific table that does not exist yet, most likely because\nit has not been loaded for the first time.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "

Exception for when a user provides a wrong I/O format.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "

Exception for when a user provides a not supported operation.

\n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "

Defines abstract reader behaviour.

\n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "

Abstract Reader class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "

Construct Reader instances.

\n\n

Args:\n input_spec: input specification for reading data.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "

Abstract read method.

\n\n

Returns:\n A dataframe read according to the input specification.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "

Module for reader factory.

\n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "

Class for reader factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "

Get data according to the input specification following a factory pattern.

\n\n

Args:\n spec: input specification to get the data.

\n\n

Returns:\n A dataframe containing the data.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "

Readers package to define reading behaviour.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "

Module to define behaviour to read from dataframes.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "

Class to read data from a dataframe.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "

Construct DataFrameReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "

Read data from a dataframe.

\n\n

Returns:\n A dataframe containing the data from a dataframe previously\n computed.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "

Module to define behaviour to read from files.

\n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "

Class to read from files.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "

Construct FileReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "

Read file data.

\n\n

Returns:\n A dataframe containing the data from the files.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "

Module to define behaviour to read from JDBC sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "

Class to read from JDBC source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "

Construct JDBCReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "

Read data from JDBC source.

\n\n

Returns:\n A dataframe containing the data from the JDBC source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "

Module to define behaviour to read from Kafka.

\n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "

Class to read from Kafka.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "

Construct KafkaReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "

Read Kafka data.

\n\n

Returns:\n A dataframe containing the data from Kafka.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "

Module to define behaviour to read from a query.

\n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "

Class to read data from a query.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "

Construct QueryReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "

Read data from a query.

\n\n

Returns:\n A dataframe containing the data from the query.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP B4 sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "

Class to read from SAP B4 source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "

Construct SAPB4Reader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "

Read data from SAP B4 source.

\n\n

Returns:\n A dataframe containing the data from the SAP B4 source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP BW sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "

Class to read from SAP BW source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "

Construct SAPBWReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "

Read data from SAP BW source.

\n\n

Returns:\n A dataframe containing the data from the SAP BW source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "

Module to define behaviour to read from SFTP.

\n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "

Class to read from SFTP.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "

Construct SFTPReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "

Read SFTP data.

\n\n

Returns:\n A dataframe containing the data from SFTP.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "

Module to define behaviour to read from tables.

\n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "

Class to read data from a table.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "

Construct TableReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "

Read data from a table.

\n\n

Returns:\n A dataframe containing the data from the table.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "

Defines abstract writer behaviour.

\n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "

Abstract Writer class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "

Construct Writer instances.

\n\n

Args:\n output_spec: output specification to write data.\n df: dataframe to write.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "

Abstract write method.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "

Define how to write a streaming micro batch after transforming it.

\n\n

This function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.

\n\n

Args:\n kwargs: any keyword arguments.

\n\n

Returns:\n A function to be executed in the foreachBatch spark write method.

\n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "

Get the result of the transformations applied to a micro batch dataframe.

\n\n

Args:\n output_spec: output specification associated with the writer.\n batch_df: batch dataframe (given from streaming foreachBatch).\n batch_id: id of the batch (given from streaming foreachBatch).\n data: list of all dfs generated on previous steps before writer\n to be available on micro batch transforms.

\n\n

Returns:\n The transformed dataframe.

\n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "

Define which streaming trigger will be used.

\n\n

Args:\n output_spec: output specification.

\n\n

Returns:\n A dict containing streaming trigger.

\n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "

Run the data quality process in a streaming micro batch dataframe.

\n\n

Iterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.

\n\n

Args:\n df: the dataframe to run the dq process on.\n dq_spec: data quality specification.

\n\n

Returns: the validated dataframe.

\n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "

Module for writer factory.

\n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "

Class for writer factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "

Get a writer according to the output specification using a factory pattern.

\n\n

Args:\n OutputSpec spec: output specification to write data.\n DataFrame df: dataframe to be written.\n OrderedDict data: list of all dfs generated on previous steps before writer.

\n\n

Returns:\n Writer: writer that will write the data.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "

Package containing the writers responsible for writing data.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "

Module to define behaviour to write to console.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "

Class to write data to console.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "

Construct ConsoleWriter instances.

\n\n

Args:\n output_spec: output specification\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "

Write data to console.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "

Module to define behaviour to write to dataframe.

\n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "

Class to write data to dataframe.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "

Construct DataFrameWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "

Write data to dataframe.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "

Module to define the behaviour of delta merges.

\n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "

Class to merge data using delta lake.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "

Construct DeltaMergeWriter instances.

\n\n

Args:\n output_spec: output specification containing merge options and\n relevant information.\n df: the dataframe containing the new data to be merged.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "

Merge new data with current data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "

Module to define behaviour to write to files.

\n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "

Class to write data to files.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "

Construct FileWriter instances.

\n\n

Args:\n output_spec: output specification\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "

Write data to files.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to JDBC targets.

\n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "

Class to write to JDBC targets.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "

Construct JDBCWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "

Write data into JDBC target.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to Kafka.

\n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "

Class to write to a Kafka target.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "

Construct KafkaWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "

Write data to Kafka.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to tables.

\n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "

Class to write to a table.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "

Construct TableWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "

Write data to a table.

\n\n

After the write operation we repair the table (e.g., update partitions).\nHowever, there is a caveat: this repair\noperation is not reachable if we are running in long-running streaming mode.\nTherefore, we recommend not using the TableWriter with formats other than\ndelta lake in those scenarios (as delta lake does not need msck repair).\nSo, you can either: 1) use the delta lake format for the table; or 2) use the FileWriter\nand run the repair with a certain frequency in a separate task of your\npipeline.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "

Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "

Defines change data feed processor behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "

Change data feed processor class.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "

Expose CDF to external location.

\n\n

Args:\n spec: terminator specification.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "

Delete old data from cdf delta table.

\n\n

Args:\n spec: terminator specifications.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "

Vacuum old data from cdf delta table.

\n\n

Args:\n spec: terminator specifications.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "

Module with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "

Class with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "

Optimize a dataset based on a set of pre-conceived optimizations.

\n\n

Most of the time the dataset is a table, but it can also be a purely file-based dataset.

\n\n

Args:\n db_table: database_name.table_name.\n location: dataset/table filesystem location.\n compute_table_stats: whether to compute table statistics or not.\n vacuum: (delta lake tables only) whether to vacuum the delta lake\n    table or not.\n vacuum_hours: (delta lake tables only) number of hours to consider\n    in the vacuum operation.\n optimize: (delta lake tables only) whether to optimize the table or\n    not. Custom optimize parameters can be supplied through ExecEnv (Spark)\n    configs.\n optimize_where: expression to use in the optimize function.\n optimize_zorder_col_list: (delta lake tables only) list of\n    columns to consider in the zorder optimization process. Custom optimize\n    parameters can be supplied through ExecEnv (Spark) configs.\n debug: flag indicating if we are just debugging this for local\n    tests and therefore want to pass through all the exceptions to perform\n    some assertions in local tests.

\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "

Module with notification terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "

Abstract Notification class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "

Construct Notification instances.

\n\n

Args:\n notification_spec: notification specification.

\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "

Abstract create notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "

Abstract send notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "

Check if given notification is a failure notification.

\n\n

Args:\n spec: spec to validate if it is a failure notification.

\n\n

Returns:\n A boolean telling if the notification is a failure notification.

\n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "

Module for notifier factory.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "

Class for notification factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "

Get a notifier according to the terminator specs using a factory.

\n\n

Args:\n spec: terminator specification.

\n\n

Returns:\n Notifier: notifier that will handle notifications.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "

Check if it is necessary to send a failure notification and generate it.

\n\n

Args:\n spec: List of termination specs.\n exception: Exception that caused the failure.

\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "

Notifications module.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "

Module with email notifier.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "

Email Notification class.

\n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "

Construct Email Notification instance.

\n\n

Args:\n notification_spec: notification specification.

\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "

Creates the notification to be sent.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "

Sends the notification by using a series of methods.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "

Email notification templates.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "

Templates for notifications.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "

Defines terminator behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "

Sensor Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the shifted sensor status.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still\nto be identified - where we can update the sensor status from/to different statuses.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.

\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "

Defines terminator behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "

Spark Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "

Terminate spark session.

\n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "

Module with the factory pattern to return terminators.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "

TerminatorFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "

Execute a terminator following the factory pattern.

\n\n

Args:\n spec: terminator specification.\n df: dataframe to be used in the terminator. Needed when a\n terminator requires one dataframe as input.

\n\n

Returns:\n Transformer function to be executed in .transform() spark function.

\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "

Package to define transformers available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "

Aggregators module.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "

Class containing all aggregation functions.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "

Get the maximum value of a given column of a dataframe.

\n\n

Args:\n input_col: name of the input column.\n output_col: name of the output column (defaults to \"latest\").

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "

Column creators transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "

Class containing all functions that can create columns to add value.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "

Create a sequential but not consecutive id.

\n\n

Args:\n output_col: optional name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "

Create a sequential and consecutive id.

\n\n

Args:\n output_col: optional name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "

Create columns given a map of column names and literal values (constants).

\n\n

Args:\n literals: map of column names and literal values (constants).

\n\n

Returns:\n Callable: A function to be executed in the .transform() spark function.

\n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "

Module with column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "

Class containing column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "

Cast specific columns into the designated type.

\n\n

Args:\n cols: dict with columns and respective target types.\n Target types need to have the exact name of spark types:\n https://spark.apache.org/docs/latest/sql-ref-datatypes.html

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "

Select specific columns with specific output aliases.

\n\n

Args:\n cols: dict with columns to select and respective aliases.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "

Flatten the schema of the dataframe.

\n\n

Args:\n max_level: level until which you want to flatten the schema.\n Default: None.\n shorten_names: whether to shorten the names of the prefixes\n of the fields being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened\n or not. Default: True.\n num_chars: number of characters to consider when shortening\n the names of the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten.\n Default: None.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "

Explode columns with types like ArrayType and MapType.

\n\n

Afterwards, the flatten_schema transformation can be applied,\nfor example if we desire to explode the map (as we explode a StructType)\nor to explode a StructType inside the array.\nWe recommend always specifying the columns you want to explode,\nrather than exploding all columns.

\n\n

Args:\n explode_arrays: whether you want to explode array columns (True)\n    or not (False). Default: False.\n array_cols_to_explode: array columns which you want to explode.\n    If you don't specify them, all array columns will be exploded.\n    Default: None.\n explode_maps: whether you want to explode map columns (True)\n    or not (False). Default: False.\n map_cols_to_explode: map columns which you want to explode.\n    If you don't specify them, all map columns will be exploded.\n    Default: None.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "

Execute Spark SQL expressions to create the specified columns.

\n\n

This function uses the Spark expr function:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/\npyspark.sql.functions.expr.html

\n\n

Args:\n cols_and_exprs: dict with columns and respective expressions to compute\n (Spark SQL expressions).

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "

Rename specific columns into the designated name.

\n\n

Args:\n cols: dict with columns and respective target names.\n escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not.\n    If True, it creates a column with the new name and drops the old one.\n    If False, it uses the native withColumnRenamed Spark function. Default: True.

\n\n

Returns:\n Function to be called in .transform() spark function.

\n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "

Select all attributes from avro.

\n\n

Args:\n schema: the schema string.\n key_col: the name of the key column.\n value_col: the name of the value column.\n options: extra options (e.g., mode: \"PERMISSIVE\").\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.

\n\n

Returns:\n Function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "

Select all attributes from avro using a schema registry.

\n\n

Args:\n schema_registry: the url to the schema registry.\n value_schema: the name of the value schema entry in the schema registry.\n value_col: the name of the value column.\n key_schema: the name of the key schema entry in the schema\n registry. Default: None.\n key_col: the name of the key column.\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.

\n\n

Returns:\n Function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "

Convert a json string into a json column (struct).

\n\n

The new json column can be added to the existing columns (default) or it can\nreplace all the others, becoming the only column in the output. The new column gets the\nsame name as the original one suffixed with '_json'.

\n\n

Args:\n input_col: name of the input column containing the json string.\n schema_path: path to the StructType schema (spark schema).\n schema: dict with the StructType schema (spark schema).\n json_options: options to parse the json value.\n drop_all_cols: whether to drop all the input columns or not.\n    Defaults to False.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "

Convert dataframe columns into a json value.

\n\n

Args:\n in_cols: name(s) of the input column(s).\n    Example values:\n    "*" - all columns;\n    "my_col" - one column named "my_col";\n    "my_col1, my_col2" - two columns.\n out_col: name of the output column.\n json_options: options to parse the json value.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "

Condensers module.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "

Class containing all the functions to condensate data for later merges.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "

Condense Change Data Capture (CDC) based on record_mode strategy.

\n\n

This CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.

\n\n

Args:\n business_key: The business key (logical primary key) of the data.\n ranking_key_desc: In this type of CDC condensation the data needs to be\n    ordered in descending order in a certain way, using the columns specified\n    in this parameter.\n ranking_key_asc: In this type of CDC condensation the data needs to be\n    ordered in ascending order in a certain way, using the columns specified in\n    this parameter.\n record_mode_col: Name of the record mode input_col.\n valid_record_modes: Depending on the context, not all record modes may be\n    considered for condensation. Use this parameter to skip those.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "

Condense data based on a simple group by + take latest mechanism.

\n\n

Args:\n group_key: list of column names to use in the group by.\n ranking_key: list of columns used to order the data within each group.\n descending: if the ranking considers descending order or not. Defaults to\n    True.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "

Custom transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "

Class representing a CustomTransformers.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "

Execute a custom transformation provided by the user.

\n\n

This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.

\n\n

Attention!!! Please bear in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame, because that is\nhow Spark's .transform method is able to chain the transformations.\nExample:\n def my_custom_logic(df: DataFrame) -> DataFrame:

\n\n

Args:\n custom_transformer: custom transformer function. A python function with all\n required pyspark logic provided by the user.

\n\n

Returns:\n Callable: the same function provided as parameter, in order to be called\n    later in the TransformerFactory.

\n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "

Module with data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "

Class containing data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "

Mask specific columns using a hashing approach.

\n\n

Args:\n cols: list of column names to mask.\n approach: hashing approach. Defaults to 'SHA'. There's \"MURMUR3\" as well.\n num_bits: number of bits of the SHA approach. Only applies to SHA approach.\n suffix: suffix to apply to new column name. Defaults to \"_hash\".\n Note: you can pass an empty suffix to have the original column replaced.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "

Drop specific columns.

\n\n

Args:\n cols: list of column names to drop.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "

Module containing date transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "

Class with set of transformers to transform dates in several forms.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "

Add column with current date.

\n\n

The current date comes from the driver as a constant, not from every executor.

\n\n

Args:\n output_col: name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "

Convert multiple string columns with a source format into dates.

\n\n

Args:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., yyyy-MM-dd). Check here:\n    https://docs.oracle.com/javase/10/docs/api/java/time/format/\n    DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "

Convert multiple string columns with a source format into timestamps.

\n\n

Args:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS). Check\n here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "

Convert multiple date/timestamp columns into strings with the target format.

\n\n

Args:\n cols: list of names of the string columns to convert.\n target_format: strings target format (e.g., yyyy-MM-dd). Check here:\n    https://docs.oracle.com/javase/10/docs/api/java/time/format/\n    DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "

Create day/month/week/quarter/year hierarchy for the provided date columns.

\n\n

Uses Spark's extract function.

\n\n

Args:\n cols: list of names of the date columns to create the hierarchy.\n formats: dict with the correspondence between the hierarchy and the format\n to apply.\n Example: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n }\n Check here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "

Module for all the transformers exceptions.

\n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "

Exception for when a user provides wrong arguments to a transformer.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "

Exception for when a user requests a transformer not supported in streaming.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "

Module containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "

Class containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "

Incrementally Filter a certain dataframe given an increment logic.

\n\n

This logic can either be an increment value or an increment dataframe from\nwhich to get the latest value. By default the operator for the filtering process\nis greater or equal, to cover cases where we receive late arriving data not covered\nin a previous load. You can change greater_or_equal to false to use greater,\nwhen you trust the source will never output more data with the increment after\nyou have loaded the data (e.g., you will never load data while the source is still\ndumping data, which may cause you to get an incomplete picture of the last\narrived data).

\n\n

Args:\n input_col: input column name.\n increment_value: value with which to filter the data, considering the\n    provided input_col.\n increment_df: a dataframe to get the increment value from.\n    You either specify this or the increment_value (this takes precedence).\n    This is a good approach to get the latest value from a given dataframe\n    that was read and apply that value as a filter here. In this way you can\n    perform incremental loads based on the last value of a given dataframe\n    (e.g., table or file based). Can be used together with the\n    get_max_value transformer to accomplish these incremental based loads.\n    See our append load feature tests to see how to provide an acon for\n    incremental loads, taking advantage of the scenario explained here.\n increment_col: name of the column from which to get the increment\n    value (when using the increment_df approach). This assumes there is\n    only one row in the increment_df, which is why it is a good idea to use it\n    together with the get_max_value transformer. Defaults to \"latest\"\n    because that is the default output column name provided by the\n    get_max_value transformer.\n greater_or_equal: if filtering should be done by also including the\n    increment value or not (useful for scenarios where you are performing\n    incremental loads but still want to include data considering the increment\n    value, and not only values greater than that increment... examples may\n    include scenarios where you already loaded data including those values,\n    but the source produced more data containing those values).\n    Defaults to false.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "

Filter a dataframe based on an expression.

\n\n

Args:\n exp: filter expression.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "

Filter a dataframe's columns based on a list of SQL expressions.

\n\n

Args:\n exp: column filter expressions.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "

Drop duplicate rows using spark function dropDuplicates().

\n\n

This transformer can be used with or without arguments.\nThe provided argument needs to be a list of columns.\nFor example: [\"Name\", \"VAT\"] will drop duplicate records within the\n\"Name\" and \"VAT\" columns.\nIf the transformer is used without providing any columns list, or providing\nan empty list such as [], the result will be the same as using\nthe distinct() pyspark function. If the watermark dict is present it will\nensure that the drop operation will apply to rows within the watermark timeline\nwindow.

\n\n

Args:\n cols: column names.\n watermarker: properties to apply a watermark to the transformer.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "

Module with join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "

Class containing join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "

Join two dataframes based on specified type and columns.

\n\n

Some stream to stream joins are only possible if you apply Watermark, so this\nmethod also provides a parameter to enable watermarking specification.

\n\n

Args:\n left_df_alias: alias of the first dataframe.\n join_with: right dataframe.\n right_df_alias: alias of the second dataframe.\n join_condition: condition to join dataframes.\n join_type: type of join. Defaults to inner.\n Available values: inner, cross, outer, full, full outer,\n left, left outer, right, right outer, semi,\n left semi, anti, and left anti.\n broadcast_join: whether to perform a broadcast join or not.\n select_cols: list of columns to select at the end.\n watermarker: properties to apply watermarking.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "

Module with null handlers transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "

Class containing null handler transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "

Replace nulls in a dataframe.

\n\n

Args:\n replace_on_nums: whether to replace nulls in numeric columns.\n    Applies to ints, longs and floats.\n default_num_value: default integer value to use as replacement.\n replace_on_strings: whether to replace nulls in string columns.\n default_string_value: default string value to use as replacement.\n subset_cols: list of columns in which to replace nulls. If not\n    provided, all nulls in all columns will be replaced as specified.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "

Optimizers module.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "

Class containing all the functions that can provide optimizations.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "

Caches the current dataframe.

\n\n

The default storage level used is MEMORY_AND_DISK.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "

Caches the current dataframe with a specific StorageLevel.

\n\n

Args:\n storage_level: the type of StorageLevel. Defaults to MEMORY_AND_DISK_DESER.\n    More options here: https://spark.apache.org/docs/latest/api/python/\n    reference/api/pyspark.StorageLevel.html

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "

Removes the dataframe from disk and memory.

\n\n

Args:\n blocking: whether to block until all the data blocks are\n removed from disk/memory or run asynchronously.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "

Regex transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "

Class containing all regex functions.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "

Get the result of applying a regex to an input column (via regexp_extract).

\n\n

Args:\n input_col: name of the input column.\n output_col: name of the output column.\n regex: regular expression.\n drop_input_col: whether to drop input_col or not.\n idx: index to return.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "

Module with repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "

Class containing repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "

Coalesce a dataframe into n partitions.

\n\n

Args:\n num_partitions: num of partitions to coalesce.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "

Repartition a dataframe into n partitions.

\n\n

If num_partitions is provided, repartitioning happens based on the provided\nnumber; otherwise, it happens based on the values of the provided cols (columns).

\n\n

Args:\n num_partitions: num of partitions to repartition.\n cols: list of columns to use for repartitioning.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "

Module with the factory pattern to return transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "

TransformerFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "

Get a transformer following the factory pattern.

\n\n

Args:\n spec: transformer specification (individual transformation... not to be\n confused with list of all transformations).\n data: ordered dict of dataframes to be transformed. Needed when a\n transformer requires more than one dataframe as input.

\n\n

Returns:\n Transformer function to be executed in .transform() spark function.

\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "

Module with union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "

Class containing union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "

Union dataframes, resolving columns by position (not by name).

\n\n

Args:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "

Union dataframes, resolving columns by name (not by position).

\n\n

Args:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.\n allow_missing_columns: allow the union of DataFrames with different\n schemas.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "

Watermarker module.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "

Class containing all watermarker transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "

Get the dataframe with a watermark defined.

\n\n

Args:\n watermarker_column: name of the input column to be considered for\n the watermarking. Note: it must be a timestamp.\n watermarker_time: time window to define the watermark value.

\n\n

Returns:\n A function to be executed on other transformers.

\n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "

Utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "

Config utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "

Module to read configurations.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "

Config utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "

Get acon based on a filesystem path or on a dict.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n\n

Returns:\n Dict representation of an acon.

\n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "

Get Lakehouse Engine configurations.

\n\n

Returns:\n A dictionary with the engine configurations.

\n", "signature": "() -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "

Read an acon (algorithm configuration) file.

\n\n

Args:\n path: path to the acon file.

\n\n

Returns:\n The acon file content as a dict.

\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "

Read a DDL file in Spark SQL format from a cloud object storage system.

\n\n

Args:\n path: path to the SQL file.

\n\n

Returns:\n Content of the SQL file.

\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "

Utilities for databricks operations.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "

Databricks utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "

Get db utils on databricks.

\n\n

Args:\n spark: spark session.

\n\n

Returns:\n Dbutils from databricks.

\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "

Get notebook context from running acon.

\n\n

Returns:\n Dict containing databricks notebook context.

\n", "signature": "() -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "

Utilities to be used by custom expectations.

\n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "

Validates the test results of the custom expectations.

\n\n

If you need to make additional validations on your custom expectation\nand/or require additional fields to be returned, you can add them before\ncalling this function. The partial_success and partial_result\noptional parameters can be used to pass the result of additional\nvalidations and to add more information to the result key of the\nreturned dict, respectively.

\n\n

Args:\n expectation: Expectation to validate.\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n partial_success: Result of validations done before calling this method.\n partial_result: Extra fields to be returned to the user.

\n\n

Returns:\n The result of the validation.

\n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "

Extraction utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "

Utilities module for JDBC extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "

Standardize the types of extractions we can have from a JDBC source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from a JDBC source.

\n\n

These configurations cover:\n user: username to connect to JDBC source.\n password: password to connect to JDBC source (always use secrets,\n don't use text passwords in your code).\n url: url to connect to JDBC source.\n dbtable: database.table to extract data from.\n calc_upper_bound_schema: custom schema used for the upper bound calculation.\n changelog_table: table of type changelog from which to extract data,\n when the extraction type is delta.\n partition_column: column used to split the extraction.\n latest_timestamp_data_location: data location (e.g., s3) containing the data\n to get the latest timestamp already loaded into bronze.\n latest_timestamp_data_format: the format of the dataset in\n latest_timestamp_data_location. Default: delta.\n extraction_type: type of extraction (delta or init). Default: \"delta\".\n driver: JDBC driver name. Default: \"com.sap.db.jdbc.Driver\".\n num_partitions: number of Spark partitions to split the extraction.\n lower_bound: lower bound to decide the partition stride.\n upper_bound: upper bound to decide the partition stride. If\n calculate_upper_bound is True, then upperBound will be\n derived by our upper bound optimizer, using the partition column.\n default_upper_bound: the value to use as default upper bound in case\n the result of the upper bound calculation is None. Default: \"1\".\n fetch_size: how many rows to fetch per round trip. Default: \"100000\".\n compress: enable network compression. Default: True.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.\n min_timestamp: min timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the location provided.\n In case this one is provided it has precedence and the calculation\n is not done.\n max_timestamp: max timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the table having information\n about the extraction requests, their timestamps and their status.\n In case this one is provided it has precedence and the calculation\n is not done.\n generate_predicates: whether to generate predicates automatically or not.\n Default: False.\n predicates: list containing all values to partition (if generate_predicates\n is used, the manual values provided are ignored). Default: None.\n predicates_add_null: whether to consider null on predicates list.\n Default: True.\n extraction_timestamp: the timestamp of the extraction. Default: current time\n following the format \"%Y%m%d%H%M%S\".\n max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\n from the table holding the extraction requests information.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant JDBC sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "

Construct JDBCExtractionUtils.

\n\n

Args:\n jdbc_extraction: JDBC Extraction configurations. Can be of type:\n JDBCExtraction, SAPB4Extraction or SAPBWExtraction.

\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "

Helper to get additional Spark Options initially passed.

\n\n

If people provide additional Spark options not covered by the util function\narguments (get_spark_jdbc_options), we need to consider them.\nThus, we update the options retrieved by the utils by checking whether any\nSpark option initially provided is not yet considered in the retrieved options\nor function arguments and whether its value is not None.\nIf these conditions are met, we add those options and return the complete dict.

\n\n

Args:\n    input_spec: the input specification.\n    options: dict with Spark options.\n    ignore_options: list of options to be ignored by the process.\n        Spark reads have two different approaches to parallelize the\n        reading process: one uses upper/lower bounds and the other uses\n        predicates. These approaches can't be used at the same time, so\n        you must choose one of them. When choosing predicates, you can't\n        pass the lower and upper bounds, nor the number of partitions and\n        the partition column; otherwise Spark will assume the execution is\n        partitioned by upper and lower bounds and will expect all of those\n        variables to be filled. To avoid hardcoding all the predicates in\n        the acon, there is a feature that automatically generates all\n        predicates for the init or delta load based on the input partition\n        column; however, at the end of the process, the partition column\n        can't be passed to the options, because predicates execution was\n        chosen. That is why, to generate predicates, we need to pass some\n        options to ignore.

\n\n

Returns:\n a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).

\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "

Get the predicates list, based on a predicates query.

\n\n

Args:\n predicates_query: query to use as the basis to get the distinct values for\n a specified column, based on which predicates are generated.

\n\n

Returns:\n List containing the predicates to use to split the extraction from\n JDBC sources.

\n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "

Get the Spark options to extract data from a JDBC source.

\n\n

Returns:\n    The Spark jdbc args dictionary, including the query to submit,\n    and also the options args dictionary.

\n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "

Get an optimal upperBound to properly split a Spark JDBC extraction.

\n\n

Returns:\n Either an int, date or timestamp to serve as upperBound Spark JDBC option.

\n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP B4 extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "

Standardise the types of ADSOs we can have for Extractions from SAP B4.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "

\n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP B4.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:\n    latest_timestamp_input_col: the column containing the request timestamps\n        in the dataset in latest_timestamp_data_location. Default: REQTSN.\n    request_status_tbl: the name of the SAP B4 table having information\n        about the extraction requests. Composed of database.table.\n        Default: SAPHANADB.RSPMREQUEST.\n    request_col_name: name of the column having the request timestamp to join\n        with the request status table. Default: REQUEST_TSN.\n    data_target: the data target to extract from. Used in the join operation with\n        the request status table.\n    act_req_join_condition: the join condition into activation table\n        can be changed using this property.\n        Default: 'tbl.reqtsn = req.request_col_name'.\n    include_changelog_tech_cols: whether to include the technical columns\n        (usually coming from the changelog table) or not.\n    extra_cols_req_status_tbl: columns to be added from request status table.\n        It needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\n        req.col2 as column_two\".\n    request_status_tbl_filter: filter to use for filtering the request status table,\n        influencing the calculation of the max timestamps and the delta extractions.\n    adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".\n    max_timestamp_custom_schema: the custom schema to apply on the calculation of\n        the max timestamp to consider for the delta extractions.\n        Default: timestamp DECIMAL(23,0).\n    default_max_timestamp: the timestamp to use as default, when it is not possible\n        to derive one.\n    custom_schema: specify custom_schema for particular columns of the\n        returned dataframe in the init/delta extraction of the source table.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP B4.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPB4ExtractionUtils.

\n\n

Args:\n sap_b4_extraction: SAP B4 Extraction configurations.

\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "

Get the data_target from the data_target option or derive it.

\n\n

By definition data_target is the same for the table and changelog table and\nis the same string ignoring everything before / and the first and last\ncharacter after /. E.g. for a dbtable /BIC/abtable12, the data_target\nwould be btable1.

\n\n

Args:\n input_spec_opt: options from the input_spec.

\n\n

Returns:\n A string with the data_target.

\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP BW extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP BW.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:\n    latest_timestamp_input_col: the column containing the actrequest timestamp\n        in the dataset in latest_timestamp_data_location. Default:\n        \"actrequest_timestamp\".\n    act_request_table: the name of the SAP BW activation requests table.\n        Composed of database.table. Default: SAPPHA.RSODSACTREQ.\n    request_col_name: name of the column having the request to join\n        with the activation request table. Default: actrequest.\n    act_req_join_condition: the join condition into activation table\n        can be changed using this property.\n        Default: 'changelog_tbl.request = act_req.request_col_name'.\n    odsobject: name of BW Object, used for joining with the activation request\n        table to get the max actrequest_timestamp to consider while filtering\n        the changelog table.\n    include_changelog_tech_cols: whether to include the technical columns\n        (usually coming from the changelog table) or not. Default: True.\n    extra_cols_act_request: list of columns to be added from act request table.\n        It needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\n        as column_one, act_req.col2 as column_two\".\n    get_timestamp_from_act_request: whether to get init timestamp\n        from act request table or assume current/given timestamp.\n    sap_bw_schema: sap bw schema. Default: SAPPHA.\n    max_timestamp_custom_schema: the custom schema to apply on the calculation of\n        the max timestamp to consider for the delta extractions.\n        Default: timestamp DECIMAL(23,0).\n    default_max_timestamp: the timestamp to use as default, when it is not possible\n        to derive one.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231012165159',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP BW.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPBWExtractionUtils.

\n\n

Args:\n sap_bw_extraction: SAP BW Extraction configurations.

\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "

Get the changelog table, given an odsobject.

\n\n

Returns:\n String to use as changelog_table.

\n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "

Get the odsobject based on the provided options.

\n\n

With the table name we may also get the db name, so we need to split.\nMoreover, people might need to specify the odsobject if it is\ndifferent from the dbtable.

\n\n

Args:\n input_spec_opt: options from the input_spec.

\n\n

Returns:\n A string with the odsobject.

\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "

Utilities module for SFTP extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "

Standardize the types of filters we can have from an SFTP source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant SFTP sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "

Get a list of files to be extracted from SFTP.

\n\n

The arguments (options_args) to list files are:\ndate_time_gt(str):\n    Filter the files greater than the string datetime\n    formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\ndate_time_lt(str):\n    Filter the files lower than the string datetime\n    formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\nearliest_file(bool):\n    Filter the earliest dated file in the directory.\nfile_name_contains(str):\n    Filter the files that match the pattern.\nlatest_file(bool):\n    Filter the most recent dated file in the directory.\nsub_dir(bool):\n    When true, the engine will search for files in the subdirectories\n    of the remote_path.\n    It will consider one level below the remote_path.\n    When sub_dir is used with the latest_file/earliest_file argument,\n    the engine will retrieve the latest_file/earliest_file\n    for each subdirectory.

\n\n

Args:\n sftp: the SFTP client object.\n remote_path: path of files to be filtered.\n options_args: options from the acon.

\n\n

Returns:\n A list containing the file names to be passed to Spark.

\n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "

Get the SFTP client.

\n\n

The SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.

\n\n

Args:\n    options_args: dictionary containing SFTP connection parameters.\n        The Paramiko arguments expected to connect are:\n        \"hostname\": the server to connect to.\n        \"port\": the server port to connect to.\n        \"username\": the username to authenticate as.\n        \"password\": used for password authentication.\n        \"pkey\": optional - an optional private key to use for authentication.\n        \"passphrase\": optional - used for decrypting private keys.\n        \"key_filename\": optional - the filename, or list of filenames,\n        of optional private key(s) and/or certs to try for authentication.\n        \"timeout\": an optional timeout (in seconds) for the TCP connect.\n        \"allow_agent\": optional - set to False to disable\n        connecting to the SSH agent.\n        \"look_for_keys\": optional - set to False to disable searching\n        for discoverable private key files in ~/.ssh/.\n        \"compress\": optional - set to True to turn on compression.\n        \"sock\": optional - an open socket or socket-like object\n        to use for communication to the target host.\n        \"gss_auth\": optional - True if you want to use GSS-API authentication.\n        \"gss_kex\": optional - perform GSS-API Key Exchange and\n        user authentication.\n        \"gss_deleg_creds\": optional - delegate GSS-API client\n        credentials or not.\n        \"gss_host\": optional - the target's name in the kerberos database.\n        \"gss_trust_dns\": optional - indicates whether or\n        not the DNS is trusted to securely canonicalize the name of the\n        host being connected to (default True).\n        \"banner_timeout\": an optional timeout (in seconds)\n        to wait for the SSH banner to be presented.\n        \"auth_timeout\": an optional timeout (in seconds)\n        to wait for an authentication response.\n        \"disabled_algorithms\": an optional dict passed directly to Transport\n        and its keyword argument of the same name.\n        \"transport_factory\": an optional callable which is handed a subset of\n        the constructor arguments (primarily those related to the socket,\n        GSS functionality, and algorithm selection) and generates a\n        Transport instance to be used by this client.\n        Defaults to Transport.__init__.

\n\n
The parameter to specify the private key is expected to be in RSA format.\nAttempting a connection with a blank host key is not allowed\nunless the argument \"add_auto_policy\" is explicitly set to True.\n
\n\n

Returns:\n sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.

\n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "

Validate the file extension based on the format definitions.

\n\n

Args:\n files_format: a string containing the file extension.

\n\n

Returns:\n The string validated and formatted.

\n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "

Validate the location. Add a \"/\" in case it does not exist.

\n\n

Args:\n location: file path.

\n\n

Returns:\n The location validated.

\n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "

Utilities for file name based operations.

\n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "

Function to retrieve list of file names in a folder.

\n\n

This function filters by file type and removes the extension of the file name\nit returns.

\n\n

Args:\n path: path to the folder to list files\n file_type: type of the file to include in list\n exclude_regex: regex of file names to exclude

\n\n

Returns:\n A list of file names without file type.

\n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "

Module to configure project logging.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "

Logging filter to hide sensitive data from being shown in the logs.

\n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "

Hide sensitive information from being shown in the logs.

\n\n

Based on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).

\n\n

Args:\n record: the LogRecord event being logged.

\n\n

Returns:\n    True, allowing the transformed record to be logged.

\n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "

Handle the logging of the lakehouse engine project.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "

Construct a LoggingHandler instance.

\n\n

Args:\n class_name: name of the class to be indicated in the logs.

\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "

Get the _logger instance variable.

\n\n

Returns:\n    The logger object.

\n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "

Utilities to facilitate dataframe schema management.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "

Schema utils that help retrieve and manage schemas of dataframes.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "

Get a spark schema from a file (spark StructType json file) in a file system.

\n\n

Args:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema struct type.

\n", "signature": "(file_path: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "

Get a dict with the spark schema from a file in a file system.

\n\n

Args:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema in a dict.

\n", "signature": "(file_path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "

Get a spark schema from a dict.

\n\n

Args:\n struct_type: dict containing a spark schema structure. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema struct type.

\n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "

Get a spark schema from a table.

\n\n

Args:\n table: table name from which to inherit the schema.

\n\n

Returns:\n Spark schema struct type.

\n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "

Get a spark schema from an input specification.

\n\n

This covers scenarios where the schema is provided as part of the input\nspecification of the algorithm. The schema can come from the table specified in\nthe input specification (enforce_schema_from_table) or from the dict with the\nspark schema also provided there.

\n\n

Args:\n input_spec: input specification.

\n\n

Returns:\n spark schema struct type.

\n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "

Recursive method to flatten the schema of the dataframe.

\n\n

Args:\n schema: schema to be flattened.\n prefix: prefix of the struct to get the value for. Only relevant\n for being used in the internal recursive logic.\n level: level of the depth in the schema being flattened. Only relevant\n for being used in the internal recursive logic.\n max_level: level until which you want to flatten the schema. Default: None.\n shorten_names: whether to shorten the names of the prefixes of the fields\n being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened or\n not. Default: True.\n num_chars: number of characters to consider when shortening the names of\n the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten. Default: None.

\n\n

Returns:\n    A list with the flattened schema columns.

\n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "

Utilities to interact with storage systems.

\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "

Module for abstract representation of a storage system holding files.

\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "

Abstract file storage class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "

Get the payload of a file.

\n\n

Args:\n url: url of the file.

\n\n

Returns:\n File payload/content.

\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n

Args:\n url: url of the file.\n content: content to write into the file.

\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "

Module for common file storage functions.

\n"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "

Class for common file storage functions.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "

Read a json file.

\n\n

The file should be in a supported file system (e.g., s3 or local filesystem -\nfor local tests only).

\n\n

Args:\n path: path to the json file.

\n\n

Returns:\n Dict with json file content.

\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "

Module to represent a local file storage system.

\n"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "

Class to represent a local file storage system.

\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "

Get the payload of a file.

\n\n

Args:\n url: url of the file.

\n\n

Returns:\n file payload/content.

\n", "signature": "(cls, url: urllib.parse.ParseResult) -> <class 'TextIO'>:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n

Args:\n url: url of the file.\n content: content to write into the file.

\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "

Module to represent an s3 file storage system.

\n"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "

Class to represent an s3 file storage system.

\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "

Get the payload of a config file.

\n\n

Args:\n url: url of the file.

\n\n

Returns:\n File payload/content.

\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "

Write payload into a file.

\n\n

Args:\n url: url of the file.\n content: content to write into the file.

\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}]; + /** pdoc search index */const docs = [{"fullname": "lakehouse_engine", "modulename": "lakehouse_engine", "kind": "module", "doc": "

Lakehouse engine package containing all the system subpackages.

\n"}, {"fullname": "lakehouse_engine.algorithms", "modulename": "lakehouse_engine.algorithms", "kind": "module", "doc": "

Package containing all the lakehouse engine algorithms.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "kind": "module", "doc": "

Module containing the Algorithm class.

\n"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm", "kind": "class", "doc": "

Class to define the behavior of every algorithm based on ACONs.

\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.__init__", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.algorithm.Algorithm.get_dq_spec", "modulename": "lakehouse_engine.algorithms.algorithm", "qualname": "Algorithm.get_dq_spec", "kind": "function", "doc": "

Get data quality specification object from acon.

\n\n

Args:\n spec: data quality specifications.

\n\n

Returns:\n The DQSpec and the List of DQ Functions Specs.

\n", "signature": "(\tcls,\tspec: dict) -> Tuple[lakehouse_engine.core.definitions.DQSpec, List[lakehouse_engine.core.definitions.DQFunctionSpec], List[lakehouse_engine.core.definitions.DQFunctionSpec]]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader", "modulename": "lakehouse_engine.algorithms.data_loader", "kind": "module", "doc": "

Module to define DataLoader class.

\n"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader", "kind": "class", "doc": "

Load data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on the cases where users will be specifying all the algorithm\nsteps and configurations through a dict based configuration, which we name ACON\nin our framework.

\n\n

Since an ACON is a dict you can pass a custom transformer through a python function\nand, therefore, the DataLoader can also be used to load data with custom\ntransformations not provided in our transformers package.

\n\n

As the algorithm base class of the lakehouse-engine framework is based on the\nconcept of ACON, this DataLoader algorithm simply inherits from Algorithm,\nwithout overriding anything. We designed the codebase like this to avoid\ninstantiating the Algorithm class directly, which was always meant to be an\nabstraction for any specific algorithm included in the lakehouse-engine framework.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.__init__", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.__init__", "kind": "function", "doc": "

Construct DataLoader algorithm instances.

\n\n

A data loader needs several specifications to work properly,\nbut some of them might be optional. The available specifications are:

\n\n
- input specifications (mandatory): specify how to read data.\n- transform specifications (optional): specify how to transform data.\n- data quality specifications (optional): specify how to execute the data\n    quality process.\n- output specifications (mandatory): specify how to write data to the\n    target.\n- terminate specifications (optional): specify what to do after writing into\n    the target (e.g., optimizing target table, vacuum, compute stats, etc).\n
\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.read", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n

Returns:\n An ordered dict with all the dataframes that were read.

\n", "signature": "(self) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.transform", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.transform", "kind": "function", "doc": "

Transform (optionally) the data that was read.

\n\n

If there isn't a transformation specification, this step will be skipped, and the\noriginal dataframes that were read will be returned.\nTransformations can depend on the result of another transformation; however,\nwe need to keep in mind that if we are using a streaming source and for some\nreason we need to enable micro batch processing, that result cannot be used as\ninput to another transformation. Micro batch processing in pyspark streaming is\nonly available in .write(), which means a transformation with micro batch needs\nto be the end of the process.

\n\n

Args:\n data: input dataframes in an ordered dict.

\n\n

Returns:\n Another ordered dict with the transformed dataframes, according to the\n transformation specification.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.process_dq", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read and/or transformed.

\n\n

It supports multiple input dataframes, although just one is advisable.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n

Args:\n    data: dataframes from previous steps of the algorithm that we wish to\n        run the DQ process on.

\n\n

Returns:\n Another ordered dict with the validated dataframes.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.write", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.write", "kind": "function", "doc": "

Write the data that was read and transformed (if applicable).

\n\n

It supports writing multiple datasets. However, we only recommend writing one\ndataframe. This recommendation is based on easy debugging and reproducibility,\nsince if we start mixing several datasets fueled by the same algorithm, it\nwould unleash an infinite sea of reproducibility issues plus tight coupling and\ndependencies between datasets. Having said that, there may be cases where\nwriting multiple datasets is desirable according to the use case requirements.\nUse it accordingly.

\n\n

Args:\n data: dataframes that were read and transformed (if applicable).

\n\n

Returns:\n Dataframes that were written.

\n", "signature": "(self, data: collections.OrderedDict) -> collections.OrderedDict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.terminate", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.terminate", "kind": "function", "doc": "

Terminate the algorithm.

\n\n

Args:\n data: dataframes that were written.

\n", "signature": "(self, data: collections.OrderedDict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.data_loader.DataLoader.execute", "modulename": "lakehouse_engine.algorithms.data_loader", "qualname": "DataLoader.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> Optional[collections.OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator", "modulename": "lakehouse_engine.algorithms.dq_validator", "kind": "module", "doc": "

Module to define Data Validator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator", "kind": "class", "doc": "

Validate data using an algorithm configuration (ACON represented as dict).

\n\n

This algorithm focuses on isolating Data Quality Validations from loading,\napplying a set of data quality functions to a specific input dataset,\nwithout the need to define any output specification.\nYou can use any input specification compatible with the lakehouse engine\n(dataframe, table, files, etc).

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.__init__", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.__init__", "kind": "function", "doc": "

Construct DQValidator algorithm instances.

\n\n

A data quality validator needs the following specifications to work\nproperly:\n - input specification (mandatory): specify how and what data to\n read.\n - data quality specification (mandatory): specify how to execute\n the data quality process.\n - restore_prev_version (optional): specify if, having\n delta table/files as input, they should be restored to the\n previous version if the data quality process fails. Note: this\n is only considered if fail_on_error is kept as True.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.read", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.read", "kind": "function", "doc": "

Read data from an input location into a distributed dataframe.

\n\n

Returns:\n Dataframe with data that was read.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.process_dq", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.process_dq", "kind": "function", "doc": "

Process the data quality tasks for the data that was read.

\n\n

It supports a single input dataframe.

\n\n

It is possible to use data quality validators/expectations that will validate\nyour data and fail the process in case the expectations are not met. The DQ\nprocess also generates and keeps updating a site containing the results of the\nexpectations that were done on your data. The location of the site is\nconfigurable and can either be on file system or S3. If you define it to be\nstored on S3, you can even configure your S3 bucket to serve the site so that\npeople can easily check the quality of your data. Moreover, it is also\npossible to store the result of the DQ process into a defined result sink.

\n\n

Args:\n data: input dataframe on which to run the DQ process.

\n\n

Returns:\n Validated dataframe.

\n", "signature": "(\tself,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.dq_validator.DQValidator.execute", "modulename": "lakehouse_engine.algorithms.dq_validator", "qualname": "DQValidator.execute", "kind": "function", "doc": "

Define the algorithm execution behaviour.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.exceptions", "modulename": "lakehouse_engine.algorithms.exceptions", "kind": "module", "doc": "

Package defining all the algorithm custom exceptions.

\n"}, {"fullname": "lakehouse_engine.algorithms.exceptions.ReconciliationFailedException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "ReconciliationFailedException", "kind": "class", "doc": "

Exception for when the reconciliation process fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.NoNewDataException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "NoNewDataException", "kind": "class", "doc": "

Exception for when no new data is available.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.SensorAlreadyExistsException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "SensorAlreadyExistsException", "kind": "class", "doc": "

Exception for when a sensor with the same sensor id already exists.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.exceptions.RestoreTypeNotFoundException", "modulename": "lakehouse_engine.algorithms.exceptions", "qualname": "RestoreTypeNotFoundException", "kind": "class", "doc": "

Exception for when the restore type is not found.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.algorithms.reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "kind": "module", "doc": "

Module containing the Reconciliator class.

\n"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType", "kind": "class", "doc": "

Type of Reconciliation.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.PCT", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.PCT", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.PCT: 'percentage'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationType.ABS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationType.ABS", "kind": "variable", "doc": "

\n", "default_value": "<ReconciliationType.ABS: 'absolute'>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers", "kind": "class", "doc": "

Transformers Available for the Reconciliation Algorithm.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "ReconciliationTransformers.AVAILABLE_TRANSFORMERS", "kind": "variable", "doc": "

\n", "annotation": ": dict", "default_value": "<ReconciliationTransformers.AVAILABLE_TRANSFORMERS: {'cache': <bound method Optimizers.cache of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>, 'persist': <bound method Optimizers.persist of <class 'lakehouse_engine.transformers.optimizers.Optimizers'>>}>"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator", "kind": "class", "doc": "

Class to define the behavior of an algorithm that checks if data reconciles.

\n\n

Checking if data reconciles, using this algorithm, is a matter of reading the\n'truth' data and the 'current' data. You can use any input specification compatible\nwith the lakehouse engine to read 'truth' or 'current' data. On top of that, you\ncan pass a 'truth_preprocess_query' and a 'current_preprocess_query' so you can\npreprocess the data before it goes into the actual reconciliation process.\nMoreover, you can use the 'truth_preprocess_query_args' and\n'current_preprocess_query_args' to pass additional arguments to be used to apply\nadditional operations on top of the dataframe, resulting from the previous steps.\nWith these arguments you can apply additional operations like caching or persisting\nthe Dataframe. The way to pass the additional arguments for the operations is\nsimilar to the TransformSpec, but only a few operations are allowed. Those are\ndefined in ReconciliationTransformers.AVAILABLE_TRANSFORMERS.

\n\n

The reconciliation process is focused on joining 'truth' with 'current' by all\nprovided columns except the ones passed as 'metrics'. After that it calculates the\ndifferences in the metrics attributes (either percentage or absolute difference).\nFinally, it aggregates the differences, using the supplied aggregation function\n(e.g., sum, avg, min, max, etc).

\n\n

All of these configurations are passed via the ACON to instantiate a\nReconciliatorSpec object.
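As a rough illustration of the shape of such a configuration (normally supplied through the ACON), the sketch below builds the equivalent ReconciliatorSpec directly; the table names, metric and thresholds are placeholders.

from lakehouse_engine.core.definitions import InputSpec, ReconciliatorSpec

recon_spec = ReconciliatorSpec(
    metrics=[{
        "metric": "amount",    # column present in both truth and current datasets
        "aggregation": "sum",
        "type": "percentage",  # ReconciliationType.PCT
        "yellow": 0.05,
        "red": 0.1,
    }],
    truth_input_spec=InputSpec(spec_id="truth", read_type="batch", db_table="finance.orders_truth"),
    current_input_spec=InputSpec(spec_id="current", read_type="batch", db_table="finance.orders"),
    truth_preprocess_query="SELECT order_id, amount FROM truth",  # the truth input is referencable as 'truth'
)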

\n\n

Notes:\n - It is crucial that both the current and truth datasets have exactly the same\n structure.\n - You should not use 0 as yellow or red threshold, as the algorithm will verify\n if the difference between the truth and current values is bigger\n or equal than those thresholds.\n - The reconciliation does not produce any negative values or percentages, as we\n use the absolute value of the differences. This means that the recon result\n will not indicate if it was the current values that were bigger or smaller\n than the truth values, or vice versa.

\n", "bases": "lakehouse_engine.core.executable.Executable"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.__init__", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.__init__", "kind": "function", "doc": "

Construct Algorithm instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_source_of_truth", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_source_of_truth", "kind": "function", "doc": "

Get the source of truth (expected result) for the reconciliation process.

\n\n

Returns:\n DataFrame containing the source of truth.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.get_current_results", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.get_current_results", "kind": "function", "doc": "

Get the current results from the table that we are checking if it reconciles.

\n\n

Returns:\n DataFrame containing the current results.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.reconciliator.Reconciliator.execute", "modulename": "lakehouse_engine.algorithms.reconciliator", "qualname": "Reconciliator.execute", "kind": "function", "doc": "

Reconcile the current results against the truth dataset.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.algorithms.sensor", "modulename": "lakehouse_engine.algorithms.sensor", "kind": "module", "doc": "

Module to define Sensor algorithm behavior.

\n"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor", "kind": "class", "doc": "

Class representing a sensor to check if the upstream has new data.

\n", "bases": "lakehouse_engine.algorithms.algorithm.Algorithm"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.__init__", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.__init__", "kind": "function", "doc": "

Construct Sensor instances.

\n\n

Args:\n acon: algorithm configuration.

\n", "signature": "(acon: dict)"}, {"fullname": "lakehouse_engine.algorithms.sensor.Sensor.execute", "modulename": "lakehouse_engine.algorithms.sensor", "qualname": "Sensor.execute", "kind": "function", "doc": "

Execute the sensor.

\n", "signature": "(self) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.configs", "modulename": "lakehouse_engine.configs", "kind": "module", "doc": "

This module receives a config file which is included in the wheel.

\n"}, {"fullname": "lakehouse_engine.core", "modulename": "lakehouse_engine.core", "kind": "module", "doc": "

Package with the core behaviour of the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.core.definitions", "modulename": "lakehouse_engine.core.definitions", "kind": "module", "doc": "

Definitions of standard values and structures for core components.

\n"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.CLOUDFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.CLOUDFILES", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.CLOUDFILES: 'cloudfiles'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SQL", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SQL", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SQL: 'sql'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_BW", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_BW", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_BW: 'sap_bw'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SAP_B4", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SAP_B4", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SAP_B4: 'sap_b4'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.SFTP", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.SFTP", "kind": "variable", "doc": "

\n", "default_value": "<InputFormat.SFTP: 'sftp'>"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n

Return:\n A list with all enum values.

\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.InputFormat.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputFormat.exists", "kind": "function", "doc": "

Checks if the input format exists in the enum values.

\n\n

Args:\n input_format: format to check if exists.

\n\n

Return:\n If the input format exists in our enum.

\n", "signature": "(cls, input_format: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat", "kind": "class", "doc": "

Formats of algorithm output.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JDBC", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JDBC", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JDBC: 'jdbc'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.AVRO", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.AVRO", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.AVRO: 'avro'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.JSON", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CSV", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.PARQUET", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.PARQUET", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.PARQUET: 'parquet'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DELTAFILES", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DELTAFILES", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DELTAFILES: 'delta'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.KAFKA", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.KAFKA", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.KAFKA: 'kafka'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.CONSOLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.CONSOLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.CONSOLE: 'console'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.NOOP", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.NOOP", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.NOOP: 'noop'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.DATAFRAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.DATAFRAME", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.DATAFRAME: 'dataframe'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.FILE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.FILE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.FILE: 'file'>"}, {"fullname": "lakehouse_engine.core.definitions.OutputFormat.TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputFormat.TABLE", "kind": "variable", "doc": "

\n", "default_value": "<OutputFormat.TABLE: 'table'>"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType", "kind": "class", "doc": "

Type of notifier available.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotifierType.EMAIL", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotifierType.EMAIL", "kind": "variable", "doc": "

\n", "default_value": "<NotifierType.EMAIL: 'email'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationEmailServers", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationEmailServers", "kind": "class", "doc": "

Types of email server with special behaviour.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters", "kind": "class", "doc": "

Parameters to be replaced in runtime.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_JOB_NAME", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_JOB_NAME: 'databricks_job_name'>"}, {"fullname": "lakehouse_engine.core.definitions.NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "modulename": "lakehouse_engine.core.definitions", "qualname": "NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID", "kind": "variable", "doc": "

\n", "default_value": "<NotificationRuntimeParameters.DATABRICKS_WORKSPACE_ID: 'databricks_workspace_id'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType", "kind": "class", "doc": "

Define the types of read operations.

\n\n

BATCH - read the data in batch mode (e.g., Spark batch).\nSTREAMING - read the data in streaming mode (e.g., Spark streaming).

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.BATCH", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.BATCH", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.BATCH: 'batch'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadType.STREAMING", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadType.STREAMING", "kind": "variable", "doc": "

\n", "default_value": "<ReadType.STREAMING: 'streaming'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode", "kind": "class", "doc": "

Different modes that control how we handle compliance with the provided schema.

\n\n

These read modes map to Spark's read modes at the moment.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.PERMISSIVE", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.PERMISSIVE", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.PERMISSIVE: 'PERMISSIVE'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.FAILFAST", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.FAILFAST", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.FAILFAST: 'FAILFAST'>"}, {"fullname": "lakehouse_engine.core.definitions.ReadMode.DROPMALFORMED", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReadMode.DROPMALFORMED", "kind": "variable", "doc": "

\n", "default_value": "<ReadMode.DROPMALFORMED: 'DROPMALFORMED'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults", "kind": "class", "doc": "

Defaults used on the data quality process.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_STORE: 'file_system'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.FILE_SYSTEM_S3_STORE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.FILE_SYSTEM_S3_STORE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_BATCH_IDENTIFIERS", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_BATCH_IDENTIFIERS", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_BATCH_IDENTIFIERS: ['spec_id', 'input_id', 'timestamp']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_CLASS_NAME: 'Datasource'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATASOURCE_EXECUTION_ENGINE", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATASOURCE_EXECUTION_ENGINE", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATASOURCE_EXECUTION_ENGINE: 'SparkDFExecutionEngine'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_CLASS_NAME: 'RuntimeDataConnector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CONNECTORS_MODULE_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CONNECTORS_MODULE_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CONNECTORS_MODULE_NAME: 'great_expectations.datasource.data_connector'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CLASS_NAME", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CLASS_NAME: 'SimpleCheckpoint'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_CHECKPOINTS_CONFIG_VERSION: 1.0>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.STORE_BACKEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.STORE_BACKEND", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.FILE_SYSTEM_S3_STORE: 's3'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.EXPECTATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.EXPECTATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.EXPECTATIONS_STORE_PREFIX: 'dq/expectations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATIONS_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATIONS_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATIONS_STORE_PREFIX: 'dq/validations/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DATA_DOCS_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DATA_DOCS_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DATA_DOCS_PREFIX: 'dq/data_docs/site/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CHECKPOINT_STORE_PREFIX", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CHECKPOINT_STORE_PREFIX", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CHECKPOINT_STORE_PREFIX: 'dq/checkpoints/'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.VALIDATION_COLUMN_IDENTIFIER", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.VALIDATION_COLUMN_IDENTIFIER: 'validationresultidentifier'>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.CUSTOM_EXPECTATION_LIST", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.CUSTOM_EXPECTATION_LIST", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.CUSTOM_EXPECTATION_LIST: ['expect_column_values_to_be_date_not_older_than', 'expect_column_pair_a_to_be_smaller_or_equal_than_b', 'expect_multicolumn_column_a_must_equal_b_or_c', 'expect_queried_column_agg_value_to_be']>"}, {"fullname": "lakehouse_engine.core.definitions.DQDefaults.DQ_VALIDATIONS_SCHEMA", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQDefaults.DQ_VALIDATIONS_SCHEMA", "kind": "variable", "doc": "

\n", "default_value": "<DQDefaults.DQ_VALIDATIONS_SCHEMA: StructType([StructField('dq_validations', StructType([StructField('run_name', StringType(), True), StructField('run_success', BooleanType(), True), StructField('raised_exceptions', BooleanType(), True), StructField('run_row_success', BooleanType(), True), StructField('dq_failure_details', ArrayType(StructType([StructField('expectation_type', StringType(), True), StructField('kwargs', StringType(), True)]), True), True)]), True)])>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType", "kind": "class", "doc": "

Types of write operations.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.OVERWRITE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.OVERWRITE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.OVERWRITE: 'overwrite'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.COMPLETE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.COMPLETE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.COMPLETE: 'complete'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.APPEND", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.APPEND", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.APPEND: 'append'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.UPDATE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.UPDATE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.UPDATE: 'update'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.MERGE", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.MERGE", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.MERGE: 'merge'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.ERROR_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.ERROR_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.ERROR_IF_EXISTS: 'error'>"}, {"fullname": "lakehouse_engine.core.definitions.WriteType.IGNORE_IF_EXISTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "WriteType.IGNORE_IF_EXISTS", "kind": "variable", "doc": "

\n", "default_value": "<WriteType.IGNORE_IF_EXISTS: 'ignore'>"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec", "kind": "class", "doc": "

Specification of an algorithm input.

\n\n

This is very aligned with the way the execution environment connects to the sources\n(e.g., spark sources).

\n\n

spec_id: spec_id of the input specification.\nread_type: ReadType type of read operation.\ndata_format: format of the input.\nsftp_files_format: format of the files (csv, fwf, json, xml...) in a sftp\n directory.\ndf_name: dataframe name.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies from where to read data in the specified format.\nenforce_schema_from_table: if we want to enforce the table schema or not, by\n providing a table name in the form of db.table.\nquery: sql query to execute and return the dataframe. Use it if you do not want to\n read from a file system nor from a table, but rather from a sql query instead.\nschema: dict representation of a schema of the input (e.g., Spark struct type\n schema).\nschema_path: path to a file with a representation of a schema of the input (e.g.,\n Spark struct type schema).\nwith_filepath: if we want to include the path of the file that is being read. Only\n works with the file reader (batch and streaming modes are supported).\noptions: dict with other relevant options according to the execution\n environment (e.g., spark) possible sources.\ncalculate_upper_bound: whether to calculate the upper bound to extract from SAP BW or not.\ncalc_upper_bound_schema: specific schema for the calculated upper_bound.\ngenerate_predicates: whether to generate predicates to extract from SAP BW or not.\npredicates_add_null: if we want to include 'is null' in the partition by predicates.
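A minimal sketch of constructing an InputSpec for a batch read of delta files; the location is purely illustrative.

from lakehouse_engine.core.definitions import InputSpec

input_spec = InputSpec(
    spec_id="orders_bronze",
    read_type="batch",                         # ReadType.BATCH
    data_format="delta",                       # InputFormat.DELTAFILES
    location="s3://my-bucket/bronze/orders/",  # placeholder uri
    with_filepath=True,                        # include the source file path as a column
)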

\n"}, {"fullname": "lakehouse_engine.core.definitions.InputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "InputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tread_type: str,\tdata_format: Optional[str] = None,\tsftp_files_format: Optional[str] = None,\tdf_name: Optional[pyspark.sql.dataframe.DataFrame] = None,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tquery: Optional[str] = None,\tenforce_schema_from_table: Optional[str] = None,\tschema: Optional[dict] = None,\tschema_path: Optional[str] = None,\twith_filepath: bool = False,\toptions: Optional[dict] = None,\tjdbc_args: Optional[dict] = None,\tcalculate_upper_bound: bool = False,\tcalc_upper_bound_schema: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates_add_null: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec", "kind": "class", "doc": "

Transformer Specification, i.e., a single transformation amongst many.

\n\n

function: name of the function (or callable function) to be executed.\nargs: (not applicable if using a callable function) dict with the arguments to pass\nto the function, pairing the name of each function parameter with its\nrespective value.

\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformerSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformerSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: dict)"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec", "kind": "class", "doc": "

Transformation Specification.

\n\n

I.e., the specification that defines the many transformations to be done to the data\nthat was read.

\n\n

spec_id: id of the transform specification.\ninput_id: id of the corresponding input specification.\ntransformers: list of transformers to execute.\nforce_streaming_foreach_batch_processing: sometimes, when using streaming, we want\n to force the transform to be executed in the foreachBatch function to ensure\n non-supported streaming operations can be properly executed.
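A minimal sketch of a TransformSpec chaining two transformers; the transformer names and arguments are hypothetical placeholders, not a list of the transformers the engine actually ships.

from lakehouse_engine.core.definitions import TransformerSpec, TransformSpec

transform_spec = TransformSpec(
    spec_id="orders_transformed",
    input_id="orders_bronze",
    transformers=[
        # hypothetical transformer names, for illustration only
        TransformerSpec(function="rename", args={"cols": {"ord_id": "order_id"}}),
        TransformerSpec(function="repartition", args={"num_partitions": 10}),
    ],
)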

\n"}, {"fullname": "lakehouse_engine.core.definitions.TransformSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TransformSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\ttransformers: List[lakehouse_engine.core.definitions.TransformerSpec],\tforce_streaming_foreach_batch_processing: bool = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQType", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType", "kind": "class", "doc": "

Available data quality tasks.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.DQType.VALIDATOR", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.VALIDATOR", "kind": "variable", "doc": "

\n", "default_value": "<DQType.VALIDATOR: 'validator'>"}, {"fullname": "lakehouse_engine.core.definitions.DQType.ASSISTANT", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQType.ASSISTANT", "kind": "variable", "doc": "

\n", "default_value": "<DQType.ASSISTANT: 'assistant'>"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec", "kind": "class", "doc": "

Defines a data quality function specification.

\n\n

function - name of the data quality function (expectation) to execute.\nIt follows the great_expectations api https://greatexpectations.io/expectations/.\nargs - args of the function (expectation). Follow the same api as above.

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQFunctionSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQFunctionSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(function: str, args: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec", "kind": "class", "doc": "

Data quality overall specification.

\n\n
spec_id - id of the specification.\ninput_id - id of the input specification.\ndq_type - type of DQ process to execute (e.g. validator).\ndq_functions - list of function specifications to execute.\nunexpected_rows_pk - the list of columns composing the primary key of the\n    source data to identify the rows failing the DQ validations. Note: only one\n    of tbl_to_derive_pk or unexpected_rows_pk arguments need to be provided. It\n    is mandatory to provide one of these arguments when using tag_source_data\n    as True. When tag_source_data is False, this is not mandatory, but still\n    recommended.\ntbl_to_derive_pk - db.table to automatically derive the unexpected_rows_pk from.\n    Note: only one of tbl_to_derive_pk or unexpected_rows_pk arguments need to\n    be provided. It is mandatory to provide one of these arguments when using\n    tag_source_data as True. hen tag_source_data is False, this is not\n    mandatory, but still recommended.\ngx_result_format - great expectations result format. Default: \"COMPLETE\".\n
\n\n

tag_source_data - when set to true, this will ensure that the DQ process ends by\n tagging the source data with an additional column with information about the\n DQ results. This column makes it possible to identify if the DQ run\n succeeded in general and, if not, it unlocks the insights to know what\n specific rows have made the DQ validations fail and why. Default: False.\n Note: it only works if result_sink_explode is True, gx_result_format is\n COMPLETE, fail_on_error is False (which is done automatically when\n you specify tag_source_data as True) and tbl_to_derive_pk or\n unexpected_rows_pk is configured.\n store_backend - which store_backend to use (e.g. s3 or file_system).\n local_fs_root_dir - path of the root directory. Note: only applicable for\n store_backend file_system.\n bucket - the bucket name to consider for the store_backend (store DQ artefacts).\n Note: only applicable for store_backend s3.\n data_docs_bucket - the bucket name for data docs only. When defined, it will\n supersede bucket parameter.\n expectations_store_prefix - prefix where to store expectations' data. Note: only\n applicable for store_backend s3.\n validations_store_prefix - prefix where to store validations' data. Note: only\n applicable for store_backend s3.\n data_docs_prefix - prefix where to store data_docs' data. Note: only applicable\n for store_backend s3.\n checkpoint_store_prefix - prefix where to store checkpoints' data. Note: only\n applicable for store_backend s3.\n data_asset_name - name of the data asset to consider when configuring the great\n expectations' data source.\n expectation_suite_name - name to consider for great expectations' suite.\n assistant_options - additional options to pass to the DQ assistant processor.\n result_sink_db_table - db.table_name indicating the database and table in which\n to save the results of the DQ process.\n result_sink_location - file system location in which to save the results of the\n DQ process.\n result_sink_partitions - the list of partitions to consider.\n result_sink_format - format of the result table (e.g. delta, parquet, kafka...).\n result_sink_options - extra spark options for configuring the result sink.\n E.g: can be used to configure a Kafka sink if result_sink_format is kafka.\n result_sink_explode - flag to determine if the output table/location should have\n the columns exploded (as True) or not (as False). Default: True.\n result_sink_extra_columns - list of extra columns to be exploded (following\n the pattern \".*\") or columns to be selected. It is only used when\n result_sink_explode is set to True.\n source - name of data source, to be easier to identify in analysis. If not\n specified, it is set as default . This will only be used\n when result_sink_explode is set to True.\n fail_on_error - whether to fail the algorithm if the validations of your data in\n the DQ process failed.\n cache_df - whether to cache the dataframe before running the DQ process or not.\n critical_functions - functions that should not fail. When this argument is\n defined, fail_on_error is nullified.\n max_percentage_failure - percentage of failure that should be allowed.\n This argument has priority over both fail_on_error and critical_functions.
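A minimal sketch of a DQSpec combining two expectations with a result sink; bucket and table names are placeholders.

from lakehouse_engine.core.definitions import DQFunctionSpec, DQSpec

dq_spec = DQSpec(
    spec_id="orders_dq",
    input_id="orders_transformed",
    dq_type="validator",
    dq_functions=[
        DQFunctionSpec("expect_column_values_to_not_be_null", {"column": "order_id"}),
        DQFunctionSpec("expect_table_row_count_to_be_between", {"min_value": 1}),
    ],
    bucket="my-dq-bucket",                        # where the DQ artefacts/site are stored (store_backend s3)
    result_sink_db_table="monitoring.orders_dq",  # also persist the DQ results in a table
    fail_on_error=False,
)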

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\tdq_type: str,\tdq_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tunexpected_rows_pk: Optional[List[str]] = None,\ttbl_to_derive_pk: Optional[str] = None,\tgx_result_format: Optional[str] = 'COMPLETE',\ttag_source_data: Optional[bool] = False,\tassistant_options: Optional[dict] = None,\tstore_backend: str = 's3',\tlocal_fs_root_dir: Optional[str] = None,\tbucket: Optional[str] = None,\tdata_docs_bucket: Optional[str] = None,\texpectations_store_prefix: str = 'dq/expectations/',\tvalidations_store_prefix: str = 'dq/validations/',\tdata_docs_prefix: str = 'dq/data_docs/site/',\tcheckpoint_store_prefix: str = 'dq/checkpoints/',\tdata_asset_name: Optional[str] = None,\texpectation_suite_name: Optional[str] = None,\tresult_sink_db_table: Optional[str] = None,\tresult_sink_location: Optional[str] = None,\tresult_sink_partitions: Optional[List[str]] = None,\tresult_sink_format: str = 'delta',\tresult_sink_options: Optional[dict] = None,\tresult_sink_explode: bool = True,\tresult_sink_extra_columns: Optional[List[str]] = None,\tsource: Optional[str] = None,\tfail_on_error: bool = True,\tcache_df: bool = False,\tcritical_functions: Optional[List[lakehouse_engine.core.definitions.DQFunctionSpec]] = None,\tmax_percentage_failure: Optional[float] = None)"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions", "kind": "class", "doc": "

Options for a merge operation.

\n\n

merge_predicate: predicate to apply to the merge operation so that we can check if a\n new record corresponds to a record already included in the historical data.\ninsert_only: indicates if the merge should only insert data (e.g., deduplicate\n scenarios).\ndelete_predicate: predicate to apply to the delete operation.\nupdate_predicate: predicate to apply to the update operation.\ninsert_predicate: predicate to apply to the insert operation.\nupdate_column_set: rules to apply to the update operation which allows to set the\n value for each column to be updated.\n (e.g. {\"data\": \"new.data\", \"count\": \"current.count + 1\"} )\ninsert_column_set: rules to apply to the insert operation which allows to set the\n value for each column to be inserted.\n (e.g. {\"date\": \"updates.date\", \"count\": \"1\"} )
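A minimal sketch of MergeOptions for an upsert; following the examples above, 'current' refers to the data already written and 'new' to the incoming data, and the predicates are placeholders.

from lakehouse_engine.core.definitions import MergeOptions

merge_opts = MergeOptions(
    merge_predicate="current.order_id = new.order_id",        # match on the business key
    update_predicate="new.updated_at > current.updated_at",   # only update when the incoming record is newer
    insert_only=False,
)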

\n"}, {"fullname": "lakehouse_engine.core.definitions.MergeOptions.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "MergeOptions.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmerge_predicate: str,\tinsert_only: bool = False,\tdelete_predicate: Optional[str] = None,\tupdate_predicate: Optional[str] = None,\tinsert_predicate: Optional[str] = None,\tupdate_column_set: Optional[dict] = None,\tinsert_column_set: Optional[dict] = None)"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec", "kind": "class", "doc": "

Specification of an algorithm output.

\n\n

This is very aligned with the way the execution environment connects to the output\nsystems (e.g., spark outputs).

\n\n

spec_id: id of the output specification.\ninput_id: id of the corresponding input specification.\nwrite_type: type of write operation.\ndata_format: format of the output. Defaults to DELTA.\ndb_table: table name in the form of db.table.\nlocation: uri that identifies from where to write data in the specified format.\npartitions: list of partition input_col names.\nmerge_opts: options to apply to the merge operation.\nstreaming_micro_batch_transformers: transformers to invoke for each streaming micro\n batch, before writing (i.e., in Spark's foreachBatch structured\n streaming function). Note: the lakehouse engine manages this for you, so\n you don't have to manually specify streaming transformations through this\n parameter. Supply them as regular transformers in the transform_specs sections\n of an ACON.\nstreaming_once: if the streaming query is to be executed just once, or not,\n generating just one micro batch.\nstreaming_processing_time: if the streaming query is to be kept alive, this indicates\n the processing time of each micro batch.\nstreaming_available_now: if set to True, set a trigger that processes all available\n data in multiple batches then terminates the query.\n When using streaming, this is the default trigger that the lakehouse-engine will\n use, unless you configure a different one.\nstreaming_continuous: set a trigger that runs a continuous query with a given\n checkpoint interval.\nstreaming_await_termination: whether to wait (True) for the termination of the\n streaming query (e.g. timeout or exception) or not (False). Default: True.\nstreaming_await_termination_timeout: a timeout to set to the\n streaming_await_termination. Default: None.\nwith_batch_id: whether to include the streaming batch id in the final data, or not.\n It only takes effect in streaming mode.\noptions: dict with other relevant options according to the execution environment\n (e.g., spark) possible outputs. E.g.,: JDBC options, checkpoint location for\n streaming, etc.\nstreaming_micro_batch_dq_processors: similar to streaming_micro_batch_transformers\n but for the DQ functions to be executed. Used internally by the lakehouse\n engine, so you don't have to supply DQ functions through this parameter. Use the\n dq_specs of the acon instead.
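A minimal sketch of an OutputSpec that merges the transformed data into a delta table; names are placeholders.

from lakehouse_engine.core.definitions import MergeOptions, OutputSpec

output_spec = OutputSpec(
    spec_id="orders_silver",
    input_id="orders_transformed",
    write_type="merge",                 # WriteType.MERGE
    data_format="delta",
    db_table="silver.orders",
    partitions=["order_date"],
    merge_opts=MergeOptions(merge_predicate="current.order_id = new.order_id"),
)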

\n"}, {"fullname": "lakehouse_engine.core.definitions.OutputSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "OutputSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tspec_id: str,\tinput_id: str,\twrite_type: str,\tdata_format: str = 'delta',\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tmerge_opts: Optional[lakehouse_engine.core.definitions.MergeOptions] = None,\tpartitions: Optional[List[str]] = None,\tstreaming_micro_batch_transformers: Optional[List[lakehouse_engine.core.definitions.TransformerSpec]] = None,\tstreaming_once: Optional[bool] = None,\tstreaming_processing_time: Optional[str] = None,\tstreaming_available_now: bool = True,\tstreaming_continuous: Optional[str] = None,\tstreaming_await_termination: bool = True,\tstreaming_await_termination_timeout: Optional[int] = None,\twith_batch_id: bool = False,\toptions: Optional[dict] = None,\tstreaming_micro_batch_dq_processors: Optional[List[lakehouse_engine.core.definitions.DQSpec]] = None)"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec", "kind": "class", "doc": "

Terminator Specification.

\n\n

I.e., the specification that defines a terminator operation to be executed. Examples\nare compute statistics, vacuum, optimize, etc.

\n\n

spec_id: id of the terminate specification.\nfunction: terminator function to execute.\nargs: arguments of the terminator function.\ninput_id: id of the corresponding output specification (Optional).
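A minimal sketch of a TerminatorSpec; 'optimize' and its arguments are assumed names used only to illustrate the examples mentioned above.

from lakehouse_engine.core.definitions import TerminatorSpec

terminator_spec = TerminatorSpec(
    function="optimize",                 # hypothetical terminator function name
    args={"db_table": "silver.orders"},  # hypothetical arguments
    input_id="orders_silver",            # run after this output specification
)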

\n"}, {"fullname": "lakehouse_engine.core.definitions.TerminatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "TerminatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tfunction: str,\targs: Optional[dict] = None,\tinput_id: Optional[str] = None)"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec", "kind": "class", "doc": "

Reconciliator Specification.

\n\n

metrics: list of metrics in the form of:\n [{\n metric: name of the column present in both truth and current datasets,\n aggregation: sum, avg, max, min, ...,\n type: percentage or absolute,\n yellow: value,\n red: value\n }].\nrecon_type: reconciliation type (percentage or absolute). Percentage calculates\n the difference between truth and current results as a percentage ((x - y) / x), and\n absolute calculates the raw difference (x - y).\ntruth_input_spec: input specification of the truth data.\ncurrent_input_spec: input specification of the current results data.\ntruth_preprocess_query: additional query on top of the truth input data to\n preprocess the truth data before it gets fueled into the reconciliation process.\n Important note: you need to assume that the data out of\n the truth_input_spec is referencable by a table called 'truth'.\ntruth_preprocess_query_args: optional dict having the functions/transformations to\n apply on top of the truth_preprocess_query and respective arguments. Note: cache\n is being applied on the Dataframe, by default. For turning the default behavior\n off, pass \"truth_preprocess_query_args\": [].\ncurrent_preprocess_query: additional query on top of the current results input data\n to preprocess the current results data before it gets fueled into the\n reconciliation process. Important note: you need to assume that the data out of\n the current_input_spec is referencable by a table called 'current'.\ncurrent_preprocess_query_args: optional dict having the functions/transformations to\n apply on top of the current_preprocess_query and respective arguments. Note:\n cache is being applied on the Dataframe, by default. For turning the default\n behavior off, pass \"current_preprocess_query_args\": [].\nignore_empty_df: optional boolean to skip the recon process if both the source and target\n dataframes are empty; in that case the recon exits with success (passed).

\n"}, {"fullname": "lakehouse_engine.core.definitions.ReconciliatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "ReconciliatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tmetrics: List[dict],\ttruth_input_spec: lakehouse_engine.core.definitions.InputSpec,\tcurrent_input_spec: lakehouse_engine.core.definitions.InputSpec,\ttruth_preprocess_query: Optional[str] = None,\ttruth_preprocess_query_args: Optional[List[dict]] = None,\tcurrent_preprocess_query: Optional[str] = None,\tcurrent_preprocess_query_args: Optional[List[dict]] = None,\tignore_empty_df: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec", "kind": "class", "doc": "

Data Quality Validator Specification.

\n\n

input_spec: input specification of the data to be checked/validated.\ndq_spec: data quality specification.\nrestore_prev_version: specify if, having\ndelta table/files as input, they should be restored to the\nprevious version if the data quality process fails. Note: this\nis only considered if fail_on_error is kept as True.

\n"}, {"fullname": "lakehouse_engine.core.definitions.DQValidatorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "DQValidatorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\trestore_prev_version: Optional[bool] = False)"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions", "kind": "class", "doc": "

SQL definitions statements.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.compute_table_stats", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.compute_table_stats", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.compute_table_stats: 'ANALYZE TABLE {} COMPUTE STATISTICS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_table_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_table_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_table_stmt: 'DROP TABLE IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.drop_view_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.drop_view_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.drop_view_stmt: 'DROP VIEW IF EXISTS'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.truncate_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.truncate_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.truncate_stmt: 'TRUNCATE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.describe_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.describe_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.describe_stmt: 'DESCRIBE TABLE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.optimize_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.optimize_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.optimize_stmt: 'OPTIMIZE'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.show_tbl_props_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.show_tbl_props_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.show_tbl_props_stmt: 'SHOW TBLPROPERTIES'>"}, {"fullname": "lakehouse_engine.core.definitions.SQLDefinitions.delete_where_stmt", "modulename": "lakehouse_engine.core.definitions", "qualname": "SQLDefinitions.delete_where_stmt", "kind": "variable", "doc": "

\n", "default_value": "<SQLDefinitions.delete_where_stmt: 'DELETE FROM {} WHERE {}'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys", "kind": "class", "doc": "

File Manager s3 api keys.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTENTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTENTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTENTS: 'Contents'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.KEY", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.KEY", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.KEY: 'Key'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.CONTINUATION", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.CONTINUATION", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.CONTINUATION: 'NextContinuationToken'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.BUCKET", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.BUCKET", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.BUCKET: 'Bucket'>"}, {"fullname": "lakehouse_engine.core.definitions.FileManagerAPIKeys.OBJECTS", "modulename": "lakehouse_engine.core.definitions", "qualname": "FileManagerAPIKeys.OBJECTS", "kind": "variable", "doc": "

\n", "default_value": "<FileManagerAPIKeys.OBJECTS: 'Objects'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec", "kind": "class", "doc": "

Sensor Specification.

\n\n

sensor_id: sensor id.\nassets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.\ncontrol_db_table_name: db.table to store sensor metadata.\ninput_spec: input specification of the source to be checked for new data.\npreprocess_query: SQL query to transform/filter the result from the\n upstream. Consider that we should refer to 'new_data' whenever\n we are referring to the input of the sensor. E.g.:\n \"SELECT dummy_col FROM new_data WHERE ...\"\ncheckpoint_location: optional location to store checkpoints to resume\n from. These checkpoints use the same strategy as Spark checkpoints.\n For Spark readers that do not support checkpoints, use the\n preprocess_query parameter to form a SQL query to filter the result\n from the upstream accordingly.\nfail_on_empty_result: if the sensor should throw an error if there is no new\n data in the upstream. Default: True.
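A minimal sketch of a SensorSpec watching an upstream delta location; names, paths and the preprocess query are placeholders that follow the 'new_data' convention described above.

from lakehouse_engine.core.definitions import InputSpec, SensorSpec

sensor_spec = SensorSpec(
    sensor_id="orders_upstream_sensor",
    assets=["orders"],
    control_db_table_name="control.sensors",  # db.table storing the sensor metadata
    input_spec=InputSpec(
        spec_id="upstream",
        read_type="streaming",
        data_format="delta",
        location="s3://my-bucket/bronze/orders/",  # placeholder upstream location
    ),
    preprocess_query="SELECT * FROM new_data WHERE order_date > '2024-01-01'",
    checkpoint_location="s3://my-bucket/checkpoints/orders_sensor/",
    fail_on_empty_result=False,
)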

\n"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.__init__", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.__init__", "kind": "function", "doc": "

\n", "signature": "(\tsensor_id: str,\tassets: List[str],\tcontrol_db_table_name: str,\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\tpreprocess_query: Optional[str],\tcheckpoint_location: Optional[str],\tfail_on_empty_result: bool = True)"}, {"fullname": "lakehouse_engine.core.definitions.SensorSpec.create_from_acon", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorSpec.create_from_acon", "kind": "function", "doc": "

Create SensorSpec from acon.

\n\n

Args:\n acon: sensor ACON.

\n", "signature": "(cls, acon: dict):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus", "kind": "class", "doc": "

Status for a sensor.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.ACQUIRED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.ACQUIRED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.ACQUIRED_NEW_DATA: 'ACQUIRED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SensorStatus.PROCESSED_NEW_DATA", "modulename": "lakehouse_engine.core.definitions", "qualname": "SensorStatus.PROCESSED_NEW_DATA", "kind": "variable", "doc": "

\n", "default_value": "<SensorStatus.PROCESSED_NEW_DATA: 'PROCESSED_NEW_DATA'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain", "kind": "class", "doc": "

Defaults used on consuming data from SAP Logchain.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.DBTABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.DBTABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.DBTABLE: 'SAPPHA.RSPCLOGCHAIN'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.GREEN_STATUS", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.GREEN_STATUS", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.GREEN_STATUS: 'G'>"}, {"fullname": "lakehouse_engine.core.definitions.SAPLogchain.ENGINE_TABLE", "modulename": "lakehouse_engine.core.definitions", "qualname": "SAPLogchain.ENGINE_TABLE", "kind": "variable", "doc": "

\n", "default_value": "<SAPLogchain.ENGINE_TABLE: 'sensor_new_data'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType", "kind": "class", "doc": "

Archive types.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.BULK", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.BULK", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.BULK: 'Bulk'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.STANDARD", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.STANDARD", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.STANDARD: 'Standard'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.EXPEDITED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.EXPEDITED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreType.EXPEDITED: 'Expedited'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.values", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.values", "kind": "function", "doc": "

Generates a list containing all enum values.

\n\n

Return:\n A list with all enum values.

\n", "signature": "(cls):", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreType.exists", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreType.exists", "kind": "function", "doc": "

Checks if the restore type exists in the enum values.

\n\n

Args:\n restore_type: restore type to check if exists.

\n\n

Return:\n If the restore type exists in our enum.

\n", "signature": "(cls, restore_type: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus", "kind": "class", "doc": "

Restore statuses.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.NOT_STARTED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.NOT_STARTED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.NOT_STARTED: 'not_started'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.ONGOING", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.ONGOING", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.ONGOING: 'ongoing'>"}, {"fullname": "lakehouse_engine.core.definitions.RestoreStatus.RESTORED", "modulename": "lakehouse_engine.core.definitions", "qualname": "RestoreStatus.RESTORED", "kind": "variable", "doc": "

\n", "default_value": "<RestoreStatus.RESTORED: 'restored'>"}, {"fullname": "lakehouse_engine.core.exec_env", "modulename": "lakehouse_engine.core.exec_env", "kind": "module", "doc": "

Module to take care of creating a singleton of the execution environment class.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv", "kind": "class", "doc": "

Represents the basic resources regarding the engine execution environment.

\n\n

Currently, it is solely used to encapsulate the logic to get a Spark session.

\n"}, {"fullname": "lakehouse_engine.core.exec_env.ExecEnv.get_or_create", "modulename": "lakehouse_engine.core.exec_env", "qualname": "ExecEnv.get_or_create", "kind": "function", "doc": "

Get or create an execution environment session (currently Spark).

\n\n

It instantiates a singleton session that can be accessed anywhere from the\nlakehouse engine.

\n\n

Args:\n session: spark session.\n enable_hive_support: whether to enable hive support or not.\n app_name: application name.\n config: extra spark configs to supply to the spark session.

\n", "signature": "(\tcls,\tsession: pyspark.sql.session.SparkSession = None,\tenable_hive_support: bool = True,\tapp_name: str = None,\tconfig: dict = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.executable", "modulename": "lakehouse_engine.core.executable", "kind": "module", "doc": "

Module representing an executable lakehouse engine component.

\n"}, {"fullname": "lakehouse_engine.core.executable.Executable", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable", "kind": "class", "doc": "

Abstract class defining the behaviour of an executable component.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.core.executable.Executable.execute", "modulename": "lakehouse_engine.core.executable", "qualname": "Executable.execute", "kind": "function", "doc": "

Define the executable component behaviour.

\n\n

E.g., the behaviour of an algorithm inheriting from this.

\n", "signature": "(self) -> Optional[Any]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager", "modulename": "lakehouse_engine.core.file_manager", "kind": "module", "doc": "

File manager module.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager", "kind": "class", "doc": "

Set of actions to manipulate files in several ways.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.__init__", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.__init__", "kind": "function", "doc": "

Construct FileManager algorithm instances.

\n\n

Args:\n configs: configurations for the FileManager algorithm.

\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.get_function", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.delete_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.delete_objects", "kind": "function", "doc": "

Delete objects and 'directories' in s3.

\n\n

If dry_run is set to True the function will print a dict with all the\npaths that would be deleted based on the given keys.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.copy_objects", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.copy_objects", "kind": "function", "doc": "

Copy objects and 'directories' in s3.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.FileManager.request_restore_to_destination_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "FileManager.request_restore_to_destination_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n

After the restore is done, copy the restored files to the destination.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager", "kind": "class", "doc": "

Set of actions to restore archives.

\n"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.check_restore_status", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.check_restore_status", "kind": "function", "doc": "

Check the restore status of archived data.

\n\n

Args:\n source_bucket: name of bucket to check the restore status.\n source_object: object to check the restore status.

\n\n

Returns:\n A dict containing the number of objects in each status.

\n", "signature": "(source_bucket: str, source_object: str) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore", "kind": "function", "doc": "

Request the restore of archived data.

\n\n

Args:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.

\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.file_manager.ArchiveFileManager.request_restore_and_wait", "modulename": "lakehouse_engine.core.file_manager", "qualname": "ArchiveFileManager.request_restore_and_wait", "kind": "function", "doc": "

Request and wait for the restore to complete, polling the restore status.

\n\n

Args:\n source_bucket: name of bucket to perform the restore.\n source_object: object to be restored.\n restore_expiration: restore expiration in days.\n retrieval_tier: type of restore, possible values are:\n Bulk, Standard or Expedited.\n dry_run: if dry_run is set to True the function will print a dict with\n all the paths that would be restored based on the given keys.

\n", "signature": "(\tsource_bucket: str,\tsource_object: str,\trestore_expiration: int,\tretrieval_tier: str,\tdry_run: bool) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager", "modulename": "lakehouse_engine.core.sensor_manager", "kind": "module", "doc": "

Module to define Sensor Manager classes.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager", "kind": "class", "doc": "

Class to control the Sensor execution.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.check_if_sensor_has_acquired_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.check_if_sensor_has_acquired_data", "kind": "function", "doc": "

Check if sensor has acquired new data.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to control sensor runs.

\n\n

Returns:\n True if it acquired new data, otherwise False.

\n", "signature": "(cls, sensor_id: str, control_db_table_name: str) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.update_sensor_status", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.update_sensor_status", "kind": "function", "doc": "

Control sensor execution storing the execution data in a delta table.

\n\n

Args:\n sensor_spec: sensor spec containing all sensor\n information we need to update the control status.\n status: status of the sensor.\n upstream_key: upstream key (e.g., used to store an attribute\n name from the upstream so that new data can be detected\n automatically).\n upstream_value: upstream value (e.g., used to store the max\n attribute value from the upstream so that new data can be\n detected automatically).

\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec,\tstatus: str,\tupstream_key: str = None,\tupstream_value: str = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorControlTableManager.read_sensor_table_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorControlTableManager.read_sensor_table_data", "kind": "function", "doc": "

Read data from delta table containing sensor status info.

\n\n

Args:\n sensor_id: sensor id. If this parameter is defined, the search occurs\n only considering this parameter. Otherwise, it considers sensor\n assets and checkpoint location.\n control_db_table_name: db.table to control sensor runs.\n assets: list of assets that are fueled by the pipeline\n where this sensor is used.

\n\n

Return:\n Row containing the data for the provided sensor_id.

\n", "signature": "(\tcls,\tcontrol_db_table_name: str,\tsensor_id: str = None,\tassets: list = None) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager", "kind": "class", "doc": "

Class to deal with Sensor Upstream data.

\n"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_filter_exp_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_filter_exp_query", "kind": "function", "doc": "

Generates a sensor preprocess query based on timestamp logic.

\n\n

Args:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholder ?upstream_value so that\n it can be replaced by the upstream_value in the\n control_db_table_name for this specific sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information\n to control how to identify new data from the\n upstream (e.g., a time column in the upstream).\n upstream_value: value for custom sensor\n to identify new data from the upstream\n (e.g., the value of a time present in the upstream).\n If None, we will set the default value.\n Note: This parameter is used just to override the\n default value -2147483647.\n upstream_table_name: value for custom sensor\n to query new data from the upstream.\n If None, we will set the default value,\n our sensor_new_data view.

\n\n

Return:\n The query string.

\n", "signature": "(\tcls,\tsensor_id: str,\tfilter_exp: str,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_table_preprocess_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_table_preprocess_query", "kind": "function", "doc": "

Generates a query to be used for a sensor having other sensor as upstream.

\n\n

Args:\n sensor_id: sensor id.

\n\n

Return:\n The query string.

\n", "signature": "(cls, sensor_id: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.read_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.read_new_data", "kind": "function", "doc": "

Read new data from the upstream into the sensor 'new_data_df'.

\n\n

Args:\n sensor_spec: sensor spec containing all sensor information.

\n\n

Return:\n An empty dataframe if there is no new data, otherwise the new data.

\n", "signature": "(\tcls,\tsensor_spec: lakehouse_engine.core.definitions.SensorSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.get_new_data", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.get_new_data", "kind": "function", "doc": "

Get new data from upstream df if it's present.

\n\n

Args:\n new_data_df: DataFrame possibly containing new data.

\n\n

Return:\n Optional row, present if there is new data in the upstream,\n absent otherwise.

\n", "signature": "(\tcls,\tnew_data_df: pyspark.sql.dataframe.DataFrame) -> Optional[pyspark.sql.types.Row]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.sensor_manager.SensorUpstreamManager.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.core.sensor_manager", "qualname": "SensorUpstreamManager.generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n

Args:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the SAP chain is already finished.\n status: status value that marks the SAP chain as successfully\n finished (default: the green status 'G').\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.

\n\n

Return:\n The query string.

\n", "signature": "(\tcls,\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager", "modulename": "lakehouse_engine.core.table_manager", "kind": "module", "doc": "

Table manager module.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager", "kind": "class", "doc": "

Set of actions to manipulate tables/views in several ways.

\n"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.__init__", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.__init__", "kind": "function", "doc": "

Construct TableManager algorithm instances.

\n\n

Args:\n configs: configurations for the TableManager algorithm.

\n", "signature": "(configs: dict)"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_function", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_function", "kind": "function", "doc": "

Get a specific function to execute.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create", "kind": "function", "doc": "

Create a new table or view on metastore.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.create_many", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.create_many", "kind": "function", "doc": "

Create multiple tables or views on metastore.

\n\n

In this function, the paths to the DDL files can be separated by commas.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.compute_table_statistics", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.compute_table_statistics", "kind": "function", "doc": "

Compute table statistics.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_table", "kind": "function", "doc": "

Delete table function deletes the table from the metastore and erases all data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.drop_view", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.drop_view", "kind": "function", "doc": "

Delete view function deletes the view from the metastore and erases all data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.truncate", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.truncate", "kind": "function", "doc": "

Truncate function erases all data but keeps metadata.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.vacuum", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.vacuum", "kind": "function", "doc": "

Vacuum function erases older versions from Delta Lake tables or locations.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.describe", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.describe", "kind": "function", "doc": "

Describe function describes the metadata of a table or view.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.optimize", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.optimize", "kind": "function", "doc": "

Optimize function optimizes the layout of Delta Lake data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_multiple_sql_files", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_multiple_sql_files", "kind": "function", "doc": "

Execute multiple statements in multiple sql files.

\n\n

In this function, the paths to the files are separated by commas.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.execute_sql", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.execute_sql", "kind": "function", "doc": "

Execute SQL commands separated by semicolons (;).

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.show_tbl_properties", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.show_tbl_properties", "kind": "function", "doc": "

Show Table Properties.

\n\n

Returns: a dataframe with the table properties.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.get_tbl_pk", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.get_tbl_pk", "kind": "function", "doc": "

Get the primary key of a particular table.

\n\n

Returns: the list of columns that are part of the primary key.

\n", "signature": "(self) -> List[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.repair_table", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.repair_table", "kind": "function", "doc": "

Run the repair table command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.core.table_manager.TableManager.delete_where", "modulename": "lakehouse_engine.core.table_manager", "qualname": "TableManager.delete_where", "kind": "function", "doc": "

Run the delete where command.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors", "modulename": "lakehouse_engine.dq_processors", "kind": "module", "doc": "

Package to define data quality processes available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "kind": "module", "doc": "

Module containing the definition of a data assistant.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant", "kind": "class", "doc": "

Class containing the data assistant.

\n"}, {"fullname": "lakehouse_engine.dq_processors.assistant.Assistant.run_data_assistant", "modulename": "lakehouse_engine.dq_processors.assistant", "qualname": "Assistant.run_data_assistant", "kind": "function", "doc": "

Entrypoint to run the data assistant.

\n\n

Based on the data, it uses the GE Onboarding Data Assistant to generate expectations\nthat can be applied to the data. Then, it returns the generated expectations\nand, depending on your configuration, it can display plots of the metrics and\nexpectations, and also display or store the profiling of the data, so you can get\na better sense of it.

\n\n

Args:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n assistant_options: additional options to pass to the DQ assistant processor.\n data: the input dataframe for which the DQ is running.\n profile_file_name: file name for storing the profiling html file.

\n\n

Returns:\n The context with the expectation suite stored.

\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tassistant_options: dict,\tdata: pyspark.sql.dataframe.DataFrame,\tprofile_file_name: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations", "modulename": "lakehouse_engine.dq_processors.custom_expectations", "kind": "module", "doc": "

Package containing custom DQ expectations available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "kind": "module", "doc": "

Expectation to check if column 'a' is lower than or equal to column 'b'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ColumnPairCustom", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ColumnPairCustom", "kind": "class", "doc": "

Asserts that column 'A' is lower than or equal to column 'B'.

\n\n

Additionally, the 'margin' parameter can be used to add a margin to the\ncheck between column 'A' and 'B': 'A' <= 'B' + 'margin'.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_pair_map_metric_provider.ColumnPairMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b.ExpectColumnPairAToBeSmallerOrEqualThanB", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_pair_a_to_be_smaller_or_equal_than_b", "qualname": "ExpectColumnPairAToBeSmallerOrEqualThanB", "kind": "class", "doc": "

Expect values in column A to be lower than or equal to column B.

\n\n

Args:\n column_A: The first column name.\n column_B: The second column name.\n margin: additional approximation to column B value.

\n\n

Keyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.ColumnPairMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "kind": "module", "doc": "

Expectation to check if column value is a date within a timeframe.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ColumnValuesDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ColumnValuesDateNotOlderThan", "kind": "class", "doc": "

Asserts that column values are a date that isn't older than a given date.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.column_map_metric_provider.ColumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than.ExpectColumnValuesToBeDateNotOlderThan", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_column_values_to_be_date_not_older_than", "qualname": "ExpectColumnValuesToBeDateNotOlderThan", "kind": "class", "doc": "

Expect value in column to be date that is not older than a given time.

\n\n

Since timedelta can only define an interval up to weeks, a month is defined\nas 4 weeks and a year is defined as 52 weeks.

\n\n

Args:\n column: Name of column to validate\n Note: Column must be of type Date, Timestamp or String (with Timestamp format).\n Format: yyyy-MM-ddTHH:mm:ss\n timeframe: dict with the definition of the timeframe.\n kwargs: dict with additional parameters.

\n\n

Keyword Args:\n allow_cross_type_comparisons: If True, allow\n comparisons between types (e.g. integer and string).\n Otherwise, attempting such comparisons will raise an exception.\n ignore_row_if: \"both_values_are_missing\",\n \"either_value_is_missing\", \"neither\" (default).\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC (default), COMPLETE, or SUMMARY.\n include_config: If True (default), then include the expectation config\n as part of the result object.\n catch_exceptions: If True, then catch exceptions and\n include them as part of the result object. Default: False.\n meta: A JSON-serializable dictionary (nesting allowed)\n that will be included in the output without modification.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.ColumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "kind": "module", "doc": "

Expectation to check if column 'a' equals 'b', or 'c'.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.MulticolumnCustomMetric", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "MulticolumnCustomMetric", "kind": "class", "doc": "

Expectation metric definition.

\n\n

This expectation asserts that column 'a' must be equal to column 'b' or column 'c'.\nIn addition, it is possible to validate that column 'b' or 'c' matches a regex.

\n", "bases": "great_expectations.expectations.metrics.map_metric_provider.multicolumn_map_metric_provider.MulticolumnMapMetricProvider"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c.ExpectMulticolumnColumnAMustEqualBOrC", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_multicolumn_column_a_must_equal_b_or_c", "qualname": "ExpectMulticolumnColumnAMustEqualBOrC", "kind": "class", "doc": "

MultiColumn Expectation.

\n\n

Expect that column 'a' is equal to 'b' when 'b' is\nnot empty; otherwise 'a' must be equal to 'c'.

\n\n

Args:\n column_list: The column names to evaluate.

\n\n

Keyword Args:\n ignore_row_if: default to \"never\".\n result_format: Which output mode to use:\n BOOLEAN_ONLY, BASIC, COMPLETE, or SUMMARY.\n Default set to BASIC.\n include_config: If True, then include the expectation\n config as part of the result object.\n Default set to True.\n catch_exceptions: If True, then catch exceptions\n and include them as part of the result object.\n Default set to False.

\n\n

Returns:\n An ExpectationSuiteValidationResult.

\n", "bases": "great_expectations.expectations.expectation.MulticolumnMapExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "kind": "module", "doc": "

Expectation to check if the aggregated column satisfies the condition.

\n"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe", "kind": "class", "doc": "

Expect the aggregation of a column to satisfy the specified condition.

\n\n

Args:\n template_dict: dict with the following keys:\n column (column to aggregate).\n group_column_list (group by column names to be listed).\n condition (how to validate the aggregated value, e.g.: between,\n greater, lesser).\n max_value (maximum allowed value).\n min_value (minimum allowed value).\n agg_type (sum/count/max/min).

\n", "bases": "great_expectations.expectations.expectation.QueryExpectation"}, {"fullname": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be.ExpectQueriedColumnAggValueToBe.validate_configuration", "modulename": "lakehouse_engine.dq_processors.custom_expectations.expect_queried_column_agg_value_to_be", "qualname": "ExpectQueriedColumnAggValueToBe.validate_configuration", "kind": "function", "doc": "

Validates that a configuration has been set.

\n\n

Args:\n configuration (OPTIONAL[ExpectationConfiguration]):\n An optional Expectation Configuration entry.

\n\n

Returns:\n None. Raises InvalidExpectationConfigurationError if the configuration is invalid.

\n", "signature": "(\tself,\tconfiguration: Optional[great_expectations.core.expectation_configuration.ExpectationConfiguration] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "kind": "module", "doc": "

Module containing the class definition of the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory", "kind": "class", "doc": "

Class for the Data Quality Factory.

\n"}, {"fullname": "lakehouse_engine.dq_processors.dq_factory.DQFactory.run_dq_process", "modulename": "lakehouse_engine.dq_processors.dq_factory", "qualname": "DQFactory.run_dq_process", "kind": "function", "doc": "

Run the specified data quality process on a dataframe.

\n\n

Based on the dq_specs, we apply the defined expectations on top of the dataframe\nto run the necessary validations and then output the result of\nthe data quality process.

\n\n

Args:\n dq_spec: data quality specification.\n data: input dataframe to run the dq process on.

\n\n

Returns:\n The DataFrame containing the results of the DQ process.

\n", "signature": "(\tcls,\tdq_spec: lakehouse_engine.core.definitions.DQSpec,\tdata: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.exceptions", "modulename": "lakehouse_engine.dq_processors.exceptions", "kind": "module", "doc": "

Package defining all the DQ custom exceptions.

\n"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQValidationsFailedException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQValidationsFailedException", "kind": "class", "doc": "

Exception for when the data quality validations fail.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.exceptions.DQCheckpointsResultsException", "modulename": "lakehouse_engine.dq_processors.exceptions", "qualname": "DQCheckpointsResultsException", "kind": "class", "doc": "

Exception for when the checkpoint results parsing fails.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.dq_processors.validator", "modulename": "lakehouse_engine.dq_processors.validator", "kind": "module", "doc": "

Module containing the definition of a data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator", "kind": "class", "doc": "

Class containing the data quality validator.

\n"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.get_dq_validator", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.get_dq_validator", "kind": "function", "doc": "

Get a validator according to the specification.

\n\n

We use getattr to dynamically execute any expectation available.\ngetattr(validator, function) is similar to validator.function(). With this\napproach, we can execute any expectation supported.

\n\n

Args:\n context: the BaseDataContext containing the configurations for the data\n source and store backend.\n batch_request: run time batch request to be able to query underlying data.\n expectation_suite_name: name of the expectation suite.\n dq_functions: a list of DQFunctionSpec to consider in the expectation suite.\n critical_functions: list of critical expectations in the expectation suite.

\n\n

Returns:\n The validator with the expectation suite stored.

\n", "signature": "(\tcls,\tcontext: <function BaseDataContext>,\tbatch_request: great_expectations.core.batch.RuntimeBatchRequest,\texpectation_suite_name: str,\tdq_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec],\tcritical_functions: List[lakehouse_engine.core.definitions.DQFunctionSpec]) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.dq_processors.validator.Validator.tag_source_with_dq", "modulename": "lakehouse_engine.dq_processors.validator", "qualname": "Validator.tag_source_with_dq", "kind": "function", "doc": "

Tags the source dataframe with a new column having the DQ results.

\n\n

Args:\n source_pk: the primary key of the source data.\n source_df: the source dataframe to be tagged with DQ results.\n results_df: dq results dataframe.

\n\n

Returns: a dataframe tagged with the DQ results.

\n", "signature": "(\tcls,\tsource_pk: List[str],\tsource_df: pyspark.sql.dataframe.DataFrame,\tresults_df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine", "modulename": "lakehouse_engine.engine", "kind": "module", "doc": "

Contract of the lakehouse engine with all the available functions to be executed.

\n"}, {"fullname": "lakehouse_engine.engine.load_data", "modulename": "lakehouse_engine.engine", "qualname": "load_data", "kind": "function", "doc": "

Load data using the DataLoader algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_reconciliation", "modulename": "lakehouse_engine.engine", "qualname": "execute_reconciliation", "kind": "function", "doc": "

Execute the Reconciliator algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_dq_validation", "modulename": "lakehouse_engine.engine", "qualname": "execute_dq_validation", "kind": "function", "doc": "

Execute the DQValidator algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks or other\n apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_table", "modulename": "lakehouse_engine.engine", "qualname": "manage_table", "kind": "function", "doc": "

Manipulate tables/views using Table Manager algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.manage_files", "modulename": "lakehouse_engine.engine", "qualname": "manage_files", "kind": "function", "doc": "

Manipulate s3 files using File Manager algorithm.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.execute_sensor", "modulename": "lakehouse_engine.engine", "qualname": "execute_sensor", "kind": "function", "doc": "

Execute a sensor based on a Sensor Algorithm Configuration.

\n\n

A sensor is useful to check if an upstream system has new data.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n", "signature": "(acon_path: Optional[str] = None, acon: Optional[dict] = None) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.update_sensor_status", "modulename": "lakehouse_engine.engine", "qualname": "update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table.\nIt should be used to tell the system\nthat the sensor has processed all new data that was previously identified,\nhence updating the sensor status.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA,\nbut there might be scenarios - still to identify -\nwhere we can update the sensor status from/to different statuses.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.

\n", "signature": "(\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_query", "kind": "function", "doc": "

Generates a preprocess query to be used in a sensor configuration.

\n\n

Args:\n sensor_id: sensor id.\n filter_exp: expression to filter incoming new data.\n You can use the placeholders ?default_upstream_key and\n ?default_upstream_value, so that they can be replaced by the\n respective values in the control_db_table_name for this specific\n sensor_id.\n control_db_table_name: db.table to retrieve the last status change\n timestamp. This is only relevant for the jdbc sensor.\n upstream_key: the key of custom sensor information to control how to\n identify new data from the upstream (e.g., a time column in the\n upstream).\n upstream_value: the upstream value\n to identify new data from the upstream (e.g., the value of a time\n present in the upstream).\n upstream_table_name: value for custom sensor\n to query new data from the upstream.\n If None, we will set the default value,\n our sensor_new_data view.

\n\n

Return:\n The query string.

\n", "signature": "(\tsensor_id: str,\tfilter_exp: str = None,\tcontrol_db_table_name: str = None,\tupstream_key: str = None,\tupstream_value: str = None,\tupstream_table_name: str = None) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.generate_sensor_sap_logchain_query", "modulename": "lakehouse_engine.engine", "qualname": "generate_sensor_sap_logchain_query", "kind": "function", "doc": "

Generates a sensor query based on the SAP Logchain table.

\n\n

Args:\n chain_id: chain id to query the status on SAP.\n dbtable: db.table to retrieve the data to\n check if the SAP chain is already finished.\n status: status value that marks the SAP chain as successfully\n finished (default: the green status 'G').\n engine_table_name: table name exposed with the SAP LOGCHAIN data.\n This table will be used in the jdbc query.

\n\n

Return:\n The query string.

\n", "signature": "(\tchain_id: str,\tdbtable: str = 'SAPPHA.RSPCLOGCHAIN',\tstatus: str = 'G',\tengine_table_name: str = 'sensor_new_data') -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.engine.send_notification", "modulename": "lakehouse_engine.engine", "qualname": "send_notification", "kind": "function", "doc": "

Send a notification using a notifier.

\n\n

Args:\n args: arguments for the notifier.

\n", "signature": "(args: dict) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io", "modulename": "lakehouse_engine.io", "kind": "module", "doc": "

Input and Output package responsible for the behaviour of reading and writing.

\n"}, {"fullname": "lakehouse_engine.io.exceptions", "modulename": "lakehouse_engine.io.exceptions", "kind": "module", "doc": "

Package defining all the io custom exceptions.

\n"}, {"fullname": "lakehouse_engine.io.exceptions.IncrementalFilterInputNotFoundException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "IncrementalFilterInputNotFoundException", "kind": "class", "doc": "

Exception for when the input of an incremental filter is not found.

\n\n

This may occur when tables are being loaded in an incremental way, taking the increment\ndefinition out of a specific table, but the table does not exist yet, mainly\nbecause it probably has not been loaded for the first time yet.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.WrongIOFormatException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "WrongIOFormatException", "kind": "class", "doc": "

Exception for when a user provides a wrong I/O format.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.io.exceptions.NotSupportedException", "modulename": "lakehouse_engine.io.exceptions", "qualname": "NotSupportedException", "kind": "class", "doc": "

Exception for when a user provides an unsupported operation.

\n", "bases": "builtins.RuntimeError"}, {"fullname": "lakehouse_engine.io.reader", "modulename": "lakehouse_engine.io.reader", "kind": "module", "doc": "

Defines abstract reader behaviour.

\n"}, {"fullname": "lakehouse_engine.io.reader.Reader", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader", "kind": "class", "doc": "

Abstract Reader class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader.Reader.__init__", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.__init__", "kind": "function", "doc": "

Construct Reader instances.

\n\n

Args:\n input_spec: input specification for reading data.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.reader.Reader.read", "modulename": "lakehouse_engine.io.reader", "qualname": "Reader.read", "kind": "function", "doc": "

Abstract read method.

\n\n

Returns:\n A dataframe read according to the input specification.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.reader_factory", "modulename": "lakehouse_engine.io.reader_factory", "kind": "module", "doc": "

Module for reader factory.

\n"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory", "kind": "class", "doc": "

Class for reader factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.reader_factory.ReaderFactory.get_data", "modulename": "lakehouse_engine.io.reader_factory", "qualname": "ReaderFactory.get_data", "kind": "function", "doc": "

Get data according to the input specification following a factory pattern.

\n\n

Args:\n spec: input specification to get the data.

\n\n

Returns:\n A dataframe containing the data.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.InputSpec) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers", "modulename": "lakehouse_engine.io.readers", "kind": "module", "doc": "

Readers package to define reading behaviour.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "kind": "module", "doc": "

Module to define behaviour to read from dataframes.

\n"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader", "kind": "class", "doc": "

Class to read data from a dataframe.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.__init__", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.__init__", "kind": "function", "doc": "

Construct DataFrameReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.dataframe_reader.DataFrameReader.read", "modulename": "lakehouse_engine.io.readers.dataframe_reader", "qualname": "DataFrameReader.read", "kind": "function", "doc": "

Read data from a dataframe.

\n\n

Returns:\n A dataframe containing the data from a dataframe previously\n computed.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.file_reader", "modulename": "lakehouse_engine.io.readers.file_reader", "kind": "module", "doc": "

Module to define behaviour to read from files.

\n"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader", "kind": "class", "doc": "

Class to read from files.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.__init__", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.__init__", "kind": "function", "doc": "

Construct FileReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.file_reader.FileReader.read", "modulename": "lakehouse_engine.io.readers.file_reader", "qualname": "FileReader.read", "kind": "function", "doc": "

Read file data.

\n\n

Returns:\n A dataframe containing the data from the files.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "kind": "module", "doc": "

Module to define behaviour to read from JDBC sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader", "kind": "class", "doc": "

Class to read from JDBC source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.__init__", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.__init__", "kind": "function", "doc": "

Construct JDBCReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.jdbc_reader.JDBCReader.read", "modulename": "lakehouse_engine.io.readers.jdbc_reader", "qualname": "JDBCReader.read", "kind": "function", "doc": "

Read data from JDBC source.

\n\n

Returns:\n A dataframe containing the data from the JDBC source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "kind": "module", "doc": "

Module to define behaviour to read from Kafka.

\n"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader", "kind": "class", "doc": "

Class to read from Kafka.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.__init__", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.__init__", "kind": "function", "doc": "

Construct KafkaReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.kafka_reader.KafkaReader.read", "modulename": "lakehouse_engine.io.readers.kafka_reader", "qualname": "KafkaReader.read", "kind": "function", "doc": "

Read Kafka data.

\n\n

Returns:\n A dataframe containing the data from Kafka.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.query_reader", "modulename": "lakehouse_engine.io.readers.query_reader", "kind": "module", "doc": "

Module to define behaviour to read from a query.

\n"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader", "kind": "class", "doc": "

Class to read data from a query.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.__init__", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.__init__", "kind": "function", "doc": "

Construct QueryReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.query_reader.QueryReader.read", "modulename": "lakehouse_engine.io.readers.query_reader", "qualname": "QueryReader.read", "kind": "function", "doc": "

Read data from a query.

\n\n

Returns:\n A dataframe containing the data from the query.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP B4 sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader", "kind": "class", "doc": "

Class to read from SAP B4 source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.__init__", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.__init__", "kind": "function", "doc": "

Construct SAPB4Reader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_b4_reader.SAPB4Reader.read", "modulename": "lakehouse_engine.io.readers.sap_b4_reader", "qualname": "SAPB4Reader.read", "kind": "function", "doc": "

Read data from SAP B4 source.

\n\n

Returns:\n A dataframe containing the data from the SAP B4 source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "kind": "module", "doc": "

Module to define behaviour to read from SAP BW sources.

\n"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader", "kind": "class", "doc": "

Class to read from SAP BW source.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.__init__", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.__init__", "kind": "function", "doc": "

Construct SAPBWReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sap_bw_reader.SAPBWReader.read", "modulename": "lakehouse_engine.io.readers.sap_bw_reader", "qualname": "SAPBWReader.read", "kind": "function", "doc": "

Read data from SAP BW source.

\n\n

Returns:\n A dataframe containing the data from the SAP BW source.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "kind": "module", "doc": "

Module to define behaviour to read from SFTP.

\n"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader", "kind": "class", "doc": "

Class to read from SFTP.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.__init__", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.__init__", "kind": "function", "doc": "

Construct SFTPReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.sftp_reader.SFTPReader.read", "modulename": "lakehouse_engine.io.readers.sftp_reader", "qualname": "SFTPReader.read", "kind": "function", "doc": "

Read SFTP data.

\n\n

Returns:\n A dataframe containing the data from SFTP.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.readers.table_reader", "modulename": "lakehouse_engine.io.readers.table_reader", "kind": "module", "doc": "

Module to define behaviour to read from tables.

\n"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader", "kind": "class", "doc": "

Class to read data from a table.

\n", "bases": "lakehouse_engine.io.reader.Reader"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.__init__", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.__init__", "kind": "function", "doc": "

Construct TableReader instances.

\n\n

Args:\n input_spec: input specification.

\n", "signature": "(input_spec: lakehouse_engine.core.definitions.InputSpec)"}, {"fullname": "lakehouse_engine.io.readers.table_reader.TableReader.read", "modulename": "lakehouse_engine.io.readers.table_reader", "qualname": "TableReader.read", "kind": "function", "doc": "

Read data from a table.

\n\n

Returns:\n A dataframe containing the data from the table.

\n", "signature": "(self) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer", "modulename": "lakehouse_engine.io.writer", "kind": "module", "doc": "

Defines abstract writer behaviour.

\n"}, {"fullname": "lakehouse_engine.io.writer.Writer", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer", "kind": "class", "doc": "

Abstract Writer class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer.Writer.__init__", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.__init__", "kind": "function", "doc": "

Construct Writer instances.

\n\n

Args:\n output_spec: output specification to write data.\n df: dataframe to write.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict = None)"}, {"fullname": "lakehouse_engine.io.writer.Writer.write", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write", "kind": "function", "doc": "

Abstract write method.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.write_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.write_transformed_micro_batch", "kind": "function", "doc": "

Define how to write a streaming micro batch after transforming it.

\n\n

This function must define an inner function that manipulates a streaming batch,\nand then return that function. Look for concrete implementations of this\nfunction for more clarity.

\n\n

Args:\n kwargs: any keyword arguments.

\n\n

Returns:\n A function to be executed in the foreachBatch spark write method.
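For illustration only, a minimal sketch of the inner-function pattern described above (not the engine's concrete implementation; the output path and per-batch transformation are hypothetical):

    from pyspark.sql import DataFrame

    def example_micro_batch_writer(output_path: str):
        # return a function suitable for foreachBatch
        def write_batch(batch_df: DataFrame, batch_id: int) -> None:
            transformed = batch_df.dropDuplicates()  # any per-batch transformation
            transformed.write.format("delta").mode("append").save(output_path)
        return write_batch

    # usage: streaming_df.writeStream.foreachBatch(example_micro_batch_writer("/tmp/target")).start()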

\n", "signature": "(**kwargs: Any) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_transformed_micro_batch", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_transformed_micro_batch", "kind": "function", "doc": "

Get the result of the transformations applied to a micro batch dataframe.

\n\n

Args:\n    output_spec: output specification associated with the writer.\n    batch_df: batch dataframe (given from streaming foreachBatch).\n    batch_id: id of the batch (given from streaming foreachBatch).\n    data: list of all dfs generated on previous steps before writer\n    to be available on micro batch transforms.

\n\n

Returns:\n The transformed dataframe.

\n", "signature": "(\tcls,\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tbatch_df: pyspark.sql.dataframe.DataFrame,\tbatch_id: int,\tdata: OrderedDict) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.get_streaming_trigger", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.get_streaming_trigger", "kind": "function", "doc": "

Define which streaming trigger will be used.

\n\n

Args:\n output_spec: output specification.

\n\n

Returns:\n A dict containing streaming trigger.

\n", "signature": "(cls, output_spec: lakehouse_engine.core.definitions.OutputSpec) -> Dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer.Writer.run_micro_batch_dq_process", "modulename": "lakehouse_engine.io.writer", "qualname": "Writer.run_micro_batch_dq_process", "kind": "function", "doc": "

Run the data quality process in a streaming micro batch dataframe.

\n\n

Iterates over the specs and performs the checks or analysis depending on the\ndata quality specification provided in the configuration.

\n\n

Args:\n    df: the dataframe on which to run the dq process.\n    dq_spec: data quality specification.

\n\n

Returns:\n    The validated dataframe.

\n", "signature": "(\tdf: pyspark.sql.dataframe.DataFrame,\tdq_spec: List[lakehouse_engine.core.definitions.DQSpec]) -> pyspark.sql.dataframe.DataFrame:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writer_factory", "modulename": "lakehouse_engine.io.writer_factory", "kind": "module", "doc": "

Module for writer factory.

\n"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory", "kind": "class", "doc": "

Class for writer factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.io.writer_factory.WriterFactory.get_writer", "modulename": "lakehouse_engine.io.writer_factory", "qualname": "WriterFactory.get_writer", "kind": "function", "doc": "

Get a writer according to the output specification using a factory pattern.

\n\n

Args:\n    spec: output specification to write data.\n    df: dataframe to be written.\n    data: list of all dfs generated on previous steps before writer.

\n\n

Returns:\n Writer: writer that will write the data.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict) -> lakehouse_engine.io.writer.Writer:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers", "modulename": "lakehouse_engine.io.writers", "kind": "module", "doc": "

Package containing the writers responsible for writing data.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer", "modulename": "lakehouse_engine.io.writers.console_writer", "kind": "module", "doc": "

Module to define behaviour to write to console.

\n"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter", "kind": "class", "doc": "

Class to write data to console.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.__init__", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.__init__", "kind": "function", "doc": "

Construct ConsoleWriter instances.

\n\n

Args:\n    output_spec: output specification.\n    df: dataframe to be written.\n    data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.console_writer.ConsoleWriter.write", "modulename": "lakehouse_engine.io.writers.console_writer", "qualname": "ConsoleWriter.write", "kind": "function", "doc": "

Write data to console.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "kind": "module", "doc": "

Module to define behaviour to write to a dataframe.

\n"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter", "kind": "class", "doc": "

Class to write data to a dataframe.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.__init__", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.__init__", "kind": "function", "doc": "

Construct DataFrameWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.dataframe_writer.DataFrameWriter.write", "modulename": "lakehouse_engine.io.writers.dataframe_writer", "qualname": "DataFrameWriter.write", "kind": "function", "doc": "

Write data to a dataframe.

\n", "signature": "(self) -> Optional[OrderedDict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "kind": "module", "doc": "

Module to define the behaviour of delta merges.

\n"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter", "kind": "class", "doc": "

Class to merge data using delta lake.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.__init__", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.__init__", "kind": "function", "doc": "

Construct DeltaMergeWriter instances.

\n\n

Args:\n output_spec: output specification containing merge options and\n relevant information.\n df: the dataframe containing the new data to be merged.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.delta_merge_writer.DeltaMergeWriter.write", "modulename": "lakehouse_engine.io.writers.delta_merge_writer", "qualname": "DeltaMergeWriter.write", "kind": "function", "doc": "

Merge new data with current data.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.file_writer", "modulename": "lakehouse_engine.io.writers.file_writer", "kind": "module", "doc": "

Module to define behaviour to write to files.

\n"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter", "kind": "class", "doc": "

Class to write data to files.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.__init__", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.__init__", "kind": "function", "doc": "

Construct FileWriter instances.

\n\n

Args:\n    output_spec: output specification.\n    df: dataframe to be written.\n    data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.file_writer.FileWriter.write", "modulename": "lakehouse_engine.io.writers.file_writer", "qualname": "FileWriter.write", "kind": "function", "doc": "

Write data to files.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to JDBC targets.

\n"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter", "kind": "class", "doc": "

Class to write to JDBC targets.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.__init__", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.__init__", "kind": "function", "doc": "

Construct JDBCWriter instances.

\n\n

Args:\n    output_spec: output specification.\n    df: dataframe to be written.\n    data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.jdbc_writer.JDBCWriter.write", "modulename": "lakehouse_engine.io.writers.jdbc_writer", "qualname": "JDBCWriter.write", "kind": "function", "doc": "

Write data into JDBC target.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer", "modulename": "lakehouse_engine.io.writers.kafka_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to Kafka.

\n"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter", "kind": "class", "doc": "

Class to write to a Kafka target.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.__init__", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.__init__", "kind": "function", "doc": "

Construct KafkaWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.kafka_writer.KafkaWriter.write", "modulename": "lakehouse_engine.io.writers.kafka_writer", "qualname": "KafkaWriter.write", "kind": "function", "doc": "

Write data to Kafka.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.io.writers.table_writer", "modulename": "lakehouse_engine.io.writers.table_writer", "kind": "module", "doc": "

Module that defines the behaviour to write to tables.

\n"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter", "kind": "class", "doc": "

Class to write to a table.

\n", "bases": "lakehouse_engine.io.writer.Writer"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.__init__", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.__init__", "kind": "function", "doc": "

Construct TableWriter instances.

\n\n

Args:\n output_spec: output specification.\n df: dataframe to be written.\n data: list of all dfs generated on previous steps before writer.

\n", "signature": "(\toutput_spec: lakehouse_engine.core.definitions.OutputSpec,\tdf: pyspark.sql.dataframe.DataFrame,\tdata: OrderedDict)"}, {"fullname": "lakehouse_engine.io.writers.table_writer.TableWriter.write", "modulename": "lakehouse_engine.io.writers.table_writer", "qualname": "TableWriter.write", "kind": "function", "doc": "

Write data to a table.

\n\n

After the write operation we repair the table (e.g., update partitions).\nHowever, there's a caveat: the repair operation is never reached when the job\nruns in long-running streaming mode.\nTherefore, we recommend not using the TableWriter with formats other than\ndelta lake for those scenarios (as delta lake does not need msck repair).\nSo, you can: 1) use delta lake format for the table; or 2) use the FileWriter\nand run the repair with a certain frequency in a separate task of your\npipeline.
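As a sketch of option 2) above, the repair can be triggered from a separate scheduled task (the db/table name is hypothetical and a Hive-compatible metastore is assumed):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # refresh the partitions of a non-delta table written by a long-running streaming job
    spark.sql("MSCK REPAIR TABLE my_db.my_table")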

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators", "modulename": "lakehouse_engine.terminators", "kind": "module", "doc": "

Package to define algorithm terminators (e.g., vacuum, optimize, compute stats).

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor", "modulename": "lakehouse_engine.terminators.cdf_processor", "kind": "module", "doc": "

Defines change data feed processor behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor", "kind": "class", "doc": "

Change data feed processor class.

\n"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.expose_cdf", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.expose_cdf", "kind": "function", "doc": "

Expose CDF to external location.

\n\n

Args:\n spec: terminator specification.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.delete_old_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.delete_old_data", "kind": "function", "doc": "

Delete old data from cdf delta table.

\n\n

Args:\n spec: terminator specifications.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.cdf_processor.CDFProcessor.vacuum_cdf_data", "modulename": "lakehouse_engine.terminators.cdf_processor", "qualname": "CDFProcessor.vacuum_cdf_data", "kind": "function", "doc": "

Vacuum old data from cdf delta table.

\n\n

Args:\n spec: terminator specifications.

\n", "signature": "(cls, spec: lakehouse_engine.core.definitions.TerminatorSpec) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "kind": "module", "doc": "

Module with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer", "kind": "class", "doc": "

Class with dataset optimizer terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.dataset_optimizer.DatasetOptimizer.optimize_dataset", "modulename": "lakehouse_engine.terminators.dataset_optimizer", "qualname": "DatasetOptimizer.optimize_dataset", "kind": "function", "doc": "

Optimize a dataset based on a set of pre-conceived optimizations.

\n\n

Most of the time the dataset is a table, but it can also be a purely file-based one.

\n\n

Args:\n    db_table: database_name.table_name.\n    location: dataset/table filesystem location.\n    compute_table_stats: whether to compute table statistics or not.\n    vacuum: (delta lake tables only) whether to vacuum the delta lake\n    table or not.\n    vacuum_hours: (delta lake tables only) number of hours to consider\n    in the vacuum operation.\n    optimize: (delta lake tables only) whether to optimize the table or\n    not. Custom optimize parameters can be supplied through ExecEnv (Spark)\n    configs.\n    optimize_where: expression to use in the optimize function.\n    optimize_zorder_col_list: (delta lake tables only) list of\n    columns to consider in the zorder optimization process. Custom optimize\n    parameters can be supplied through ExecEnv (Spark) configs.\n    debug: flag indicating if we are just debugging this for local\n    tests and therefore want to pass through all the exceptions to perform\n    assertions in local tests.
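The delta lake maintenance operations referred to above roughly correspond to Spark SQL statements like the following sketch (table and column names are hypothetical, and Delta Lake must be available in the session):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    table = "my_db.my_table"  # hypothetical delta lake table
    spark.sql(f"OPTIMIZE {table} WHERE year = 2024 ZORDER BY (customer_id)")
    spark.sql(f"VACUUM {table} RETAIN 720 HOURS")
    spark.sql(f"ANALYZE TABLE {table} COMPUTE STATISTICS")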

\n", "signature": "(\tcls,\tdb_table: Optional[str] = None,\tlocation: Optional[str] = None,\tcompute_table_stats: bool = True,\tvacuum: bool = True,\tvacuum_hours: int = 720,\toptimize: bool = True,\toptimize_where: Optional[str] = None,\toptimize_zorder_col_list: Optional[List[str]] = None,\tdebug: bool = False) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier", "modulename": "lakehouse_engine.terminators.notifier", "kind": "module", "doc": "

Module with notification terminator.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier", "kind": "class", "doc": "

Abstract Notification class.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.__init__", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.__init__", "kind": "function", "doc": "

Construct Notification instances.

\n\n

Args:\n notification_spec: notification specification.

\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.create_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.create_notification", "kind": "function", "doc": "

Abstract create notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.send_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.send_notification", "kind": "function", "doc": "

Abstract send notification method.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier.Notifier.check_if_notification_is_failure_notification", "modulename": "lakehouse_engine.terminators.notifier", "qualname": "Notifier.check_if_notification_is_failure_notification", "kind": "function", "doc": "

Check if given notification is a failure notification.

\n\n

Args:\n spec: spec to validate if it is a failure notification.

\n\n

Returns:\n    A boolean indicating whether the notification is a failure notification.

\n", "signature": "(spec: lakehouse_engine.core.definitions.TerminatorSpec) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory", "modulename": "lakehouse_engine.terminators.notifier_factory", "kind": "module", "doc": "

Module for notifier factory.

\n"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory", "kind": "class", "doc": "

Class for notification factory.

\n", "bases": "abc.ABC"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.get_notifier", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.get_notifier", "kind": "function", "doc": "

Get a notifier according to the terminator specs using a factory.

\n\n

Args:\n spec: terminator specification.

\n\n

Returns:\n Notifier: notifier that will handle notifications.

\n", "signature": "(\tcls,\tspec: lakehouse_engine.core.definitions.TerminatorSpec) -> lakehouse_engine.terminators.notifier.Notifier:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifier_factory.NotifierFactory.generate_failure_notification", "modulename": "lakehouse_engine.terminators.notifier_factory", "qualname": "NotifierFactory.generate_failure_notification", "kind": "function", "doc": "

Check if it is necessary to send a failure notification and generate it.

\n\n

Args:\n    spec: list of terminator specs.\n    exception: exception that caused the failure.

\n", "signature": "(spec: list, exception: Exception) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers", "modulename": "lakehouse_engine.terminators.notifiers", "kind": "module", "doc": "

Notifications module.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "kind": "module", "doc": "

Module with email notifier.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier", "kind": "class", "doc": "

Email Notification class.

\n", "bases": "lakehouse_engine.terminators.notifier.Notifier"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.__init__", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.__init__", "kind": "function", "doc": "

Construct Email Notification instance.

\n\n

Args:\n notification_spec: notification specification.

\n", "signature": "(notification_spec: lakehouse_engine.core.definitions.TerminatorSpec)"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.create_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.create_notification", "kind": "function", "doc": "

Creates the notification to be sent.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.email_notifier.EmailNotifier.send_notification", "modulename": "lakehouse_engine.terminators.notifiers.email_notifier", "qualname": "EmailNotifier.send_notification", "kind": "function", "doc": "

Sends the notification by using a series of methods.

\n", "signature": "(self) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "kind": "module", "doc": "

Email notification templates.

\n"}, {"fullname": "lakehouse_engine.terminators.notifiers.notification_templates.NotificationsTemplates", "modulename": "lakehouse_engine.terminators.notifiers.notification_templates", "qualname": "NotificationsTemplates", "kind": "class", "doc": "

Templates for notifications.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "kind": "module", "doc": "

Defines terminator behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator", "kind": "class", "doc": "

Sensor Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.sensor_terminator.SensorTerminator.update_sensor_status", "modulename": "lakehouse_engine.terminators.sensor_terminator", "qualname": "SensorTerminator.update_sensor_status", "kind": "function", "doc": "

Update internal sensor status.

\n\n

Update the sensor status in the control table. It should be used to tell the\nsystem that the sensor has processed all new data that was previously\nidentified, hence updating the sensor status accordingly.\nUsually used to move from SensorStatus.ACQUIRED_NEW_DATA to\nSensorStatus.PROCESSED_NEW_DATA, but there might be scenarios - still\nto identify - where we can update the sensor status from/to different statuses.

\n\n

Args:\n sensor_id: sensor id.\n control_db_table_name: db.table to store sensor checkpoints.\n status: status of the sensor.\n assets: a list of assets that are considered as available to\n consume downstream after this sensor has status\n PROCESSED_NEW_DATA.

\n", "signature": "(\tcls,\tsensor_id: str,\tcontrol_db_table_name: str,\tstatus: str = 'PROCESSED_NEW_DATA',\tassets: List[str] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.spark_terminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "kind": "module", "doc": "

Defines terminator behaviour.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator", "kind": "class", "doc": "

Spark Terminator class.

\n"}, {"fullname": "lakehouse_engine.terminators.spark_terminator.SparkTerminator.terminate_spark", "modulename": "lakehouse_engine.terminators.spark_terminator", "qualname": "SparkTerminator.terminate_spark", "kind": "function", "doc": "

Terminate spark session.

\n", "signature": "(cls) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.terminators.terminator_factory", "modulename": "lakehouse_engine.terminators.terminator_factory", "kind": "module", "doc": "

Module with the factory pattern to return terminators.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory", "kind": "class", "doc": "

TerminatorFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.terminators.terminator_factory.TerminatorFactory.execute_terminator", "modulename": "lakehouse_engine.terminators.terminator_factory", "qualname": "TerminatorFactory.execute_terminator", "kind": "function", "doc": "

Execute a terminator following the factory pattern.

\n\n

Args:\n    spec: terminator specification.\n    df: dataframe to be used in the terminator. Needed when a\n    terminator requires a dataframe as input.

\n\n

Returns:\n Transformer function to be executed in .transform() spark function.

\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TerminatorSpec,\tdf: Optional[pyspark.sql.dataframe.DataFrame] = None) -> None:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers", "modulename": "lakehouse_engine.transformers", "kind": "module", "doc": "

Package to define transformers available in the lakehouse engine.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "kind": "module", "doc": "

Aggregators module.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators", "kind": "class", "doc": "

Class containing all aggregation functions.

\n"}, {"fullname": "lakehouse_engine.transformers.aggregators.Aggregators.get_max_value", "modulename": "lakehouse_engine.transformers.aggregators", "qualname": "Aggregators.get_max_value", "kind": "function", "doc": "

Get the maximum value of a given column of a dataframe.

\n\n

Args:\n input_col: name of the input column.\n output_col: name of the output column (defaults to \"latest\").

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(input_col: str, output_col: str = 'latest') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators", "modulename": "lakehouse_engine.transformers.column_creators", "kind": "module", "doc": "

Column creators transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators", "kind": "class", "doc": "

Class containing all functions that can create columns to add value.

\n"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_row_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_row_id", "kind": "function", "doc": "

Create a sequential but not consecutive id.

\n\n

Args:\n output_col: optional name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_auto_increment_id", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_auto_increment_id", "kind": "function", "doc": "

Create a sequential and consecutive id.

\n\n

Args:\n output_col: optional name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.
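To illustrate the difference between with_row_id and with_auto_increment_id, a sketch of the two usual Spark approaches (not necessarily the engine's exact implementation):

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.window import Window

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("a",), ("b",), ("c",)], ["col"])

    # sequential but not consecutive: increasing ids that may have gaps across partitions
    df_row_id = df.withColumn("lhe_row_id", F.monotonically_increasing_id())

    # sequential and consecutive: a global row_number (all data flows through one window)
    w = Window.orderBy(F.monotonically_increasing_id())
    df_auto_id = df.withColumn("lhe_row_id", F.row_number().over(w))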

\n", "signature": "(cls, output_col: str = 'lhe_row_id') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_creators.ColumnCreators.with_literals", "modulename": "lakehouse_engine.transformers.column_creators", "qualname": "ColumnCreators.with_literals", "kind": "function", "doc": "

Create columns given a map of column names and literal values (constants).

\n\n

Args:\n    literals: map of column names and literal values (constants).

\n\n

Returns:\n Callable: A function to be executed in the .transform() spark function.

\n", "signature": "(cls, literals: Dict[str, Any]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "kind": "module", "doc": "

Module with column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers", "kind": "class", "doc": "

Class containing column reshaping transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.cast", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.cast", "kind": "function", "doc": "

Cast specific columns into the designated type.

\n\n

Args:\n cols: dict with columns and respective target types.\n Target types need to have the exact name of spark types:\n https://spark.apache.org/docs/latest/sql-ref-datatypes.html

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.column_selector", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.column_selector", "kind": "function", "doc": "

Select specific columns with specific output aliases.

\n\n

Args:\n cols: dict with columns to select and respective aliases.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: collections.OrderedDict) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.flatten_schema", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.flatten_schema", "kind": "function", "doc": "

Flatten the schema of the dataframe.

\n\n

Args:\n max_level: level until which you want to flatten the schema.\n Default: None.\n shorten_names: whether to shorten the names of the prefixes\n of the fields being flattened or not. Default: False.\n alias: whether to define alias for the columns being flattened\n or not. Default: True.\n num_chars: number of characters to consider when shortening\n the names of the fields. Default: 7.\n ignore_cols: columns which you don't want to flatten.\n Default: None.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.explode_columns", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.explode_columns", "kind": "function", "doc": "

Explode columns with types like ArrayType and MapType.

\n\n

Afterwards, the flatten_schema transformation can be applied,\nfor example if we want to explode the map (as we explode a StructType)\nor to explode a StructType inside the array.\nWe recommend always specifying the columns you want to explode,\ninstead of exploding all columns.

\n\n

Args:\n explode_arrays: whether you want to explode array columns (True)\n or not (False). Default: False.\n array_cols_to_explode: array columns which you want to explode.\n If you don't specify it will get all array columns and explode them.\n Default: None.\n explode_maps: whether you want to explode map columns (True)\n or not (False). Default: False.\n map_cols_to_explode: map columns which you want to explode.\n If you don't specify it will get all map columns and explode them.\n Default: None.

\n\n

Returns:\n A function to be called in .transform() spark function.
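A sketch of what exploding an array and a map column looks like in plain PySpark (column names are hypothetical):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1, ["x", "y"], {"k": "v"})], ["id", "arr_col", "map_col"])

    # explode an array column: one output row per array element
    df_arrays = df.withColumn("arr_col", F.explode("arr_col"))

    # explode a map column: key and value become separate columns
    df_maps = df.select("id", F.explode("map_col").alias("map_key", "map_value"))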

\n", "signature": "(\tcls,\texplode_arrays: bool = False,\tarray_cols_to_explode: List[str] = None,\texplode_maps: bool = False,\tmap_cols_to_explode: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.with_expressions", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.with_expressions", "kind": "function", "doc": "

Execute Spark SQL expressions to create the specified columns.

\n\n

This function uses the Spark expr function:\nhttps://spark.apache.org/docs/latest/api/python/reference/api/\npyspark.sql.functions.expr.html

\n\n

Args:\n cols_and_exprs: dict with columns and respective expressions to compute\n (Spark SQL expressions).

\n\n

Returns:\n A function to be called in .transform() spark function.
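A sketch of how such Spark SQL expressions produce new columns (the expressions themselves are hypothetical):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(10.0, 3)], ["price", "qty"])

    cols_and_exprs = {"total": "price * qty", "is_bulk": "qty > 100"}
    for name, expression in cols_and_exprs.items():
        df = df.withColumn(name, F.expr(expression))  # compute each column from its SQL expression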

\n", "signature": "(cls, cols_and_exprs: Dict[str, str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.rename", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.rename", "kind": "function", "doc": "

Rename specific columns into the designated name.

\n\n

Args:\n    cols: dict with columns and respective target names.\n    escape_col_names: whether to escape column names (e.g. /BIC/COL1) or not.\n    If True it creates a column with the new name and drops the old one.\n    If False, uses the native withColumnRenamed Spark function. Default: True.

\n\n

Returns:\n Function to be called in .transform() spark function.
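A sketch of the two renaming strategies described above, using a /BIC/ style column name as an example:

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([(1,)], ["/BIC/COL1"])

    # escape_col_names=True style: add a column with the new name and drop the escaped one
    df_escaped = df.withColumn("col1", F.col("`/BIC/COL1`")).drop("/BIC/COL1")

    # escape_col_names=False style: rely on the native rename
    df_renamed = df.withColumnRenamed("/BIC/COL1", "col1")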

\n", "signature": "(cls, cols: Dict[str, str], escape_col_names: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro", "kind": "function", "doc": "

Select all attributes from avro.

\n\n

Args:\n schema: the schema string.\n key_col: the name of the key column.\n value_col: the name of the value column.\n options: extra options (e.g., mode: \"PERMISSIVE\").\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.

\n\n

Returns:\n Function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tschema: str = None,\tkey_col: str = 'key',\tvalue_col: str = 'value',\toptions: dict = None,\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_avro_with_registry", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_avro_with_registry", "kind": "function", "doc": "

Select all attributes from avro using a schema registry.

\n\n

Args:\n schema_registry: the url to the schema registry.\n value_schema: the name of the value schema entry in the schema registry.\n value_col: the name of the value column.\n key_schema: the name of the key schema entry in the schema\n registry. Default: None.\n key_col: the name of the key column.\n expand_key: whether you want to expand the content inside the key\n column or not. Default: false.\n expand_value: whether you want to expand the content inside the value\n column or not. Default: true.

\n\n

Returns:\n Function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tschema_registry: str,\tvalue_schema: str,\tvalue_col: str = 'value',\tkey_schema: str = None,\tkey_col: str = 'key',\texpand_key: bool = False,\texpand_value: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.from_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.from_json", "kind": "function", "doc": "

Convert a json string into a json column (struct).

\n\n

The new json column can be added to the existing columns (default) or it can\nreplace all the others, being the only one to output. The new column gets the\nsame name as the original one suffixed with '_json'.

\n\n

Args:\n    input_col: name of the column containing the json string.\n    schema_path: path to the StructType schema (spark schema).\n    schema: dict with the StructType schema (spark schema).\n    json_options: options to parse the json value.\n    drop_all_cols: whether to drop all the input columns or not.\n    Defaults to False.

\n\n

Returns:\n A function to be called in .transform() spark function.
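A sketch of the underlying conversion with pyspark's from_json (schema and column names are hypothetical):

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.types import StructType, StructField, StringType, IntegerType

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([('{"name": "a", "age": 1}',)], ["payload"])

    schema = StructType([StructField("name", StringType()), StructField("age", IntegerType())])
    # the new struct column keeps the original name suffixed with '_json'
    df_json = df.withColumn("payload_json", F.from_json("payload", schema))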

\n", "signature": "(\tcls,\tinput_col: str,\tschema_path: Optional[str] = None,\tschema: Optional[dict] = None,\tjson_options: Optional[dict] = None,\tdrop_all_cols: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.column_reshapers.ColumnReshapers.to_json", "modulename": "lakehouse_engine.transformers.column_reshapers", "qualname": "ColumnReshapers.to_json", "kind": "function", "doc": "

Convert dataframe columns into a json value.

\n\n

Args:\n in_cols: name(s) of the input column(s).\n Example values:\n \"*\" - all\n columns; \"my_col\" - one column named \"my_col\";\n \"my_col1, my_col2\" - two columns.\n out_col: name of the output column.\n json_options: options to parse the json value.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tin_cols: List[str],\tout_col: str,\tjson_options: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers", "modulename": "lakehouse_engine.transformers.condensers", "kind": "module", "doc": "

Condensers module.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers", "kind": "class", "doc": "

Class containing all the functions to condense data for later merges.

\n"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.condense_record_mode_cdc", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.condense_record_mode_cdc", "kind": "function", "doc": "

Condense Change Data Capture (CDC) based on record_mode strategy.

\n\n

This CDC data is particularly seen in some CDC enabled systems. Other systems\nmay have different CDC strategies.

\n\n

Args:\n    business_key: The business key (logical primary key) of the data.\n    ranking_key_desc: In this type of CDC condensation the data needs to be\n    ordered in a descending way, using the columns specified in this\n    parameter.\n    ranking_key_asc: In this type of CDC condensation the data needs to be\n    ordered in an ascending way, using the columns specified in\n    this parameter.\n    record_mode_col: Name of the record mode input_col.\n    valid_record_modes: Depending on the context, not all record modes may be\n    considered for condensation. Use this parameter to skip those.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(\tcls,\tbusiness_key: List[str],\trecord_mode_col: str,\tvalid_record_modes: List[str],\tranking_key_desc: Optional[List[str]] = None,\tranking_key_asc: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.condensers.Condensers.group_and_rank", "modulename": "lakehouse_engine.transformers.condensers", "qualname": "Condensers.group_and_rank", "kind": "function", "doc": "

Condense data based on a simple group by + take latest mechanism.

\n\n

Args:\n    group_key: list of column names to use in the group by.\n    ranking_key: the data needs to be ordered in a descending way using the\n    columns specified in this parameter.\n    descending: if the ranking considers descending order or not. Defaults to\n    True.

\n\n

Returns:\n A function to be executed in the .transform() spark function.
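A sketch of the group by + take latest mechanism in plain PySpark (column names are hypothetical):

    from pyspark.sql import SparkSession, functions as F
    from pyspark.sql.window import Window

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame(
        [("k1", 1, "old"), ("k1", 2, "new")], ["key", "version", "value"]
    )

    # keep only the latest record per key, ordering by the ranking key descending
    w = Window.partitionBy("key").orderBy(F.col("version").desc())
    latest = df.withColumn("rn", F.row_number().over(w)).filter("rn = 1").drop("rn")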

\n", "signature": "(\tcls,\tgroup_key: List[str],\tranking_key: List[str],\tdescending: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.custom_transformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "kind": "module", "doc": "

Custom transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers", "kind": "class", "doc": "

Class representing custom transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.custom_transformers.CustomTransformers.custom_transformation", "modulename": "lakehouse_engine.transformers.custom_transformers", "qualname": "CustomTransformers.custom_transformation", "kind": "function", "doc": "

Execute a custom transformation provided by the user.

\n\n

This transformer can be very useful whenever the user cannot use our provided\ntransformers, or they want to write complex logic in the transform step of the\nalgorithm.

\n\n

Attention!!! Please bear in mind that the custom_transformer function provided\nas argument needs to receive a DataFrame and return a DataFrame, because that is\nhow Spark's .transform method is able to chain the transformations.\nExample:\n    def my_custom_logic(df: DataFrame) -> DataFrame:

\n\n

Args:\n custom_transformer: custom transformer function. A python function with all\n required pyspark logic provided by the user.

\n\n

Returns:\n    Callable: the same function provided as parameter, in order to be called\n    later in the TransformerFactory.
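Completing the truncated example above, a minimal sketch of a user-provided custom transformer (the filtering logic and column names are hypothetical):

    from pyspark.sql import SparkSession, DataFrame, functions as F

    def my_custom_logic(df: DataFrame) -> DataFrame:
        # any user-provided pyspark logic: a DataFrame comes in, a DataFrame goes out
        return df.filter(F.col("status") == "active").withColumn("processed", F.lit(True))

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("active",), ("inactive",)], ["status"])
    result = df.transform(my_custom_logic)  # this is how Spark chains such transformations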

\n", "signature": "(custom_transformer: Callable) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers", "modulename": "lakehouse_engine.transformers.data_maskers", "kind": "module", "doc": "

Module with data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers", "kind": "class", "doc": "

Class containing data masking transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.hash_masker", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.hash_masker", "kind": "function", "doc": "

Mask specific columns using a hashing approach.

\n\n

Args:\n cols: list of column names to mask.\n approach: hashing approach. Defaults to 'SHA'. There's \"MURMUR3\" as well.\n num_bits: number of bits of the SHA approach. Only applies to SHA approach.\n suffix: suffix to apply to new column name. Defaults to \"_hash\".\n Note: you can pass an empty suffix to have the original column replaced.

\n\n

Returns:\n A function to be called in .transform() spark function.
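A sketch of the SHA-based masking in plain PySpark (column name and suffix follow the defaults described above):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("john@example.com",)], ["email"])

    # SHA approach with 256 bits; F.hash() would be the Murmur3-style alternative
    df_masked = df.withColumn("email_hash", F.sha2(F.col("email"), 256))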

\n", "signature": "(\tcls,\tcols: List[str],\tapproach: str = 'SHA',\tnum_bits: int = 256,\tsuffix: str = '_hash') -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.data_maskers.DataMaskers.column_dropper", "modulename": "lakehouse_engine.transformers.data_maskers", "qualname": "DataMaskers.column_dropper", "kind": "function", "doc": "

Drop specific columns.

\n\n

Args:\n cols: list of column names to drop.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, cols: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers", "modulename": "lakehouse_engine.transformers.date_transformers", "kind": "module", "doc": "

Module containing date transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers", "kind": "class", "doc": "

Class with a set of transformers to transform dates in several forms.

\n"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.add_current_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.add_current_date", "kind": "function", "doc": "

Add column with current date.

\n\n

The current date comes from the driver as a constant, not from every executor.

\n\n

Args:\n output_col: name of the output column.

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(output_col: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_date", "kind": "function", "doc": "

Convert multiple string columns with a source format into dates.

\n\n

Args:\n    cols: list of names of the string columns to convert.\n    source_format: dates source format (e.g., yyyy-MM-dd). Check here:\n    https://docs.oracle.com/javase/10/docs/api/java/time/format/\n    DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.convert_to_timestamp", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.convert_to_timestamp", "kind": "function", "doc": "

Convert multiple string columns with a source format into timestamps.

\n\n

Args:\n cols: list of names of the string columns to convert.\n source_format: dates source format (e.g., MM-dd-yyyy HH:mm:ss.SSS). Check\n here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], source_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.format_date", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.format_date", "kind": "function", "doc": "

Convert multiple date/timestamp columns into strings with the target format.

\n\n

Args:\n    cols: list of names of the date/timestamp columns to convert.\n    target_format: strings target format (e.g., yyyy-MM-dd). Check here:\n    https://docs.oracle.com/javase/10/docs/api/java/time/format/\n    DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.
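A sketch of the three date conversions above with plain PySpark functions (formats and column names are hypothetical):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    df = spark.createDataFrame([("2024-01-31", "01-31-2024 10:15:00.000")], ["d", "ts"])

    df = (
        df.withColumn("d", F.to_date("d", "yyyy-MM-dd"))                     # string -> date
        .withColumn("ts", F.to_timestamp("ts", "MM-dd-yyyy HH:mm:ss.SSS"))   # string -> timestamp
        .withColumn("d_str", F.date_format("d", "yyyy/MM/dd"))               # date -> formatted string
    )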

\n", "signature": "(cols: List[str], target_format: Optional[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.date_transformers.DateTransformers.get_date_hierarchy", "modulename": "lakehouse_engine.transformers.date_transformers", "qualname": "DateTransformers.get_date_hierarchy", "kind": "function", "doc": "

Create day/month/week/quarter/year hierarchy for the provided date columns.

\n\n

Uses Spark's extract function.

\n\n

Args:\n cols: list of names of the date columns to create the hierarchy.\n formats: dict with the correspondence between the hierarchy and the format\n to apply.\n Example: {\n \"year\": \"year\",\n \"month\": \"month\",\n \"day\": \"day\",\n \"week\": \"week\",\n \"quarter\": \"quarter\"\n }\n Check here: https://docs.oracle.com/javase/10/docs/api/java/time/format/\n DateTimeFormatter.html

\n\n

Returns:\n A function to be executed in the .transform() spark function.

\n", "signature": "(cols: List[str], formats: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.exceptions", "modulename": "lakehouse_engine.transformers.exceptions", "kind": "module", "doc": "

Module for all the transformers exceptions.

\n"}, {"fullname": "lakehouse_engine.transformers.exceptions.WrongArgumentsException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "WrongArgumentsException", "kind": "class", "doc": "

Exception for when a user provides wrong arguments to a transformer.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.exceptions.UnsupportedStreamingTransformerException", "modulename": "lakehouse_engine.transformers.exceptions", "qualname": "UnsupportedStreamingTransformerException", "kind": "class", "doc": "

Exception for when a user requests a transformer not supported in streaming.

\n", "bases": "builtins.Exception"}, {"fullname": "lakehouse_engine.transformers.filters", "modulename": "lakehouse_engine.transformers.filters", "kind": "module", "doc": "

Module containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters", "kind": "class", "doc": "

Class containing the filters transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.incremental_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.incremental_filter", "kind": "function", "doc": "

Incrementally filter a dataframe given an increment logic.

\n\n

This logic can either be an increment value or an increment dataframe from which\nto get the latest value. By default the operator for the filtering process\nis greater or equal, to cover cases where we receive late arriving data not covered\nin a previous load. You can change greater_or_equal to false to use greater,\nwhen you trust the source will never output more data with the increment after\nyou have loaded the data (e.g., you will never load data while the source is still\ndumping data, which may cause you to get an incomplete picture of the last\narrived data).

\n\n

Args:\n    input_col: input column name.\n    increment_value: value to which to filter the data, considering the\n    provided input_col.\n    increment_df: a dataframe to get the increment value from.\n    You either specify this or the increment_value (this takes precedence).\n    This is a good approach to get the latest value from a given dataframe\n    that was read and apply that value as a filter here. In this way you can\n    perform incremental loads based on the last value of a given dataframe\n    (e.g., table or file based). Can be used together with the\n    get_max_value transformer to accomplish these incremental based loads.\n    See our append load feature tests to see how to provide an acon for\n    incremental loads, taking advantage of the scenario explained here.\n    increment_col: name of the column from which to get the increment\n    value (when using the increment_df approach). This assumes there's\n    only one row in the increment_df, which is why it is a good idea to use it\n    together with the get_max_value transformer. Defaults to \"latest\"\n    because that's the default output column name provided by the\n    get_max_value transformer.\n    greater_or_equal: if filtering should be done by also including the\n    increment value or not (useful for scenarios where you are performing\n    increment loads but still want to include data considering the increment\n    value, and not only values greater than that increment. Examples may\n    include scenarios where you already loaded data including those values,\n    but the source produced more data containing those values).\n    Defaults to false.

\n\n

Returns:\n A function to be called in .transform() spark function.
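A sketch of the increment_df approach combined with a get_max_value style aggregation (table and column names are hypothetical):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    source_df = spark.createDataFrame([(1, "2024-01-01"), (2, "2024-01-02")], ["id", "load_date"])
    target_df = spark.createDataFrame([(1, "2024-01-01")], ["id", "load_date"])

    # one-row dataframe holding the latest value already loaded (get_max_value style)
    increment_df = target_df.agg(F.max("load_date").alias("latest"))
    latest = increment_df.collect()[0]["latest"]

    # greater_or_equal=True keeps late-arriving data for the same increment value
    incremental_df = source_df.filter(F.col("load_date") >= latest)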

\n", "signature": "(\tcls,\tinput_col: str,\tincrement_value: Optional[Any] = None,\tincrement_df: Optional[pyspark.sql.dataframe.DataFrame] = None,\tincrement_col: str = 'latest',\tgreater_or_equal: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.expression_filter", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.expression_filter", "kind": "function", "doc": "

Filter a dataframe based on an expression.

\n\n

Args:\n exp: filter expression.

\n\n

Returns:\n A function to be called in .transform() spark function.
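
A short illustrative sketch (df is an assumed Spark DataFrame with amount and quantity columns):

    from lakehouse_engine.transformers.filters import Filters

    # keep only rows matching the SQL filter expression
    df_filtered = df.transform(Filters.expression_filter('amount > 0 and quantity is not null'))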

\n", "signature": "(exp: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.column_filter_exp", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.column_filter_exp", "kind": "function", "doc": "

Filter a dataframe's columns based on a list of SQL expressions.

\n\n

Args:\n exp: column filter expressions.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(exp: List[str]) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.filters.Filters.drop_duplicate_rows", "modulename": "lakehouse_engine.transformers.filters", "qualname": "Filters.drop_duplicate_rows", "kind": "function", "doc": "

Drop duplicate rows using spark function dropDuplicates().

\n\n

This transformer can be used with or without arguments.\nThe provided argument needs to be a list of columns.\nFor example: [\"Name\", \"VAT\"] will drop duplicate records within\nthe \"Name\" and \"VAT\" columns.\nIf the transformer is used without providing any columns list, or providing\nan empty list such as [], the result will be the same as using\nthe distinct() pyspark function. If the watermark dict is present, it will\nensure that the drop operation will apply to rows within the watermark timeline\nwindow.

\n\n

Args:\n cols: column names.\n watermarker: properties to apply watermarker to the transformer.

\n\n

Returns:\n A function to be called in .transform() spark function.
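
An illustrative sketch (df is an assumed Spark DataFrame with Name and VAT columns):

    from lakehouse_engine.transformers.filters import Filters

    # drop rows that are duplicated within the Name and VAT columns
    deduped_df = df.transform(Filters.drop_duplicate_rows(cols=['Name', 'VAT']))

    # without arguments the result is equivalent to distinct()
    distinct_df = df.transform(Filters.drop_duplicate_rows())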

\n", "signature": "(cols: List[str] = None, watermarker: dict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.joiners", "modulename": "lakehouse_engine.transformers.joiners", "kind": "module", "doc": "

Module with join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners", "kind": "class", "doc": "

Class containing join transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.joiners.Joiners.join", "modulename": "lakehouse_engine.transformers.joiners", "qualname": "Joiners.join", "kind": "function", "doc": "

Join two dataframes based on specified type and columns.

\n\n

Some stream-to-stream joins are only possible if you apply a watermark, so this\nmethod also provides a parameter to enable the watermarking specification.

\n\n

Args:\n left_df_alias: alias of the first dataframe.\n join_with: right dataframe.\n right_df_alias: alias of the second dataframe.\n join_condition: condition to join dataframes.\n join_type: type of join. Defaults to inner.\n Available values: inner, cross, outer, full, full outer,\n left, left outer, right, right outer, semi,\n left semi, anti, and left anti.\n broadcast_join: whether to perform a broadcast join or not.\n select_cols: list of columns to select at the end.\n watermarker: properties to apply watermarking.

\n\n

Returns:\n A function to be called in .transform() spark function.
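
An illustrative sketch (left_df and right_df are assumed Spark DataFrames sharing an id column):

    from lakehouse_engine.transformers.joiners import Joiners

    # left join the two dataframes on id, using the default aliases a and b
    joined_df = left_df.transform(
        Joiners.join(
            join_with=right_df,
            join_condition='a.id = b.id',
            join_type='left',
            select_cols=['a.*', 'b.amount'],
        )
    )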

\n", "signature": "(\tcls,\tjoin_with: pyspark.sql.dataframe.DataFrame,\tjoin_condition: str,\tleft_df_alias: str = 'a',\tright_df_alias: str = 'b',\tjoin_type: str = 'inner',\tbroadcast_join: bool = True,\tselect_cols: Optional[List[str]] = None,\twatermarker: Optional[dict] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.null_handlers", "modulename": "lakehouse_engine.transformers.null_handlers", "kind": "module", "doc": "

Module with null handlers transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers", "kind": "class", "doc": "

Class containing null handler transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.null_handlers.NullHandlers.replace_nulls", "modulename": "lakehouse_engine.transformers.null_handlers", "qualname": "NullHandlers.replace_nulls", "kind": "function", "doc": "

Replace nulls in a dataframe.

\n\n

Args:\n replace_on_nums: whether to replace nulls on numeric columns.\n Applies to ints, longs and floats.\n default_num_value: default integer value to use as replacement.\n replace_on_strings: whether to replace nulls on string columns.\n default_string_value: default string value to use as replacement.\n subset_cols: list of columns in which to replace nulls. If not\n provided, all nulls in all columns will be replaced as specified.

\n\n

Returns:\n A function to be called in .transform() spark function.
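
A minimal sketch (df and the chosen columns/values are assumptions):

    from lakehouse_engine.transformers.null_handlers import NullHandlers

    # replace numeric nulls with -1 and string nulls with N/A, only in the listed columns
    cleaned_df = df.transform(
        NullHandlers.replace_nulls(
            default_num_value=-1,
            default_string_value='N/A',
            subset_cols=['amount', 'country'],
        )
    )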

\n", "signature": "(\tcls,\treplace_on_nums: bool = True,\tdefault_num_value: int = -999,\treplace_on_strings: bool = True,\tdefault_string_value: str = 'UNKNOWN',\tsubset_cols: List[str] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "kind": "module", "doc": "

Optimizers module.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers", "kind": "class", "doc": "

Class containing all the functions that can provide optimizations.

\n"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.cache", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.cache", "kind": "function", "doc": "

Caches the current dataframe.

\n\n

The default storage level used is MEMORY_AND_DISK.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.persist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.persist", "kind": "function", "doc": "

Caches the current dataframe with a specific StorageLevel.

\n\n

Args:\n storage_level: the type of StorageLevel. Defaults to MEMORY_AND_DISK_DESER.\n More options here: https://spark.apache.org/docs/latest/api/python/\n reference/api/pyspark.StorageLevel.html

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, storage_level: str = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.optimizers.Optimizers.unpersist", "modulename": "lakehouse_engine.transformers.optimizers", "qualname": "Optimizers.unpersist", "kind": "function", "doc": "

Removes the dataframe from the disk and memory.

\n\n

Args:\n blocking: whether to block until all the data blocks are\n removed from disk/memory or run asynchronously.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, blocking: bool = False) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.regex_transformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "kind": "module", "doc": "

Regex transformers module.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers", "kind": "class", "doc": "

Class containing all regex functions.

\n"}, {"fullname": "lakehouse_engine.transformers.regex_transformers.RegexTransformers.with_regex_value", "modulename": "lakehouse_engine.transformers.regex_transformers", "qualname": "RegexTransformers.with_regex_value", "kind": "function", "doc": "

Get the result of applying a regex to an input column (via regexp_extract).

\n\n

Args:\n input_col: name of the input column.\n output_col: name of the output column.\n regex: regular expression.\n drop_input_col: whether to drop input_col or not.\n idx: index to return.

\n\n

Returns:\n A function to be executed in the .transform() spark function.
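
A hedged sketch (df and its raw_code column are assumptions for illustration):

    from lakehouse_engine.transformers.regex_transformers import RegexTransformers

    # extract the digits from raw_code into a new column named code_number
    df_out = df.transform(
        RegexTransformers.with_regex_value(
            input_col='raw_code',
            output_col='code_number',
            regex='([0-9]+)',
            drop_input_col=True,
        )
    )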

\n", "signature": "(\tinput_col: str,\toutput_col: str,\tregex: str,\tdrop_input_col: bool = False,\tidx: int = 1) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "kind": "module", "doc": "

Module with repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners", "kind": "class", "doc": "

Class containing repartitioners transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.coalesce", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.coalesce", "kind": "function", "doc": "

Coalesce a dataframe into n partitions.

\n\n

Args:\n num_partitions: num of partitions to coalesce.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(cls, num_partitions: int) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.repartitioners.Repartitioners.repartition", "modulename": "lakehouse_engine.transformers.repartitioners", "qualname": "Repartitioners.repartition", "kind": "function", "doc": "

Repartition a dataframe into n partitions.

\n\n

If num_partitions is provided, repartitioning happens based on the provided\nnumber; otherwise, it happens based on the values of the provided cols (columns).

\n\n

Args:\n num_partitions: num of partitions to repartition.\n cols: list of columns to use for repartitioning.

\n\n

Returns:\n A function to be called in .transform() spark function.
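
An illustrative sketch of both modes (df is an assumed Spark DataFrame):

    from lakehouse_engine.transformers.repartitioners import Repartitioners

    # repartition into a fixed number of partitions...
    df_by_number = df.transform(Repartitioners.repartition(num_partitions=200))

    # ...or repartition by the values of specific columns
    df_by_cols = df.transform(Repartitioners.repartition(cols=['year', 'month']))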

\n", "signature": "(\tcls,\tnum_partitions: Optional[int] = None,\tcols: Optional[List[str]] = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.transformer_factory", "modulename": "lakehouse_engine.transformers.transformer_factory", "kind": "module", "doc": "

Module with the factory pattern to return transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory", "kind": "class", "doc": "

TransformerFactory class following the factory pattern.

\n"}, {"fullname": "lakehouse_engine.transformers.transformer_factory.TransformerFactory.get_transformer", "modulename": "lakehouse_engine.transformers.transformer_factory", "qualname": "TransformerFactory.get_transformer", "kind": "function", "doc": "

Get a transformer following the factory pattern.

\n\n

Args:\n spec: transformer specification (individual transformation... not to be\n confused with list of all transformations).\n data: ordered dict of dataframes to be transformed. Needed when a\n transformer requires more than one dataframe as input.

\n\n

Returns:\n Transformer function to be executed in .transform() spark function.

\n", "signature": "(\tspec: lakehouse_engine.core.definitions.TransformerSpec,\tdata: OrderedDict = None) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions", "modulename": "lakehouse_engine.transformers.unions", "kind": "module", "doc": "

Module with union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions", "kind": "class", "doc": "

Class containing union transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union", "kind": "function", "doc": "

Union dataframes, resolving columns by position (not by name).

\n\n

Args:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.

\n\n

Returns:\n A function to be called in .transform() spark function.

\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.unions.Unions.union_by_name", "modulename": "lakehouse_engine.transformers.unions", "qualname": "Unions.union_by_name", "kind": "function", "doc": "

Union dataframes, resolving columns by name (not by position).

\n\n

Args:\n union_with: list of dataframes to union.\n deduplication: whether to perform deduplication of elements or not.\n allow_missing_columns: allow the union of DataFrames with different\n schemas.

\n\n

Returns:\n A function to be called in .transform() spark function.
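
A minimal sketch (df_a, df_b and df_c are assumed Spark DataFrames with overlapping schemas):

    from lakehouse_engine.transformers.unions import Unions

    # union by column name, tolerating missing columns and deduplicating the result
    unioned_df = df_a.transform(
        Unions.union_by_name(union_with=[df_b, df_c], allow_missing_columns=True)
    )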

\n", "signature": "(\tcls,\tunion_with: List[pyspark.sql.dataframe.DataFrame],\tdeduplication: bool = True,\tallow_missing_columns: bool = True) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.transformers.watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "kind": "module", "doc": "

Watermarker module.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker", "kind": "class", "doc": "

Class containing all watermarker transformers.

\n"}, {"fullname": "lakehouse_engine.transformers.watermarker.Watermarker.with_watermark", "modulename": "lakehouse_engine.transformers.watermarker", "qualname": "Watermarker.with_watermark", "kind": "function", "doc": "

Get the dataframe with watermarker defined.

\n\n

Args:\n watermarker_column: name of the input column to be considered for\n the watermarking. Note: it must be a timestamp.\n watermarker_time: time window to define the watermark value.

\n\n

Returns:\n A function to be executed on other transformers.
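
A hedged sketch, assuming the returned callable maps a streaming DataFrame to the same DataFrame with the watermark applied (stream_df and event_time are illustrative assumptions):

    from lakehouse_engine.transformers.watermarker import Watermarker

    # define a 10 minutes watermark on the event_time timestamp column
    watermarked_df = stream_df.transform(
        Watermarker.with_watermark(watermarker_column='event_time', watermarker_time='10 minutes')
    )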

\n", "signature": "(watermarker_column: str, watermarker_time: str) -> Callable:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils", "modulename": "lakehouse_engine.utils", "kind": "module", "doc": "

Utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs", "modulename": "lakehouse_engine.utils.configs", "kind": "module", "doc": "

Config utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils", "modulename": "lakehouse_engine.utils.configs.config_utils", "kind": "module", "doc": "

Module to read configurations.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils", "kind": "class", "doc": "

Config utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_acon", "kind": "function", "doc": "

Get acon based on a filesystem path or on a dict.

\n\n

Args:\n acon_path: path of the acon (algorithm configuration) file.\n acon: acon provided directly through python code (e.g., notebooks\n or other apps).

\n\n

Returns:\n Dict representation of an acon.
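
A minimal sketch of the two ways of supplying an acon (the path and dict contents are illustrative assumptions):

    from lakehouse_engine.utils.configs.config_utils import ConfigUtils

    # read the acon from a file in a (cloud) file system...
    acon = ConfigUtils.get_acon(acon_path='s3://my-bucket/acons/my_acon.json')

    # ...or pass it directly as a dict, e.g. from a notebook
    acon = ConfigUtils.get_acon(acon={'input_specs': [], 'transform_specs': [], 'output_specs': []})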

\n", "signature": "(\tcls,\tacon_path: Optional[str] = None,\tacon: Optional[dict] = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.get_config", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.get_config", "kind": "function", "doc": "

Get Lakehouse Engine configurations.

\n\n

Returns:\n A dictionary with the engine configurations.

\n", "signature": "() -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_json_acon", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_json_acon", "kind": "function", "doc": "

Read an acon (algorithm configuration) file.

\n\n

Args:\n path: path to the acon file.

\n\n

Returns:\n The acon file content as a dict.

\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.configs.config_utils.ConfigUtils.read_sql", "modulename": "lakehouse_engine.utils.configs.config_utils", "qualname": "ConfigUtils.read_sql", "kind": "function", "doc": "

Read a DDL file in Spark SQL format from a cloud object storage system.

\n\n

Args:\n path: path to the SQL (DDL) file.

\n\n

Returns:\n Content of the SQL file.

\n", "signature": "(path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "kind": "module", "doc": "

Utilities for databricks operations.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils", "kind": "class", "doc": "

Databricks utilities class.

\n"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_db_utils", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_db_utils", "kind": "function", "doc": "

Get db utils on databricks.

\n\n

Args:\n spark: spark session.

\n\n

Returns:\n Dbutils from databricks.

\n", "signature": "(spark: pyspark.sql.session.SparkSession) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.databricks_utils.DatabricksUtils.get_databricks_job_information", "modulename": "lakehouse_engine.utils.databricks_utils", "qualname": "DatabricksUtils.get_databricks_job_information", "kind": "function", "doc": "

Get notebook context from running acon.

\n\n

Returns:\n A tuple containing databricks job information taken from the notebook context.

\n", "signature": "() -> Tuple[str, str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.expectations_utils", "modulename": "lakehouse_engine.utils.expectations_utils", "kind": "module", "doc": "

Utilities to be used by custom expectations.

\n"}, {"fullname": "lakehouse_engine.utils.expectations_utils.validate_result", "modulename": "lakehouse_engine.utils.expectations_utils", "qualname": "validate_result", "kind": "function", "doc": "

Validates the test results of the custom expectations.

\n\n

If you need to make additional validations on your custom expectation\nand/or require additional fields to be returned you can add them before\ncalling this function. The partial_success and partial_result\noptional parameters can be used to pass the result of additional\nvalidations and add more information to the result key of the\nreturned dict respectively.

\n\n

Args:\n expectation: Expectation to validate.\n configuration: Configuration used in the test.\n metrics: Test result metrics.\n partial_success: Result of validations done before calling this method.\n partial_result: Extra fields to be returned to the user.

\n\n

Returns:\n The result of the validation.

\n", "signature": "(\texpectation: great_expectations.expectations.expectation.Expectation,\tconfiguration: great_expectations.core.expectation_configuration.ExpectationConfiguration,\tmetrics: Dict,\tpartial_success: bool = True,\tpartial_result: dict = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction", "modulename": "lakehouse_engine.utils.extraction", "kind": "module", "doc": "

Extraction utilities package.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "kind": "module", "doc": "

Utilities module for JDBC extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType", "kind": "class", "doc": "

Standardize the types of extractions we can have from a JDBC source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.INIT", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.INIT", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.INIT: 'init'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionType.DELTA", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionType.DELTA", "kind": "variable", "doc": "

\n", "default_value": "<JDBCExtractionType.DELTA: 'delta'>"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from a JDBC source.

\n\n

These configurations cover:\n user: username to connect to JDBC source.\n password: password to connect to JDBC source (always use secrets,\n don't use text passwords in your code).\n url: url to connect to JDBC source.\n dbtable: database.table to extract data from.\n calc_upper_bound_schema: custom schema used for the upper bound calculation.\n changelog_table: table of type changelog from which to extract data,\n when the extraction type is delta.\n partition_column: column used to split the extraction.\n latest_timestamp_data_location: data location (e.g., s3) containing the data\n to get the latest timestamp already loaded into bronze.\n latest_timestamp_data_format: the format of the dataset in\n latest_timestamp_data_location. Default: delta.\n extraction_type: type of extraction (delta or init). Default: \"delta\".\n driver: JDBC driver name. Default: \"com.sap.db.jdbc.Driver\".\n num_partitions: number of Spark partitions to split the extraction.\n lower_bound: lower bound to decide the partition stride.\n upper_bound: upper bound to decide the partition stride. If\n calculate_upper_bound is True, then upperBound will be\n derived by our upper bound optimizer, using the partition column.\n default_upper_bound: the value to use as default upper bound in case\n the result of the upper bound calculation is None. Default: \"1\".\n fetch_size: how many rows to fetch per round trip. Default: \"100000\".\n compress: enable network compression. Default: True.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.\n min_timestamp: min timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the location provided.\n In case this one is provided it has precedence and the calculation\n is not done.\n max_timestamp: max timestamp to consider to filter the changelog data.\n Default: None and automatically derived from the table having information\n about the extraction requests, their timestamps and their status.\n In case this one is provided it has precedence and the calculation\n is not done.\n generate_predicates: whether to generate predicates automatically or not.\n Default: False.\n predicates: list containing all values to partition (if generate_predicates\n is used, the manual values provided are ignored). Default: None.\n predicates_add_null: whether to consider null on predicates list.\n Default: True.\n extraction_timestamp: the timestamp of the extraction. Default: current time\n following the format \"%Y%m%d%H%M%S\".\n max_timestamp_custom_schema: custom schema used on the max_timestamp derivation\n from the table holding the extraction requests information.
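
A hedged construction sketch (credentials, url and table names are placeholders; in real code the password should come from a secret scope):

    from lakehouse_engine.utils.extraction.jdbc_extraction_utils import (
        JDBCExtraction,
        JDBCExtractionType,
        JDBCExtractionUtils,
    )

    # describe an init (full) extraction from a JDBC source
    extraction = JDBCExtraction(
        user='db_user',
        password='db_password',  # placeholder only: always read this from a secret store
        url='jdbc:sap://hostname:30015',
        dbtable='SCHEMA.TABLE',
        extraction_type=JDBCExtractionType.INIT.value,
        num_partitions=10,
        partition_column='RECORD_ID',
    )
    jdbc_utils = JDBCExtractionUtils(extraction)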

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: Optional[str] = None)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant JDBC sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.__init__", "kind": "function", "doc": "

Construct JDBCExtractionUtils.

\n\n

Args:\n jdbc_extraction: JDBC Extraction configurations. Can be of type:\n JDBCExtraction, SAPB4Extraction or SAPBWExtraction.

\n", "signature": "(jdbc_extraction: Any)"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_additional_spark_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_additional_spark_options", "kind": "function", "doc": "

Helper to get additional Spark Options initially passed.

\n\n

If people provide additional Spark options not covered by the util function\narguments (get_spark_jdbc_options), we need to consider them.\nThus, we update the options retrieved by the utils by checking if there is\nany Spark option initially provided that is not yet considered in the retrieved\noptions or function arguments, and if the value for that key is not None.\nIf these conditions are met, we add the options and return the complete dict.

\n\n

Args:\n input_spec: the input specification.\n options: dict with Spark options.\n ignore_options: list of options to be ignored by the process.\n Spark read has two different approaches to parallelize the\n reading process: one of them uses upper/lower bounds and the\n other uses predicates. These cannot be used at the same time,\n so you must choose one of them. When choosing predicates you\n cannot pass the lower and upper bound, nor the number of\n partitions and the partition column, otherwise Spark will\n interpret the execution as partitioned by upper and lower bound\n and will expect all of those variables to be filled.\n To avoid hardcoding all predicates in the acon, there is a\n feature that automatically generates all predicates for an init\n or delta load based on the input partition column. However, at\n the end of that process the partition column cannot be passed to\n the options, because we are choosing the predicates execution,\n which is why generating predicates requires passing some options\n to ignore.

\n\n

Returns:\n a dict with all the options passed as argument, plus the options that\n were initially provided, but were not used in the util\n (get_spark_jdbc_options).
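
For context, a hedged sketch of the two parallelization approaches mentioned above (jdbc_utils is assumed to be a JDBCExtractionUtils instance like the one sketched under JDBCExtraction, and the query is purely illustrative):

    # upper/lower bound approach: let the utils derive the query and options
    jdbc_args, options = jdbc_utils.get_spark_jdbc_options()

    # predicates approach: generate one predicate per distinct value of the partition column
    predicates = jdbc_utils.get_predicates(
        'SELECT DISTINCT(RECORD_ID) AS RECORD_ID FROM SCHEMA.TABLE'
    )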

\n", "signature": "(\tinput_spec: lakehouse_engine.core.definitions.InputSpec,\toptions: dict,\tignore_options: List = None) -> dict:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_predicates", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_predicates", "kind": "function", "doc": "

Get the predicates list, based on a predicates query.

\n\n

Args:\n predicates_query: query to use as the basis to get the distinct values for\n a specified column, based on which predicates are generated.

\n\n

Returns:\n List containing the predicates to use to split the extraction from\n JDBC sources.

\n", "signature": "(self, predicates_query: str) -> List:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_options", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_options", "kind": "function", "doc": "

Get the Spark options to extract data from a JDBC source.

\n\n

Returns:\n The Spark jdbc args dictionary, including the query to submit\n and also options args dictionary.

\n", "signature": "(self) -> Tuple[dict, dict]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "modulename": "lakehouse_engine.utils.extraction.jdbc_extraction_utils", "qualname": "JDBCExtractionUtils.get_spark_jdbc_optimal_upper_bound", "kind": "function", "doc": "

Get an optimal upperBound to properly split a Spark JDBC extraction.

\n\n

Returns:\n Either an int, date or timestamp to serve as upperBound Spark JDBC option.

\n", "signature": "(self) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP B4 extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes", "kind": "class", "doc": "

Standardize the types of ADSOs we can have for Extractions from SAP B4.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.AQ", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.AQ", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.AQ: 'AQ'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.CL", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.CL", "kind": "variable", "doc": "

\n", "annotation": ": str", "default_value": "<ADSOTypes.CL: 'CL'>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.ADSOTypes.SUPPORTED_TYPES", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "ADSOTypes.SUPPORTED_TYPES", "kind": "variable", "doc": "

\n", "annotation": ": list", "default_value": "<ADSOTypes.SUPPORTED_TYPES: ['AQ', 'CL']>"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP B4.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:\n latest_timestamp_input_col: the column containing the request timestamps\n in the dataset in latest_timestamp_data_location. Default: REQTSN.\n request_status_tbl: the name of the SAP B4 table having information\n about the extraction requests. Composed of database.table.\n Default: SAPHANADB.RSPMREQUEST.\n request_col_name: name of the column having the request timestamp to join\n with the request status table. Default: REQUEST_TSN.\n data_target: the data target to extract from. Used in the join operation with\n the request status table.\n act_req_join_condition: the join condition with the activation table\n can be changed using this property.\n Default: 'tbl.reqtsn = req.request_col_name'.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog table) or not.\n extra_cols_req_status_tbl: columns to be added from request status table.\n It needs to contain the prefix \"req.\". E.g. \"req.col1 as column_one,\n req.col2 as column_two\".\n request_status_tbl_filter: filter to use for filtering the request status table,\n influencing the calculation of the max timestamps and the delta extractions.\n adso_type: the type of ADSO that you are extracting from. Can be \"AQ\" or \"CL\".\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.\n custom_schema: specify custom_schema for particular columns of the\n returned dataframe in the init/delta extraction of the source table.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4Extraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: str = 'REQTSN DECIMAL(23,0)',\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(23,0)',\tlatest_timestamp_input_col: str = 'REQTSN',\trequest_status_tbl: str = 'SAPHANADB.RSPMREQUEST',\trequest_col_name: str = 'REQUEST_TSN',\tdata_target: Optional[str] = None,\tact_req_join_condition: Optional[str] = None,\tinclude_changelog_tech_cols: Optional[bool] = None,\textra_cols_req_status_tbl: Optional[str] = None,\trequest_status_tbl_filter: Optional[str] = None,\tadso_type: Optional[str] = None,\tdefault_max_timestamp: str = '1970000000000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP B4.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPB4ExtractionUtils.

\n\n

Args:\n sap_b4_extraction: SAP B4 Extraction configurations.

\n", "signature": "(\tsap_b4_extraction: lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4Extraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils.SAPB4ExtractionUtils.get_data_target", "modulename": "lakehouse_engine.utils.extraction.sap_b4_extraction_utils", "qualname": "SAPB4ExtractionUtils.get_data_target", "kind": "function", "doc": "

Get the data_target from the data_target option or derive it.

\n\n

By definition, the data_target is the same for the table and the changelog table:\nit is the string obtained by ignoring everything up to the last / and dropping the\nfirst and last characters after it. E.g., for a dbtable /BIC/abtable12, the\ndata_target would be btable1.

\n\n

Args:\n input_spec_opt: options from the input_spec.

\n\n

Returns:\n A string with the data_target.

\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "kind": "module", "doc": "

Utilities module for SAP BW extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction", "kind": "class", "doc": "

Configurations available for an Extraction from SAP BW.

\n\n

It inherits from JDBCExtraction configurations, so it can use\nand/or overwrite those configurations.

\n\n

These configurations cover:\n latest_timestamp_input_col: the column containing the actrequest timestamp\n in the dataset in latest_timestamp_data_location. Default:\n \"actrequest_timestamp\".\n act_request_table: the name of the SAP BW activation requests table.\n Composed of database.table. Default: SAPPHA.RSODSACTREQ.\n request_col_name: name of the column having the request to join\n with the activation request table. Default: actrequest.\n act_req_join_condition: the join condition into activation table\n can be changed using this property.\n Default: 'changelog_tbl.request = act_req.request_col_name'.\n odsobject: name of BW Object, used for joining with the activation request\n table to get the max actrequest_timestamp to consider while filtering\n the changelog table.\n include_changelog_tech_cols: whether to include the technical columns\n (usually coming from the changelog) table or not. Default: True.\n extra_cols_act_request: list of columns to be added from act request table.\n It needs to contain the prefix \"act_req.\". E.g. \"act_req.col1\n as column_one, act_req.col2 as column_two\".\n get_timestamp_from_act_request: whether to get init timestamp\n from act request table or assume current/given timestamp.\n sap_bw_schema: sap bw schema. Default: SAPPHA.\n max_timestamp_custom_schema: the custom schema to apply on the calculation of\n the max timestamp to consider for the delta extractions.\n Default: timestamp DECIMAL(23,0).\n default_max_timestamp: the timestamp to use as default, when it is not possible\n to derive one.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtraction"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtraction.__init__", "kind": "function", "doc": "

\n", "signature": "(\tuser: str,\tpassword: str,\turl: str,\tdbtable: str,\tcalc_upper_bound_schema: Optional[str] = None,\tchangelog_table: Optional[str] = None,\tpartition_column: Optional[str] = None,\tlatest_timestamp_data_location: Optional[str] = None,\tlatest_timestamp_data_format: str = 'delta',\textraction_type: str = 'delta',\tdriver: str = 'com.sap.db.jdbc.Driver',\tnum_partitions: Optional[int] = None,\tlower_bound: Union[int, float, str, NoneType] = None,\tupper_bound: Union[int, float, str, NoneType] = None,\tdefault_upper_bound: str = '1',\tfetch_size: str = '100000',\tcompress: bool = True,\tcustom_schema: Optional[str] = None,\tmin_timestamp: Optional[str] = None,\tmax_timestamp: Optional[str] = None,\tgenerate_predicates: bool = False,\tpredicates: Optional[List] = None,\tpredicates_add_null: bool = True,\textraction_timestamp: str = '20231018182628',\tmax_timestamp_custom_schema: str = 'timestamp DECIMAL(15,0)',\tlatest_timestamp_input_col: str = 'actrequest_timestamp',\tact_request_table: str = 'SAPPHA.RSODSACTREQ',\trequest_col_name: str = 'actrequest',\tact_req_join_condition: Optional[str] = None,\todsobject: Optional[str] = None,\tinclude_changelog_tech_cols: bool = True,\textra_cols_act_request: Optional[str] = None,\tget_timestamp_from_act_request: bool = False,\tsap_bw_schema: str = 'SAPPHA',\tdefault_max_timestamp: str = '197000000000000')"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from SAP BW.

\n", "bases": "lakehouse_engine.utils.extraction.jdbc_extraction_utils.JDBCExtractionUtils"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.__init__", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.__init__", "kind": "function", "doc": "

Construct SAPBWExtractionUtils.

\n\n

Args:\n sap_bw_extraction: SAP BW Extraction configurations.

\n", "signature": "(\tsap_bw_extraction: lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtraction)"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_changelog_table", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_changelog_table", "kind": "function", "doc": "

Get the changelog table, given an odsobject.

\n\n

Returns:\n String to use as changelog_table.

\n", "signature": "(self) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils.SAPBWExtractionUtils.get_odsobject", "modulename": "lakehouse_engine.utils.extraction.sap_bw_extraction_utils", "qualname": "SAPBWExtractionUtils.get_odsobject", "kind": "function", "doc": "

Get the odsobject based on the provided options.

\n\n

With the table name we may also get the db name, so we need to split it.\nMoreover, people may need to specify the odsobject explicitly if\nit is different from the dbtable.

\n\n

Args:\n input_spec_opt: options from the input_spec.

\n\n

Returns:\n A string with the odsobject.

\n", "signature": "(input_spec_opt: dict) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "kind": "module", "doc": "

Utilities module for SFTP extraction processes.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat", "kind": "class", "doc": "

Formats of algorithm input.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.CSV", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.CSV", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.CSV: 'csv'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.FWF", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.FWF", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.FWF: 'fwf'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.JSON", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.JSON", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.JSON: 'json'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPInputFormat.XML", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPInputFormat.XML", "kind": "variable", "doc": "

\n", "default_value": "<SFTPInputFormat.XML: 'xml'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter", "kind": "class", "doc": "

Standardize the types of filters we can have from an SFTP source.

\n", "bases": "enum.Enum"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.file_name_contains", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.file_name_contains", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.file_name_contains: 'file_name_contains'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LATEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LATEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LATEST_FILE: 'latest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.EARLIEST_FILE", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.EARLIEST_FILE", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.EARLIEST_FILE: 'earliest_file'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.GREATER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.GREATER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.GREATER_THAN: 'date_time_gt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionFilter.LOWER_THAN", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionFilter.LOWER_THAN", "kind": "variable", "doc": "

\n", "default_value": "<SFTPExtractionFilter.LOWER_THAN: 'date_time_lt'>"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils", "kind": "class", "doc": "

Utils for managing data extraction from particularly relevant SFTP sources.

\n"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_files_list", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_files_list", "kind": "function", "doc": "

Get a list of files to be extracted from SFTP.

\n\n

The arguments (options_args) to list files are:\ndate_time_gt(str):\n Filter the files with a datetime greater than the given string datetime,\n formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\ndate_time_lt(str):\n Filter the files with a datetime lower than the given string datetime,\n formatted as \"YYYY-MM-DD\" or \"YYYY-MM-DD HH:MM:SS\".\nearliest_file(bool):\n Filter the earliest dated file in the directory.\nfile_name_contains(str):\n Filter files that match the pattern.\nlatest_file(bool):\n Filter the most recent dated file in the directory.\nsub_dir(bool):\n When true, the engine will search for files in subdirectories\n of the remote_path.\n It will consider one level below the remote_path.\n When sub_dir is used with the latest_file/earliest_file argument,\n the engine will retrieve the latest_file/earliest_file\n for each subdirectory.

\n\n

Args:\n sftp: the SFTP client object.\n remote_path: path of files to be filtered.\n options_args: options from the acon.

\n\n

Returns:\n A list containing the file names to be passed to Spark.
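
A hedged end-to-end sketch (host, credentials, paths and filters are placeholders):

    from lakehouse_engine.utils.extraction.sftp_extraction_utils import SFTPExtractionUtils

    options = {
        'hostname': 'sftp.example.com',
        'username': 'user',
        'password': 'secret',  # placeholder only: read from a secret store in real code
        'latest_file': True,
        'file_name_contains': 'sales_',
    }
    sftp, transport = SFTPExtractionUtils.get_sftp_client(options)
    try:
        files = SFTPExtractionUtils.get_files_list(sftp, '/inbound/sales/', options)
    finally:
        transport.close()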

\n", "signature": "(\tcls,\tsftp: paramiko.sftp_client.SFTPClient,\tremote_path: str,\toptions_args: dict) -> Set[str]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.get_sftp_client", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.get_sftp_client", "kind": "function", "doc": "

Get the SFTP client.

\n\n

The SFTP client is used to open an SFTP session across an open\nSSH Transport and perform remote file operations.

\n\n

Args:\n options_args: dictionary containing SFTP connection parameters.\n The Paramiko arguments expected to connect are:\n \"hostname\": the server to connect to.\n \"port\": the server port to connect to.\n \"username\": the username to authenticate as.\n \"password\": used for password authentication.\n \"pkey\": optional - an optional public key to use for authentication.\n \"passphrase\" \u2013 optional - options used for decrypting private keys.\n \"key_filename\" \u2013 optional - the filename, or list of filenames,\n of optional private key(s) and/or certs to try for authentication.\n \"timeout\" \u2013 an optional timeout (in seconds) for the TCP connect.\n \"allow_agent\" \u2013 optional - set to False to disable\n connecting to the SSH agent.\n \"look_for_keys\" \u2013 optional - set to False to disable searching\n for discoverable private key files in ~/.ssh/.\n \"compress\" \u2013 optional - set to True to turn on compression.\n \"sock\" - optional - an open socket or socket-like object\n to use for communication to the target host.\n \"gss_auth\" \u2013 optional - True if you want to use GSS-API authentication.\n \"gss_kex\" \u2013 optional - Perform GSS-API Key Exchange and\n user authentication.\n \"gss_deleg_creds\" \u2013 optional - Delegate GSS-API client\n credentials or not.\n \"gss_host\" \u2013 optional - The targets name in the kerberos database.\n \"gss_trust_dns\" \u2013 optional - Indicates whether or\n not the DNS is trusted to securely canonicalize the name of the\n host being connected to (default True).\n \"banner_timeout\" \u2013 an optional timeout (in seconds)\n to wait for the SSH banner to be presented.\n \"auth_timeout\" \u2013 an optional timeout (in seconds)\n to wait for an authentication response.\n \"disabled_algorithms\" \u2013 an optional dict passed directly to Transport\n and its keyword argument of the same name.\n \"transport_factory\" \u2013 an optional callable which is handed a subset of\n the constructor arguments (primarily those related to the socket,\n GSS functionality, and algorithm selection) and generates a\n Transport instance to be used by this client.\n Defaults to Transport.__init__.

\n\n
The parameter to specify the private key is expected to be in RSA format.\nAttempting a connection with a blank host key is not allowed\nunless the argument \"add_auto_policy\" is explicitly set to True.\n
\n\n

Returns:\n sftp -> a new SFTPClient session object.\n transport -> the Transport for this connection.

\n", "signature": "(\tcls,\toptions_args: dict) -> Tuple[paramiko.sftp_client.SFTPClient, paramiko.transport.Transport]:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_format", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_format", "kind": "function", "doc": "

Validate the file extension based on the format definitions.

\n\n

Args:\n files_format: a string containing the file extension.

\n\n

Returns:\n The string validated and formatted.

\n", "signature": "(cls, files_format: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.extraction.sftp_extraction_utils.SFTPExtractionUtils.validate_location", "modulename": "lakehouse_engine.utils.extraction.sftp_extraction_utils", "qualname": "SFTPExtractionUtils.validate_location", "kind": "function", "doc": "

Validate the location. Add \"/\" in the case it does not exist.

\n\n

Args:\n location: file path.

\n\n

Returns:\n The location validated.

\n", "signature": "(cls, location: str) -> str:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.file_utils", "modulename": "lakehouse_engine.utils.file_utils", "kind": "module", "doc": "

Utilities for file name based operations.

\n"}, {"fullname": "lakehouse_engine.utils.file_utils.get_file_names_without_file_type", "modulename": "lakehouse_engine.utils.file_utils", "qualname": "get_file_names_without_file_type", "kind": "function", "doc": "

Function to retrieve list of file names in a folder.

\n\n

This function filters by file type and removes the extension from the file names\nit returns.

\n\n

Args:\n path: path to the folder to list files from.\n file_type: type of the files to include in the list.\n exclude_regex: regex of file names to exclude.

\n\n

Returns:\n A list of file names without file type.
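
A small illustrative call (the path, file type and regex are placeholders):

    from lakehouse_engine.utils.file_utils import get_file_names_without_file_type

    # list the sql file names in a folder, excluding anything that looks like a backup
    names = get_file_names_without_file_type(
        path='/dbfs/mnt/ddl/', file_type='.sql', exclude_regex='.*backup.*'
    )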

\n", "signature": "(path: str, file_type: str, exclude_regex: str) -> list:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler", "modulename": "lakehouse_engine.utils.logging_handler", "kind": "module", "doc": "

Module to configure project logging.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData", "kind": "class", "doc": "

Logging filter to hide sensitive data from being shown in the logs.

\n", "bases": "logging.Filter"}, {"fullname": "lakehouse_engine.utils.logging_handler.FilterSensitiveData.filter", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "FilterSensitiveData.filter", "kind": "function", "doc": "

Hide sensitive information from being shown in the logs.

\n\n

Based on the configured regex and replace strings, the content of the log\nrecords is replaced and then all the records are allowed to be logged\n(return True).

\n\n

Args:\n record: the LogRecord event being logged.

\n\n

Returns:\n The transformed record to be logged.

\n", "signature": "(self, record: logging.LogRecord) -> bool:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler", "kind": "class", "doc": "

Handle the logging of the lakehouse engine project.

\n"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.__init__", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.__init__", "kind": "function", "doc": "

Construct a LoggingHandler instance.

\n\n

Args:\n class_name: name of the class to be indicated in the logs.

\n", "signature": "(class_name: str)"}, {"fullname": "lakehouse_engine.utils.logging_handler.LoggingHandler.get_logger", "modulename": "lakehouse_engine.utils.logging_handler", "qualname": "LoggingHandler.get_logger", "kind": "function", "doc": "

Get the _logger instance variable.

\n\n

Returns:\n The logger object.

\n", "signature": "(self) -> logging.Logger:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils", "modulename": "lakehouse_engine.utils.schema_utils", "kind": "module", "doc": "

Utilities to facilitate dataframe schema management.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils", "kind": "class", "doc": "

Schema utils that help retrieve and manage schemas of dataframes.

\n"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file", "kind": "function", "doc": "

Get a spark schema from a file (spark StructType json file) in a file system.

\n\n

Args:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema struct type.
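
A minimal sketch (the file locations are placeholders and spark is an assumed active SparkSession):

    from lakehouse_engine.utils.schema_utils import SchemaUtils

    # load a Spark StructType from a StructType json file and use it when reading data
    schema = SchemaUtils.from_file('s3://my-bucket/schemas/orders.json')
    df = spark.read.schema(schema).json('s3://my-bucket/raw/orders/')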

\n", "signature": "(file_path: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_file_to_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_file_to_dict", "kind": "function", "doc": "

Get a dict with the spark schema from a file in a file system.

\n\n

Args:\n file_path: path of the file in a file system. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema in a dict.

\n", "signature": "(file_path: str) -> Any:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_dict", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_dict", "kind": "function", "doc": "

Get a spark schema from a dict.

\n\n

Args:\n struct_type: dict containing a spark schema structure. Check here:\n https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/\n StructType.html

\n\n

Returns:\n Spark schema struct type.

\n", "signature": "(struct_type: dict) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_table_schema", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_table_schema", "kind": "function", "doc": "

Get a spark schema from a table.

\n\n

Args:\n table: table name from which to inherit the schema.

\n\n

Returns:\n Spark schema struct type.

\n", "signature": "(table: str) -> pyspark.sql.types.StructType:", "funcdef": "def"}, {"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "

{"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.from_input_spec", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.from_input_spec", "kind": "function", "doc": "Get a spark schema from an input specification.\n\nThis covers scenarios where the schema is provided as part of the input specification of the algorithm. The schema can come either from the table referenced in the input specification (enforce_schema_from_table) or from the spark schema dict provided there.\n\nArgs:\n    input_spec: input specification.\n\nReturns:\n    Spark schema struct type.\n", "signature": "(\tcls,\tinput_spec: lakehouse_engine.core.definitions.InputSpec) -> Optional[pyspark.sql.types.StructType]:", "funcdef": "def"},

{"fullname": "lakehouse_engine.utils.schema_utils.SchemaUtils.schema_flattener", "modulename": "lakehouse_engine.utils.schema_utils", "qualname": "SchemaUtils.schema_flattener", "kind": "function", "doc": "Recursive method to flatten the schema of the dataframe.\n\nArgs:\n    schema: schema to be flattened.\n    prefix: prefix of the struct to get the value for. Only relevant for being used in the internal recursive logic.\n    level: level of the depth in the schema being flattened. Only relevant for being used in the internal recursive logic.\n    max_level: level until which you want to flatten the schema. Default: None.\n    shorten_names: whether to shorten the names of the prefixes of the fields being flattened or not. Default: False.\n    alias: whether to define alias for the columns being flattened or not. Default: True.\n    num_chars: number of characters to consider when shortening the names of the fields. Default: 7.\n    ignore_cols: columns which you don't want to flatten. Default: None.\n\nReturns:\n    A list of flattened columns, typically used inside a function passed to the spark .transform() function.\n", "signature": "(\tschema: pyspark.sql.types.StructType,\tprefix: str = None,\tlevel: int = 1,\tmax_level: int = None,\tshorten_names: bool = False,\talias: bool = True,\tnum_chars: int = 7,\tignore_cols: List = None) -> List:", "funcdef": "def"},

{"fullname": "lakehouse_engine.utils.storage", "modulename": "lakehouse_engine.utils.storage", "kind": "module", "doc": "Utilities to interact with storage systems.\n"},
{"fullname": "lakehouse_engine.utils.storage.file_storage", "modulename": "lakehouse_engine.utils.storage.file_storage", "kind": "module", "doc": "Module for abstract representation of a storage system holding files.\n"},
{"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage", "kind": "class", "doc": "Abstract file storage class.\n", "bases": "abc.ABC"},
{"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.\n\nArgs:\n    url: url of the file.\n\nReturns:\n    File payload/content.\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"},
{"fullname": "lakehouse_engine.utils.storage.file_storage.FileStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.file_storage", "qualname": "FileStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.\n\nArgs:\n    url: url of the file.\n    content: content to write into the file.\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"},

{"fullname": "lakehouse_engine.utils.storage.file_storage_functions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "kind": "module", "doc": "Module for common file storage functions.\n"},
{"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions", "kind": "class", "doc": "Class for common file storage functions.\n", "bases": "abc.ABC"},
{"fullname": "lakehouse_engine.utils.storage.file_storage_functions.FileStorageFunctions.read_json", "modulename": "lakehouse_engine.utils.storage.file_storage_functions", "qualname": "FileStorageFunctions.read_json", "kind": "function", "doc": "Read a json file.\n\nThe file should be in a supported file system (e.g., s3 or local filesystem - for local tests only).\n\nArgs:\n    path: path to the json file.\n\nReturns:\n    Dict with json file content.\n", "signature": "(path: str) -> Any:", "funcdef": "def"},

{"fullname": "lakehouse_engine.utils.storage.local_fs_storage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "kind": "module", "doc": "Module to represent a local file storage system.\n"},
{"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage", "kind": "class", "doc": "Class to represent a local file storage system.\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"},
{"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.get_file_payload", "kind": "function", "doc": "Get the payload of a file.\n\nArgs:\n    url: url of the file.\n\nReturns:\n    File payload/content.\n", "signature": "(cls, url: urllib.parse.ParseResult) -> TextIO:", "funcdef": "def"},
{"fullname": "lakehouse_engine.utils.storage.local_fs_storage.LocalFSStorage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.local_fs_storage", "qualname": "LocalFSStorage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.\n\nArgs:\n    url: url of the file.\n    content: content to write into the file.\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"},

{"fullname": "lakehouse_engine.utils.storage.s3_storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "kind": "module", "doc": "Module to represent an s3 file storage system.\n"},
{"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage", "kind": "class", "doc": "Class to represent an s3 file storage system.\n", "bases": "lakehouse_engine.utils.storage.file_storage.FileStorage"},
{"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.get_file_payload", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.get_file_payload", "kind": "function", "doc": "Get the payload of a config file.\n\nArgs:\n    url: url of the file.\n\nReturns:\n    File payload/content.\n", "signature": "(cls, url: urllib.parse.ParseResult) -> Any:", "funcdef": "def"},
{"fullname": "lakehouse_engine.utils.storage.s3_storage.S3Storage.write_payload_to_file", "modulename": "lakehouse_engine.utils.storage.s3_storage", "qualname": "S3Storage.write_payload_to_file", "kind": "function", "doc": "Write payload into a file.\n\nArgs:\n    url: url of the file.\n    content: content to write into the file.\n", "signature": "(cls, url: urllib.parse.ParseResult, content: str) -> None:", "funcdef": "def"}];
// mirrored in build-search-index.js (part 1)
// Also split on html tags. this is a cheap heuristic, but good enough.