
Commit

fix: fixed dataframe loader errors - not handling none returns correctly with proper errors [2024-11-27]
CHRISCARLON committed Nov 27, 2024
1 parent 5d361b4 commit 1e36042
Showing 4 changed files with 201 additions and 208 deletions.
121 changes: 50 additions & 71 deletions HerdingCats/data_loader/data_loader.py
@@ -55,98 +55,77 @@ def __init__(self):
     # ----------------------------
     # Load data into a variety of formats for aggregation and analysis
     # ----------------------------
-    def polars_data_loader(
-        self, resource_data: Optional[List]
-    ) -> Optional[pl.DataFrame]:
+    def polars_data_loader(self, resource_data: Optional[List]) -> pl.DataFrame:
         """
-        Isolate a specific resource using the Explorer Class.
-        Load a resource into a dataframe for further exploration.
-
-        # Example usage...
-        import HerdingCats as hc
-
-        def main():
-            with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
-                explore = hc.CkanCatExplorer(session)
-                all_packages = explore.package_list_dictionary()
-                data = all_packages.get("number-bicycle-hires")
-                info = explore.package_show_info_json(data)
-                resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
-                resource_loader = hc.CkanCatResourceLoader()
-                polars_df = resource_loader.polars_data_loader(resource_list)
-                print(polars_df)
-
-        if __name__ =="__main__":
-            main()
+        Load a resource into a Polars DataFrame from a URL.
+        Supports Excel (.xlsx) and CSV formats.
+
+        Args:
+            resource_data: List containing [format, url] of the resource
+
+        Returns:
+            Optional[pl.DataFrame]: Loaded DataFrame or None if loading fails
         """
-        if resource_data:
+        try:
+            if not resource_data:
+                logger.error("No resource data provided")
+                raise
+
             url = resource_data[1]
+            file_format = resource_data[0].lower()

             response = requests.get(url)
             response.raise_for_status()
             binary_data = BytesIO(response.content)

-            file_format = resource_data[0]
-
-            if file_format and (
-                file_format.lower() == "spreadsheet" or file_format.lower() == "xlsx"
-            ):
-                df = pl.read_excel(binary_data)
-                return df
-            elif file_format and file_format.lower() == "csv":
-                df = pl.read_csv(binary_data)
-                return df
+            if file_format in ["spreadsheet", "xlsx"]:
+                return pl.read_excel(binary_data)
+            elif file_format == "csv":
+                return pl.read_csv(binary_data)
             else:
-                logger.error("Error")
-        else:
-            logger.error("Error")
+                logger.error(f"Unsupported format: {file_format}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Failed to load data: {str(e)}")
+            raise
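Because the loader now raises on bad input or an unsupported format instead of returning None, callers handle failures with try/except rather than checking for a None return. A minimal caller-side sketch, reusing the London Data Store walkthrough from the removed docstring (the broad except is an assumption; the exact exception type surfaced by the library is not pinned down in this diff):

import HerdingCats as hc

def main():
    with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
        explore = hc.CkanCatExplorer(session)
        all_packages = explore.package_list_dictionary()
        data = all_packages.get("number-bicycle-hires")
        info = explore.package_show_info_json(data)
        resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
        resource_loader = hc.CkanCatResourceLoader()
        try:
            # polars_data_loader now raises on failure instead of returning None
            polars_df = resource_loader.polars_data_loader(resource_list)
            print(polars_df)
        except Exception as e:
            print(f"Could not load resource: {e}")

if __name__ == "__main__":
    main()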

     def pandas_data_loader(
         self, resource_data: Optional[List]
-    ) -> Optional[pd.DataFrame]:
+    ) -> pd.DataFrame:
         """
-        Isolate a specific resource using the Explorer Class.
-        Load a resource into a dataframe for further exploration.
-
-        # Example usage...
-        import HerdingCats as hc
-
-        def main():
-            with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
-                explore = hc.CkanCatExplorer(session)
-                all_packages = explore.package_list_dictionary()
-                data = all_packages.get("number-bicycle-hires")
-                info = explore.package_show_info_json(data)
-                resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
-                resource_loader = hc.CkanCatResourceLoader()
-                pandas_df = resource_loader.pandas_data_loader(resource_list)
-                print(pandas_df)
-
-        if __name__ =="__main__":
-            main()
+        Load a resource into a Polars DataFrame from a URL.
+        Supports Excel (.xlsx) and CSV formats.
+
+        Args:
+            resource_data: List containing [format, url] of the resource
+
+        Returns:
+            Optional[pl.DataFrame]: Loaded DataFrame or None if loading fails
         """
-        if resource_data:
+        try:
+            if not resource_data:
+                logger.error("No resource data provided")
+                raise
+
             url = resource_data[1]
+            file_format = resource_data[0].lower()

             response = requests.get(url)
             response.raise_for_status()
             binary_data = BytesIO(response.content)

-            file_format = resource_data[0]
-
-            if file_format and (
-                file_format.lower() == "spreadsheet" or file_format.lower() == "xlsx"
-            ):
-                df = pd.read_excel(binary_data)
-                return df
-            elif file_format and file_format.lower() == "csv":
-                df = pd.read_csv(binary_data)
-                return df
+            if file_format in ["spreadsheet", "xlsx"]:
+                return pd.read_excel(binary_data)
+            elif file_format == "csv":
+                return pd.read_csv(binary_data)
             else:
-                logger.error("Error")
-        else:
-            logger.error("Error")
+                logger.error(f"Unsupported format: {file_format}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Failed to load data: {str(e)}")
+            raise
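The pandas loader keeps the same input contract: resource_data is a two-element list of [format, url], where the format string is matched against "spreadsheet", "xlsx", or "csv" after lower-casing. An illustrative call with a hand-built list (the URL below is a placeholder, not a real catalogue resource):

import HerdingCats as hc

resource_loader = hc.CkanCatResourceLoader()

# resource_data is [format, url]; this URL is a stand-in for a real CKAN resource.
resource_data = ["csv", "https://example.com/some-resource.csv"]

try:
    pandas_df = resource_loader.pandas_data_loader(resource_data)
    print(pandas_df.head())
except Exception as e:
    print(f"Could not load resource: {e}")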

     def duckdb_data_loader(
         self, resource_data: Optional[List], duckdb_name: str, table_name: str
24 changes: 12 additions & 12 deletions HerdingCats/explorer/cat_explore.py
@@ -430,7 +430,7 @@ def catalogue_freshness(self):
             for entry in dictionary_prep
         ]

-        df = self._duckdb_explore(
+        df = self.__duckdb_explore(
             dictionary_data,
             "freshness",
             """
@@ -492,7 +492,7 @@ def package_show_info_json(self, package_name: Union[str, dict, Any]) -> List[Di
             data = response.json()
             result_data = data["result"]

-            return self._extract_resource_data(result_data)
+            return self.__extract_resource_data(result_data)

         except requests.RequestException as e:
             raise CatExplorerError(f"Failed to search datasets: {str(e)}")
@@ -595,7 +595,7 @@ def main():
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            return self._extract_condensed_package_data(
+            return self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown"],
                 ["name", "created", "format", "url"],
@@ -685,7 +685,7 @@ def package_search_condense_dataframe_packed(
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            extracted_data = self._extract_condensed_package_data(
+            extracted_data = self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown", "num_resources"],
                 ["name", "created", "format", "url"],
@@ -791,16 +791,16 @@ def package_search_condense_dataframe_unpacked(
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            extracted_data = self._extract_condensed_package_data(
+            extracted_data = self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown"],
                 ["name", "created", "format", "url"],
             )

             if df_type.lower() == "polars":
-                return self._create_polars_dataframe(extracted_data)
+                return self.__create_polars_dataframe(extracted_data)
             else: # pandas
-                return self._create_pandas_dataframe(extracted_data)
+                return self.__create_pandas_dataframe(extracted_data)

         except requests.RequestException as e:
             raise CatExplorerError(f"Failed to search datasets: {str(e)}")
@@ -853,7 +853,7 @@ def extract_resource_url(
         return None

     @staticmethod
-    def _extract_condensed_package_data(
+    def __extract_condensed_package_data(
         data: List[Dict[str, Any]], fields: List[str], resource_fields: List[str]
     ) -> List[Dict[str, Any]]:
         """
@@ -900,7 +900,7 @@ def _extract_condensed_package_data(
         ]

     @staticmethod
-    def _create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
+    def __create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
         """TBC"""
         df = pd.json_normalize(
             data,
@@ -911,7 +911,7 @@ def _create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
         return df

     @staticmethod
-    def _create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
+    def __create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
         """TBC"""
         df = pl.DataFrame(data)
         return (
@@ -926,7 +926,7 @@ def _create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
         )

     @staticmethod
-    def _extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def __extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Extracts specific fields for a specific package and creates a list of dictionaries,
         one for each resource, containing the specified fields.
@@ -954,7 +954,7 @@ def _extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         return result

     @staticmethod
-    def _duckdb_explore(
+    def __duckdb_explore(
         data: List[Dict[str, Any]],
         table_name: str,
         query: str = "",
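The cat_explore.py side of the commit renames the single-underscore helpers to double-underscore names, which switches on Python's name mangling and keeps the helpers internal to the explorer class. A toy sketch of the effect (Example is a stand-in class, not part of HerdingCats):

class Example:
    @staticmethod
    def __helper():
        # Double leading underscores: stored as _Example__helper, so the plain
        # name __helper is not visible on the class from outside.
        return "internal"

    def call_helper(self):
        # Inside the class body the mangled name resolves as usual.
        return self.__helper()

print(Example().call_helper())        # internal
print(hasattr(Example, "__helper"))   # False - the attribute name was mangled
print(Example._Example__helper())     # internal - reachable only via the mangled name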

0 comments on commit 1e36042
