
Commit

fix: fixed dataframe loader errors - not handling none returns correctly with proper errors [2024-11-27]
CHRISCARLON committed Nov 27, 2024
1 parent 5d361b4 commit 1e36042
Showing 4 changed files with 201 additions and 208 deletions.
121 changes: 50 additions & 71 deletions HerdingCats/data_loader/data_loader.py
@@ -55,98 +55,77 @@ def __init__(self):
     # ----------------------------
     # Load data into a variety of formats for aggregation and analysis
     # ----------------------------
-    def polars_data_loader(
-        self, resource_data: Optional[List]
-    ) -> Optional[pl.DataFrame]:
+    def polars_data_loader(self, resource_data: Optional[List]) -> pl.DataFrame:
         """
-        Isolate a specific resource using the Explorer Class.
-        Load a resource into a dataframe for further exploration.
-
-        # Example usage...
-        import HerdingCats as hc
-
-        def main():
-            with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
-                explore = hc.CkanCatExplorer(session)
-                all_packages = explore.package_list_dictionary()
-                data = all_packages.get("number-bicycle-hires")
-                info = explore.package_show_info_json(data)
-                resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
-                resource_loader = hc.CkanCatResourceLoader()
-                polars_df = resource_loader.polars_data_loader(resource_list)
-                print(polars_df)
-
-        if __name__ =="__main__":
-            main()
+        Load a resource into a Polars DataFrame from a URL.
+        Supports Excel (.xlsx) and CSV formats.
+
+        Args:
+            resource_data: List containing [format, url] of the resource
+
+        Returns:
+            Optional[pl.DataFrame]: Loaded DataFrame or None if loading fails
         """
-        if resource_data:
+        try:
+            if not resource_data:
+                logger.error("No resource data provided")
+                raise
+
             url = resource_data[1]
+            file_format = resource_data[0].lower()

             response = requests.get(url)
             response.raise_for_status()
             binary_data = BytesIO(response.content)

-            file_format = resource_data[0]
-
-            if file_format and (
-                file_format.lower() == "spreadsheet" or file_format.lower() == "xlsx"
-            ):
-                df = pl.read_excel(binary_data)
-                return df
-            elif file_format and file_format.lower() == "csv":
-                df = pl.read_csv(binary_data)
-                return df
+            if file_format in ["spreadsheet", "xlsx"]:
+                return pl.read_excel(binary_data)
+            elif file_format == "csv":
+                return pl.read_csv(binary_data)
             else:
-                logger.error("Error")
-        else:
-            logger.error("Error")
+                logger.error(f"Unsupported format: {file_format}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Failed to load data: {str(e)}")
+            raise
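Because the loader now raises on bad input or an unsupported format instead of returning None, callers handle failures with try/except rather than checking for a None return. A minimal caller-side sketch, reusing the London Data Store walkthrough from the removed docstring (the broad except is an assumption; the exact exception type surfaced by the library is not pinned down in this diff):

import HerdingCats as hc

def main():
    with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
        explore = hc.CkanCatExplorer(session)
        all_packages = explore.package_list_dictionary()
        data = all_packages.get("number-bicycle-hires")
        info = explore.package_show_info_json(data)
        resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
        resource_loader = hc.CkanCatResourceLoader()
        try:
            # polars_data_loader now raises on failure instead of returning None
            polars_df = resource_loader.polars_data_loader(resource_list)
            print(polars_df)
        except Exception as e:
            print(f"Could not load resource: {e}")

if __name__ == "__main__":
    main()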

     def pandas_data_loader(
         self, resource_data: Optional[List]
-    ) -> Optional[pd.DataFrame]:
+    ) -> pd.DataFrame:
         """
-        Isolate a specific resource using the Explorer Class.
-        Load a resource into a dataframe for further exploration.
-
-        # Example usage...
-        import HerdingCats as hc
-
-        def main():
-            with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
-                explore = hc.CkanCatExplorer(session)
-                all_packages = explore.package_list_dictionary()
-                data = all_packages.get("number-bicycle-hires")
-                info = explore.package_show_info_json(data)
-                resource_list = explore.extract_resource_url(info, "tfl-daily-cycle-hires.xls")
-                resource_loader = hc.CkanCatResourceLoader()
-                pandas_df = resource_loader.pandas_data_loader(resource_list)
-                print(pandas_df)
-
-        if __name__ =="__main__":
-            main()
+        Load a resource into a Polars DataFrame from a URL.
+        Supports Excel (.xlsx) and CSV formats.
+
+        Args:
+            resource_data: List containing [format, url] of the resource
+
+        Returns:
+            Optional[pl.DataFrame]: Loaded DataFrame or None if loading fails
         """
-        if resource_data:
+        try:
+            if not resource_data:
+                logger.error("No resource data provided")
+                raise
+
             url = resource_data[1]
+            file_format = resource_data[0].lower()

             response = requests.get(url)
             response.raise_for_status()
             binary_data = BytesIO(response.content)

-            file_format = resource_data[0]
-
-            if file_format and (
-                file_format.lower() == "spreadsheet" or file_format.lower() == "xlsx"
-            ):
-                df = pd.read_excel(binary_data)
-                return df
-            elif file_format and file_format.lower() == "csv":
-                df = pd.read_csv(binary_data)
-                return df
+            if file_format in ["spreadsheet", "xlsx"]:
+                return pd.read_excel(binary_data)
+            elif file_format == "csv":
+                return pd.read_csv(binary_data)
             else:
-                logger.error("Error")
-        else:
-            logger.error("Error")
+                logger.error(f"Unsupported format: {file_format}")
+                raise
+
+        except Exception as e:
+            logger.error(f"Failed to load data: {str(e)}")
+            raise
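The pandas loader keeps the same input contract: resource_data is a two-element list of [format, url], where the format string is matched against "spreadsheet", "xlsx", or "csv" after lower-casing. An illustrative call with a hand-built list (the URL below is a placeholder, not a real catalogue resource):

import HerdingCats as hc

resource_loader = hc.CkanCatResourceLoader()

# resource_data is [format, url]; this URL is a stand-in for a real CKAN resource.
resource_data = ["csv", "https://example.com/some-resource.csv"]

try:
    pandas_df = resource_loader.pandas_data_loader(resource_data)
    print(pandas_df.head())
except Exception as e:
    print(f"Could not load resource: {e}")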

     def duckdb_data_loader(
         self, resource_data: Optional[List], duckdb_name: str, table_name: str
24 changes: 12 additions & 12 deletions HerdingCats/explorer/cat_explore.py
@@ -430,7 +430,7 @@ def catalogue_freshness(self):
             for entry in dictionary_prep
         ]

-        df = self._duckdb_explore(
+        df = self.__duckdb_explore(
             dictionary_data,
             "freshness",
             """
@@ -492,7 +492,7 @@ def package_show_info_json(self, package_name: Union[str, dict, Any]) -> List[Di
             data = response.json()
             result_data = data["result"]

-            return self._extract_resource_data(result_data)
+            return self.__extract_resource_data(result_data)

         except requests.RequestException as e:
             raise CatExplorerError(f"Failed to search datasets: {str(e)}")
@@ -595,7 +595,7 @@ def main():
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            return self._extract_condensed_package_data(
+            return self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown"],
                 ["name", "created", "format", "url"],
@@ -685,7 +685,7 @@ def package_search_condense_dataframe_packed(
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            extracted_data = self._extract_condensed_package_data(
+            extracted_data = self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown", "num_resources"],
                 ["name", "created", "format", "url"],
@@ -791,16 +791,16 @@ def package_search_condense_dataframe_unpacked(
                     "Neither 'result' nor 'results' key found in the API response"
                 )

-            extracted_data = self._extract_condensed_package_data(
+            extracted_data = self.__extract_condensed_package_data(
                 result_data,
                 ["name", "notes_markdown"],
                 ["name", "created", "format", "url"],
             )

             if df_type.lower() == "polars":
-                return self._create_polars_dataframe(extracted_data)
+                return self.__create_polars_dataframe(extracted_data)
             else: # pandas
-                return self._create_pandas_dataframe(extracted_data)
+                return self.__create_pandas_dataframe(extracted_data)

         except requests.RequestException as e:
             raise CatExplorerError(f"Failed to search datasets: {str(e)}")
@@ -853,7 +853,7 @@ def extract_resource_url(
         return None

     @staticmethod
-    def _extract_condensed_package_data(
+    def __extract_condensed_package_data(
         data: List[Dict[str, Any]], fields: List[str], resource_fields: List[str]
     ) -> List[Dict[str, Any]]:
         """
@@ -900,7 +900,7 @@ def _extract_condensed_package_data(
         ]

     @staticmethod
-    def _create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
+    def __create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
         """TBC"""
         df = pd.json_normalize(
             data,
@@ -911,7 +911,7 @@ def _create_pandas_dataframe(data: List[Dict[str, Any]]) -> pd.DataFrame:
         return df

     @staticmethod
-    def _create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
+    def __create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
         """TBC"""
         df = pl.DataFrame(data)
         return (
@@ -926,7 +926,7 @@ def _create_polars_dataframe(data: List[Dict[str, Any]]) -> pl.DataFrame:
         )

     @staticmethod
-    def _extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
+    def __extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
         Extracts specific fields for a specific package and creates a list of dictionaries,
         one for each resource, containing the specified fields.
@@ -954,7 +954,7 @@ def _extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
         return result

     @staticmethod
-    def _duckdb_explore(
+    def __duckdb_explore(
         data: List[Dict[str, Any]],
         table_name: str,
         query: str = "",
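The cat_explore.py side of the commit renames the single-underscore helpers to double-underscore names, which switches on Python's name mangling and keeps the helpers internal to the explorer class. A toy sketch of the effect (Example is a stand-in class, not part of HerdingCats):

class Example:
    @staticmethod
    def __helper():
        # Double leading underscores: stored as _Example__helper, so the plain
        # name __helper is not visible on the class from outside.
        return "internal"

    def call_helper(self):
        # Inside the class body the mangled name resolves as usual.
        return self.__helper()

print(Example().call_helper())        # internal
print(hasattr(Example, "__helper"))   # False - the attribute name was mangled
print(Example._Example__helper())     # internal - reachable only via the mangled name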

0 comments on commit 1e36042
