From 2eadf757456cfe79584695ba0cf58463c912725c Mon Sep 17 00:00:00 2001 From: Chris Carlon Date: Thu, 26 Dec 2024 21:18:42 +0000 Subject: [PATCH] feat!(french data loader): started french gouv catalogue data loader class [2024-12-26] BREAKING CHANGE: started french gouv catalogue data loader class --- HerdingCats/__init__.py | 5 +- HerdingCats/data_loader/data_loader.py | 590 +++++++++++++++---------- HerdingCats/endpoints/api_endpoints.py | 1 + HerdingCats/explorer/cat_explore.py | 44 +- README.md | 31 +- 5 files changed, 421 insertions(+), 250 deletions(-) diff --git a/HerdingCats/__init__.py b/HerdingCats/__init__.py index 2815cbc..a4f119a 100644 --- a/HerdingCats/__init__.py +++ b/HerdingCats/__init__.py @@ -1,4 +1,4 @@ -from .data_loader.data_loader import CkanCatResourceLoader, OpenDataSoftResourceLoader +from .data_loader.data_loader import CkanCatResourceLoader, OpenDataSoftResourceLoader, FrenchGouvResourceLoader from .explorer.cat_explore import CkanCatExplorer, OpenDataSoftCatExplorer, FrenchGouvCatExplorer from .session.cat_session import CatSession from .errors.cats_errors import CatSessionError, CatExplorerError, OpenDataSoftExplorerError @@ -17,7 +17,8 @@ "OpenDataSoftResourceLoader", "OpenDataSoftExplorerError", "FrenchGouvCatExplorer", - "FrenchGouvCatalogue" + "FrenchGouvCatalogue", + "FrenchGouvResourceLoader" ] __version__ = "0.1.6" diff --git a/HerdingCats/data_loader/data_loader.py b/HerdingCats/data_loader/data_loader.py index 23094cd..6a1b048 100644 --- a/HerdingCats/data_loader/data_loader.py +++ b/HerdingCats/data_loader/data_loader.py @@ -20,7 +20,7 @@ # START TO WRANGLE / ANALYSE -# LOAD DATA RESOURCES INTO STORAGE +# LOAD CKAN DATA RESOURCES INTO STORAGE class CkanCatResourceLoader: """A class to load data resources into various formats and storage systems.""" @@ -296,272 +296,400 @@ def aws_s3_data_loader(self, resource_data: List, bucket_name: str, logger.error(f"AWS S3 upload error: {e}") raise - +# START TO WRANGLE / ANALYSE +# LOAD OPEN DATA SOFT DATA RESOURCES INTO STORAGE class OpenDataSoftResourceLoader: + """A class to load OpenDataSoft resources into various formats and storage systems.""" + + SUPPORTED_FORMATS = { + "spreadsheet": ["xls", "xlsx"], + "csv": ["csv"], + "parquet": ["parquet"], + "geopackage": ["gpkg", "geopackage"] + } + def __init__(self) -> None: - pass + self._validate_dependencies() - def polars_data_loader( - self, resource_data: Optional[List[Dict]], format_type: Literal["parquet"], api_key: Optional[str] = None - ) -> pl.DataFrame: - """ - Load data from a resource URL into a Polars DataFrame. 
- Args: - resource_data: List of dictionaries containing resource information - format_type: Expected format type (currently only supports 'parquet') - api_key: Optional API key for authentication with OpenDataSoft - Returns: - Polars DataFrame - Raises: - OpenDataSoftExplorerError: If resource data is missing or download fails - - # Example usage - import HerdingCats as hc - - def main(): - with hc.CatSession(hc.OpenDataSoftDataCatalogues.UK_POWER_NETWORKS) as session: - explore = hc.OpenDataSoftCatExplorer(session) - data_loader = hc.OpenDataSoftResourceLoader() - - data = explore.show_dataset_export_options_dict("ukpn-smart-meter-installation-volumes") - pl_df = data_loader.polars_data_loader(data, "parquet", "api_key") - print(pl_df.head(10)) - - if __name__ == "__main__": - main() + def _validate_dependencies(self): + """Validate that all required dependencies are available.""" + required_modules = { + 'pandas': pd, + 'polars': pl, + 'duckdb': duckdb, + 'boto3': boto3, + 'pyarrow': pa + } + missing = [name for name, module in required_modules.items() if module is None] + if missing: + raise ImportError(f"Missing required dependencies: {', '.join(missing)}") - """ + @staticmethod + def validate_inputs(func): + """Decorator to validate resource data containing download URLs and formats.""" + @wraps(func) + def wrapper(self, resource_data: Optional[List[Dict]], *args, **kwargs): + # Check if resource data exists and is non-empty if not resource_data: - raise OpenDataSoftExplorerError("No resource data provided") + logger.error("No resource data provided") + raise ValueError("Resource data must be provided") + + if not isinstance(resource_data, list): + logger.error("Resource data must be a list") + raise ValueError("Resource data must be a list of dictionaries") + return func(self, resource_data, *args, **kwargs) + return wrapper + + def _validate_resource_data( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: str + ) -> str: + """Validate resource data and extract download URL.""" + if not resource_data: + raise OpenDataSoftExplorerError("No resource data provided") - headers = {'Accept': 'application/parquet'} + # Get all supported formats + all_formats = [fmt for formats in self.SUPPORTED_FORMATS.values() for fmt in formats] + + # If the provided format_type is a category, get its format + valid_formats = (self.SUPPORTED_FORMATS.get(format_type, []) + if format_type in self.SUPPORTED_FORMATS + else [format_type]) + + # Validate format type + if format_type not in self.SUPPORTED_FORMATS and format_type not in all_formats: + raise OpenDataSoftExplorerError( + f"Unsupported format: {format_type}. " + f"Supported formats: csv, parquet, xls, xlsx, geopackage" + ) + + # Find matching resource + url = next( + (r.get('download_url') for r in resource_data + if r.get('format', '').lower() in valid_formats), + None + ) + + # If format provided does not have a url provide the formats that do + if not url: + available_formats = [r['format'] for r in resource_data] + raise OpenDataSoftExplorerError( + f"No resource found with format: {format_type}. 
" + f"Available formats: {', '.join(available_formats)}" + ) + + return url + + def _fetch_data(self, url: str, api_key: Optional[str] = None) -> BytesIO: + """Fetch data from URL and return as BytesIO object.""" + try: + # Add API key to URL if provided if api_key: - headers['Authorization'] = f'apikey {api_key}' - - for resource in resource_data: - if resource.get('format', '').lower() == 'parquet': - url = resource.get('download_url') - if not url: - continue - try: - response = requests.get(url, headers=headers) - response.raise_for_status() - binary_data = BytesIO(response.content) - df = pl.read_parquet(binary_data) - - if df.height == 0 and not api_key: - raise OpenDataSoftExplorerError( - "Received empty DataFrame. This likely means an API key is required for this dataset. " - "Please provide an API key and try again. You can usually do this by creating an account with the datastore you are tyring to access" - ) - return df - - except (requests.RequestException, Exception) as e: - raise OpenDataSoftExplorerError("Failed to download resource", e) - - raise OpenDataSoftExplorerError("No parquet format resource found") + url = f"{url}?apikey={api_key}" + + response = requests.get(url) + response.raise_for_status() + return BytesIO(response.content) + except requests.RequestException as e: + raise OpenDataSoftExplorerError(f"Failed to download resource: {str(e)}", e) - def pandas_data_loader( - self, resource_data: Optional[List[Dict]], format_type: Literal["parquet"], api_key: Optional[str] = None - ) -> pd.DataFrame: - """ - Load data from a resource URL into a Polars DataFrame. - Args: - resource_data: List of dictionaries containing resource information - format_type: Expected format type (currently only supports 'parquet') - api_key: Optional API key for authentication with OpenDataSoft - Returns: - Polars DataFrame - Raises: - OpenDataSoftExplorerError: If resource data is missing or download fails - - # Example usage - import HerdingCats as hc - - def main(): - with hc.CatSession(hc.OpenDataSoftDataCatalogues.UK_POWER_NETWORKS) as session: - explore = hc.OpenDataSoftCatExplorer(session) - data_loader = hc.OpenDataSoftResourceLoader() - - data = explore.show_dataset_export_options_dict("ukpn-smart-meter-installation-volumes") - pd_df = data_loader.pandas_data_loader(data, "parquet", "api_key") - print(pd_df.head(10)) - - if __name__ == "__main__": - main() + def _verify_data(self, df: Union[pd.DataFrame, pl.DataFrame], api_key: Optional[str]) -> None: + """Verify that the DataFrame is not empty when no API key is provided.""" + is_empty = df.empty if isinstance(df, pd.DataFrame) else df.height == 0 + if is_empty and not api_key: + raise OpenDataSoftExplorerError( + "Received empty DataFrame. This likely means an API key is required. " + "Please provide an API key and try again." 
+ ) - """ - if not resource_data: - raise OpenDataSoftExplorerError("No resource data provided") + def _load_dataframe( + self, + binary_data: BytesIO, + format_type: str, + loader_type: Literal["pandas", "polars"], + sheet_name: Optional[str] = None + ) -> Union[pd.DataFrame, pl.DataFrame]: + """Load binary data into specified DataFrame type.""" + try: + match (format_type, loader_type): + case ("parquet", "pandas"): + return pd.read_parquet(binary_data) + case ("parquet", "polars"): + return pl.read_parquet(binary_data) + case ("csv", "pandas"): + return pd.read_csv(binary_data) + case ("csv", "polars"): + return pl.read_csv(binary_data) + case (("xls" | "xlsx" | "spreadsheet"), "pandas"): + return pd.read_excel(binary_data, sheet_name=sheet_name) if sheet_name else pd.read_excel(binary_data) + case (("xls" | "xlsx" | "spreadsheet"), "polars"): + return pl.read_excel(binary_data, sheet_name=sheet_name) if sheet_name else pl.read_excel(binary_data) + case (("geopackage" | "gpkg"), _): + raise ValueError("Geopackage format requires using geopandas or a specialized GIS library") + case _: + raise ValueError(f"Unsupported format {format_type} or loader type {loader_type}") + except Exception as e: + raise OpenDataSoftExplorerError(f"Failed to load {loader_type} DataFrame: {str(e)}", e) - headers = {'Accept': 'application/parquet'} - if api_key: - headers['Authorization'] = f'apikey {api_key}' - - for resource in resource_data: - if resource.get('format', '').lower() == 'parquet': - url = resource.get('download_url') - if not url: - continue - try: - response = requests.get(url, headers=headers) - response.raise_for_status() - binary_data = BytesIO(response.content) - df = pd.read_parquet(binary_data) - - if df.size == 0 and not api_key: - raise OpenDataSoftExplorerError( - "Received empty DataFrame. This likely means an API key is required for this dataset. " - "Please provide an API key and try again. You can usually do this by creating an account with the datastore you are tyring to access" - ) - return df - - except (requests.RequestException, Exception) as e: - raise OpenDataSoftExplorerError("Failed to download resource", e) - - raise OpenDataSoftExplorerError("No parquet format resource found") + @overload + def _load_to_frame( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: str, + loader_type: Literal["pandas"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None + ) -> PandasDataFrame: ... + + @overload + def _load_to_frame( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: str, + loader_type: Literal["polars"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None + ) -> PolarsDataFrame: ... 
+ + def _load_to_frame( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: str, + loader_type: Literal["pandas", "polars"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None + ) -> Union[pd.DataFrame, pl.DataFrame]: + """Common method for loading data into pandas or polars DataFrame.""" + url = self._validate_resource_data(resource_data, format_type) + binary_data = self._fetch_data(url, api_key) + df = self._load_dataframe(binary_data, format_type, loader_type, sheet_name) + self._verify_data(df, api_key) + return df + + @validate_inputs + def polars_data_loader( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: Literal["csv", "parquet", "spreadsheet", "xls", "xlsx"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None + ) -> pl.DataFrame: + """Load data from a resource URL into a Polars DataFrame.""" + return self._load_to_frame(resource_data, format_type, "polars", api_key, sheet_name) + + @validate_inputs + def pandas_data_loader( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: Literal["csv", "parquet", "spreadsheet", "xls", "xlsx"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None + ) -> pd.DataFrame: + """Load data from a resource URL into a Pandas DataFrame.""" + return self._load_to_frame(resource_data, format_type, "pandas", api_key, sheet_name) + @validate_inputs def duckdb_data_loader( - self, - resource_data: Optional[List[Dict]], - format_type: Literal["parquet", "xlsx", "csv"], - api_key: Optional[str] = None + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: Literal["csv", "parquet", "xls", "xlsx"], + api_key: Optional[str] = None, + sheet_name: Optional[str] = None ) -> duckdb.DuckDBPyConnection: - """ - Load data from a resource URL directly into DuckDB. + """Load data from a resource URL directly into DuckDB.""" + url = self._validate_resource_data(resource_data, format_type) - Args: - resource_data: List of dictionaries containing resource information - format_type: Expected format type ('parquet', 'xlsx', or 'csv') - api_key: Optional API key for authentication with OpenDataSoft - - Returns: - DuckDB connection with loaded data + if api_key: + url = f"{url}?apikey={api_key}" - Raises: - OpenDataSoftExplorerError: If resource data is missing or download fails - """ - if not resource_data: - raise OpenDataSoftExplorerError("No resource data provided") - - # Create in-memory DuckDB connection con = duckdb.connect(':memory:') con.execute("SET force_download=true") + con.execute("INSTALL spatial") + con.execute("LOAD spatial") - for resource in resource_data: - match resource.get('format', '').lower(): - case fmt if fmt == format_type: - url = resource.get('download_url') - if not url: - continue - - try: - # Append API key to URL if provided - if api_key: - url = f"{url}?apikey={api_key}" - - # Load data based on format type - match format_type: - case "parquet": - con.execute( - "CREATE TABLE data AS SELECT * FROM read_parquet(?)", - [url] - ) - case "xlsx": - con.execute( - "CREATE TABLE data AS SELECT * FROM read_xlsx(?)", - [url] - ) - case "csv": - con.execute( - "CREATE TABLE data AS SELECT * FROM read_csv_auto(?)", - [url] - ) - - # Verify data was loaded - sample_data = con.execute("SELECT * FROM data LIMIT 10").fetchall() - if not sample_data and not api_key: - raise OpenDataSoftExplorerError( - "Received empty dataset. This likely means an API key is required. " - "Please provide an API key and try again. 
You can usually do this by " - "creating an account with the datastore you are trying to access" - ) - - return con - - except duckdb.Error as e: - raise OpenDataSoftExplorerError(f"Failed to load {format_type} resource into DuckDB", e) - + try: + # Use match statement for format handling + match format_type: + case "parquet": + con.execute("CREATE TABLE data AS SELECT * FROM read_parquet(?)", [url]) + case "csv": + con.execute("CREATE TABLE data AS SELECT * FROM read_csv(?)", [url]) + case "xls" | "xlsx" | "spreadsheet": + if sheet_name: + con.execute("CREATE TABLE data AS SELECT * FROM st_read(?, sheet_name=?)", [url, sheet_name]) + else: + con.execute("CREATE TABLE data AS SELECT * FROM st_read(?)", [url]) case _: - continue - - raise OpenDataSoftExplorerError(f"No {format_type} format resource found") + raise ValueError(f"Unsupported format type: {format_type}") + + # Verify data was loaded + sample_data = con.execute("SELECT * FROM data LIMIT 10").fetchall() + if not sample_data and not api_key: + raise OpenDataSoftExplorerError( + "Received empty dataset. This likely means an API key is required." + ) + + return con + + except duckdb.Error as e: + raise OpenDataSoftExplorerError(f"Failed to load {format_type} resource into DuckDB", e) + + + def _verify_s3_bucket(self, s3_client, bucket_name: str) -> None: + """Verify S3 bucket exists.""" + try: + s3_client.head_bucket(Bucket=bucket_name) + logger.success("Bucket Found") + except ClientError as e: + error_code = int(e.response["Error"]["Code"]) + if error_code == 404: + raise ValueError(f"Bucket '{bucket_name}' does not exist") + raise + + def _convert_to_parquet(self, binary_data: BytesIO, format_type: str) -> BytesIO: + """Convert input data to parquet format.""" + try: + match format_type: + case "csv": + df = pd.read_csv(binary_data) + case "xls" | "xlsx": + df = pd.read_excel(binary_data) + case _: + raise ValueError(f"Unsupported format type for Parquet conversion: {format_type}") + if df.empty: + raise ValueError("No data was loaded from the source file") + + # Convert to parquet + parquet_buffer = BytesIO() + table = pa.Table.from_pandas(df) + pq.write_table(table, parquet_buffer) + parquet_buffer.seek(0) + return parquet_buffer + except Exception as e: + raise OpenDataSoftExplorerError(f"Failed to convert to parquet: {str(e)}", e) + + @validate_inputs def aws_s3_data_loader( self, - resource_data: Optional[List[Dict]], + resource_data: List[Dict[str, str]], bucket_name: str, custom_name: str, - api_key: Optional[str] = None, - ) -> None: + mode: Literal["raw", "parquet"], + format_type: Literal["csv", "parquet", "xls", "xlsx"], + api_key: Optional[str] = None + ) -> str: """ - Load resource data into remote S3 storage as a parquet file. + Load resource data into remote S3 storage. 
Args: - resource_data: List of dictionaries containing resource information + resource_data: List of dictionaries containing format and download_url bucket_name: S3 bucket name custom_name: Custom prefix for the filename + mode: 'raw' to keep original format, 'parquet' to convert to parquet + format_type: Format to download ('csv', 'parquet', 'xls', 'xlsx') api_key: Optional API key for authentication - """ - if not resource_data: - raise OpenDataSoftExplorerError("No resource data provided") - if not bucket_name: - raise ValueError("No bucket name provided") - - # Create an S3 client + Returns: + str: Name of the uploaded file + """ + + # Validate inputs + if not all(isinstance(x, str) and x.strip() for x in [bucket_name, custom_name]): + raise ValueError("Bucket name and custom name must be non-empty strings") + + # Get URL for specified format + url = self._validate_resource_data(resource_data, format_type) + + # Fetch data + binary_data = self._fetch_data(url, api_key) + + # Setup S3 s3_client = boto3.client("s3") - logger.success("S3 Client Created") + self._verify_s3_bucket(s3_client, bucket_name) - # Check if the bucket exists try: - s3_client.head_bucket(Bucket=bucket_name) - logger.success("Bucket Found") - except ClientError as e: - error_code = int(e.response["Error"]["Code"]) - if error_code == 404: - logger.error(f"Bucket '{bucket_name}' does not exist.") - else: - logger.error(f"Error checking bucket '{bucket_name}': {e}") - return - - headers = {'Accept': 'application/parquet'} - if api_key: - headers['Authorization'] = f'apikey {api_key}' - - for resource in resource_data: - if resource.get('format', '').lower() == 'parquet': - url = resource.get('download_url') - if not url: - continue + match mode: + case "raw": + filename = f"{custom_name}-{uuid.uuid4()}.{format_type}" + s3_client.upload_fileobj(binary_data, bucket_name, filename) + case "parquet": + parquet_buffer = self._convert_to_parquet(binary_data, format_type) + filename = f"{custom_name}-{uuid.uuid4()}.parquet" + s3_client.upload_fileobj(parquet_buffer, bucket_name, filename) + + logger.success(f"File uploaded successfully to S3 as {filename}") + return filename + except Exception as e: + logger.error(f"AWS S3 upload error: {e}") + raise - try: - response = requests.get(url, headers=headers) - response.raise_for_status() - binary_data = BytesIO(response.content) +# START TO WRANGLE / ANALYSE +# LOAD FRENCH GOUV DATA RESOURCES INTO STORAGE +class FrenchGouvResourceLoader: + """A class to load French Gouv data resources into various formats and storage systems.""" - # Generate a unique filename - filename = f"{custom_name}-{uuid.uuid4()}.parquet" + SUPPORTED_FORMATS = { + "spreadsheet": ["xls", "xlsx"], + "csv": ["csv"], + "parquet": ["parquet"], + "geopackage": ["gpkg", "geopackage"] + } - # Upload the parquet file directly - s3_client.upload_fileobj(binary_data, bucket_name, filename) - logger.success("Parquet file uploaded successfully to S3") - return + def __init__(self) -> None: + self._validate_dependencies() - except requests.RequestException as e: - raise OpenDataSoftExplorerError("Failed to download resource", e) - except ClientError as e: - logger.error(f"Error: {e}") - return + def _validate_dependencies(self): + """Validate that all required dependencies are available.""" + required_modules = { + 'pandas': pd, + 'polars': pl, + 'duckdb': duckdb, + 'boto3': boto3, + 'pyarrow': pa + } + missing = [name for name, module in required_modules.items() if module is None] + if missing: + raise 
ImportError(f"Missing required dependencies: {', '.join(missing)}") + + def validate_resource_data( + self, + resource_data: Optional[List[Dict[str, str]]], + format_type: str + ) -> str: + """Validate resource data and extract download URL.""" + if not resource_data: + raise OpenDataSoftExplorerError("No resource data provided") - raise OpenDataSoftExplorerError("No parquet format resource found") + # Get all supported formats + all_formats = [fmt for formats in self.SUPPORTED_FORMATS.values() for fmt in formats] + + # If the provided format_type is a category, get its format + valid_formats = (self.SUPPORTED_FORMATS.get(format_type, []) + if format_type in self.SUPPORTED_FORMATS + else [format_type]) + + # Validate format type + if format_type not in self.SUPPORTED_FORMATS and format_type not in all_formats: + raise OpenDataSoftExplorerError( + f"Unsupported format: {format_type}. " + f"Supported formats: csv, parquet, xls, xlsx, geopackage" + ) + + # Find matching resource + url = next( + (r.get('resource_latest') for r in resource_data + if r.get('resource_format', '').lower() in valid_formats), + None + ) + + # If format provided does not have a url provide the formats that do + if not url: + available_formats = [r['resource_format'] for r in resource_data] + raise OpenDataSoftExplorerError( + f"No resource found with format: {format_type}. " + f"Available formats: {', '.join(available_formats)}" + ) + + return url \ No newline at end of file diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py index ebbcd69..6364587 100644 --- a/HerdingCats/endpoints/api_endpoints.py +++ b/HerdingCats/endpoints/api_endpoints.py @@ -38,6 +38,7 @@ class OpenDataSoftDataCatalogues(Enum): ELIA_BELGIAN_ENERGY = "https://opendata.elia.be" EDF_ENERGY = "https://opendata.edf.fr" CADENT_GAS = "https://cadentgas.opendatasoft.com" + GRD_FRANCE = "https://opendata.agenceore.fr" # Add more catalogues as needed... class OpenDataSoftApiPaths: diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py index 125c37c..ee3227a 100644 --- a/HerdingCats/explorer/cat_explore.py +++ b/HerdingCats/explorer/cat_explore.py @@ -1321,7 +1321,7 @@ def get_dataset_meta_dataframe(self, identifier: str, df_type: Literal["pandas", logger.error(f"Error fetching dataset {identifier}: {str(e)}") return pd.DataFrame() if df_type == "pandas" else pl.DataFrame() - def get_datasets_by_identifiers(self, identifiers: list) -> dict: + def get_multiple_datasets_meta(self, identifiers: list) -> dict: """ Fetches multiple datasets using a list of IDs or slugs. @@ -1352,7 +1352,7 @@ def get_datasets_by_identifiers(self, identifiers: list) -> dict: # ---------------------------- # Show available resources for a particular dataset # ---------------------------- - def get_dataset_resource(self, dataset_id: str, resource_id: str) -> dict: + def get_dataset_resource_export(self, dataset_id: str, resource_id: str) -> dict: """ Fetches metadata for a specific resource within a dataset. 
@@ -1417,6 +1417,46 @@ def get_dataset_resource_dataframe(self, dataset_id: str, resource_id: str, df_t
         except Exception as e:
             logger.error(f"Error fetching resource {resource_id}: {str(e)}")
             return pd.DataFrame() if df_type == "pandas" else pl.DataFrame()
+
+    def get_dataset_resource_meta(self, data: dict) -> List[Dict[str, Any]] | None:
+        """Extract per-resource metadata from a dataset metadata dictionary."""
+        if len(data) == 0:
+            raise ValueError("Data can't be empty")
+
+        try:
+            result = self._extract_resource_data(data)
+            return result
+        except Exception as e:
+            logger.error(f"Error fetching resource: {str(e)}")
+
+    @staticmethod
+    def _extract_resource_data(data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Extracts specific fields for a specific package and creates a list of dictionaries,
+        one for each resource, containing the specified fields.
+
+        Args:
+            data (Dict[str, Any]): The input package data dictionary.
+
+        Returns:
+            List[Dict[str, Any]]: A list of dictionaries, each containing the specified fields for a resource.
+        """
+
+        base_fields = {
+            "dataset_id": data.get("id"),
+            "slug": data.get("slug"),
+        }
+
+        resource_fields = ["created_at", "id", "format", "url", "title", "latest", "last_modified", "frequency", "extras"]
+
+        result = []
+        for resource in data.get("resources", []):
+            resource_data = base_fields.copy()
+            for field in resource_fields:
+                resource_data[f"resource_{field}"] = resource.get(field)
+            result.append(resource_data)
+
+        return result
 
     # ----------------------------
     # Show all organisation available
diff --git a/README.md b/README.md
index e41381e..b1acf07 100644
--- a/README.md
+++ b/README.md
@@ -39,20 +39,21 @@ I'll help format these tables in clean markdown:
 
 ## Supported Catalogues
 
-| Catalogue Name             | Website                          | Catalogue Backend |
-| -------------------------- | -------------------------------- | ----------------- |
-| London Datastore           | data.london.gov.uk               | CKAN              |
-| Subak Data Catalogue       | data.subak.org                   | CKAN              |
-| UK Gov Open Data           | data.gov.uk                      | CKAN              |
-| Humanitarian Data Exchange | data.humdata.org                 | CKAN              |
-| UK Power Networks          | ukpowernetworks.opendatasoft.com | Open Datasoft     |
-| Infrabel                   | opendata.infrabel.be             | Open Datasoft     |
-| Paris                      | opendata.paris.fr                | Open Datasoft     |
-| Toulouse                   | data.toulouse-metropole.fr       | Open Datasoft     |
-| Elia Belgian Energy        | opendata.elia.be                 | Open Datasoft     |
-| EDF Energy                 | opendata.edf.fr                  | Open Datasoft     |
-| Cadent Gas                 | cadentgas.opendatasoft.com       | Open Datasoft     |
-| French Gov Open Data       | data.gouv.fr                     | CKAN              |
+| Catalogue Name                                                             | Website                          | Catalogue Backend |
+| -------------------------------------------------------------------------- | -------------------------------- | ----------------- |
+| London Datastore                                                           | data.london.gov.uk               | CKAN              |
+| Subak Data Catalogue                                                       | data.subak.org                   | CKAN              |
+| UK Gov Open Data                                                           | data.gov.uk                      | CKAN              |
+| Humanitarian Data Exchange                                                 | data.humdata.org                 | CKAN              |
+| UK Power Networks                                                          | ukpowernetworks.opendatasoft.com | Open Datasoft     |
+| Infrabel                                                                   | opendata.infrabel.be             | Open Datasoft     |
+| Paris                                                                      | opendata.paris.fr                | Open Datasoft     |
+| Toulouse                                                                   | data.toulouse-metropole.fr       | Open Datasoft     |
+| Elia Belgian Energy                                                        | opendata.elia.be                 | Open Datasoft     |
+| EDF Energy                                                                 | opendata.edf.fr                  | Open Datasoft     |
+| Cadent Gas                                                                 | cadentgas.opendatasoft.com       | Open Datasoft     |
+| French Gov Open Data                                                       | data.gouv.fr                     | CKAN              |
+| Gestionnaire de Réseaux de Distribution (French equivalent of GDNs in UK) | opendata.agenceore.fr            | Open Datasoft     |
 
 ## In Development
 
@@ -63,7 +64,7 @@
 | Data Mill North | datamillnorth.org | 
TBC | Different implementation - may not work with all methods | | Canada Open Data | open.canada.ca | TBC | Different implementation needs investigation | -# Herding-Cats Quick Start!🏃‍♂️‍➡️ +## Herding-Cats Quick Start!🏃‍♂️‍➡️ ## Overview
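
This patch drops the inline usage examples from the OpenDataSoft loader docstrings, but the call pattern they illustrated still applies to the new signatures. A minimal sketch of the refactored loader, assuming the UK Power Networks catalogue and treating the dataset slug and API key as placeholders:

```python
import HerdingCats as hc

def main():
    # UK Power Networks is one of the registered OpenDataSoft catalogues
    with hc.CatSession(hc.OpenDataSoftDataCatalogues.UK_POWER_NETWORKS) as session:
        explore = hc.OpenDataSoftCatExplorer(session)
        loader = hc.OpenDataSoftResourceLoader()

        # List the export formats/URLs for a dataset, then load the parquet export
        data = explore.show_dataset_export_options_dict("ukpn-smart-meter-installation-volumes")
        pl_df = loader.polars_data_loader(data, "parquet", api_key="your-api-key")
        print(pl_df.head(10))

        # The same resource list can be loaded into an in-memory DuckDB table named "data"
        con = loader.duckdb_data_loader(data, "parquet", api_key="your-api-key")
        print(con.execute("SELECT * FROM data LIMIT 10").fetchall())

if __name__ == "__main__":
    main()
```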
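
The new FrenchGouvResourceLoader so far only validates dependencies and resolves a download URL from the flattened per-resource records produced by FrenchGouvCatExplorer.get_dataset_resource_meta. A minimal sketch of that hand-off, with placeholder identifiers and URL:

```python
import HerdingCats as hc

# Shape of the records returned by FrenchGouvCatExplorer.get_dataset_resource_meta
resource_data = [
    {
        "dataset_id": "example-dataset-id",    # placeholder values
        "slug": "example-dataset-slug",
        "resource_format": "csv",
        "resource_latest": "https://www.data.gouv.fr/fr/datasets/r/example-resource-id",
    }
]

loader = hc.FrenchGouvResourceLoader()

# validate_resource_data checks the requested format against SUPPORTED_FORMATS
# and returns the 'resource_latest' URL of the first matching resource
url = loader.validate_resource_data(resource_data, "csv")
print(url)
```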