From ec508e307c642c4099ded0c950174dfb0abead5b Mon Sep 17 00:00:00 2001
From: Chris Carlon
Date: Mon, 2 Dec 2024 23:17:24 +0000
Subject: [PATCH] feat: added extra french gouv catalogue methods [2024-12-02]

---
 HerdingCats/endpoints/api_endpoints.py |   1 +
 HerdingCats/explorer/cat_explore.py    | 164 +++++++++++++++++++++++++
 makefile                               |  27 ++++
 3 files changed, 192 insertions(+)

diff --git a/HerdingCats/endpoints/api_endpoints.py b/HerdingCats/endpoints/api_endpoints.py
index 5bbdb52..d2aed23 100644
--- a/HerdingCats/endpoints/api_endpoints.py
+++ b/HerdingCats/endpoints/api_endpoints.py
@@ -68,6 +68,7 @@ class DcatApiPaths:
 class FrenchGouvApiPaths:
     BASE_PATH = "/api/1/{}"
     SHOW_DATASETS = BASE_PATH.format("datasets")
+    SHOW_DATASETS_BY_ID = BASE_PATH.format("datasets/{}")
 
 
 class FrenchGouvCatalogue(Enum):
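For reference, the new SHOW_DATASETS_BY_ID constant is itself a format string, so the dataset ID or slug is substituted into the path at call time. A minimal sketch of how the path resolves, using the example ID quoted in the explorer docstrings below; it assumes the package is importable as HerdingCats, and the url_path variable name is illustrative only:

    from HerdingCats.endpoints.api_endpoints import FrenchGouvApiPaths

    # "/api/1/datasets/{}" -> "/api/1/datasets/674de63d05a9bbeddc66bdc1"
    url_path = FrenchGouvApiPaths.SHOW_DATASETS_BY_ID.format("674de63d05a9bbeddc66bdc1")
    print(url_path)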
diff --git a/HerdingCats/explorer/cat_explore.py b/HerdingCats/explorer/cat_explore.py
index 9ab5c80..631d6df 100644
--- a/HerdingCats/explorer/cat_explore.py
+++ b/HerdingCats/explorer/cat_explore.py
@@ -3,6 +3,7 @@
 import polars as pl
 import duckdb
 import json
+import time
 
 from typing import Any, Dict, Optional, Union, Literal, List, Tuple
 from loguru import logger
@@ -1232,3 +1233,166 @@ def check_health_check(self) -> None:
 
         except requests.RequestException as e:
             logger.error(f"Health Check Failed: Unable to connect to French Gouv - {str(e)}")
+
+    # ----------------------------
+    # Get datasets available
+    # ----------------------------
+    def get_all_datasets(self) -> dict:
+        """
+        Paginates through all datasets in the French Government data catalogue
+        and creates a dictionary of dataset IDs and acronyms.
+
+        Returns:
+            dict: Dictionary with dataset IDs as keys and acronyms as values
+        """
+        datasets = {}
+        page = 1
+        base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
+
+        while True:
+            try:
+                # Make request with pagination
+                params = {'page': page}
+                response = self.cat_session.session.get(base_url, params=params)
+
+                if response.status_code != 200:
+                    logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
+                    break
+
+                data = response.json()
+
+                # Process datasets on current page
+                for dataset in data['data']:
+                    dataset_id = dataset.get('id', '')
+                    # Handle null or empty acronyms by setting to empty string
+                    acronym = dataset.get('acronym') if dataset.get('acronym') else ''
+                    datasets[dataset_id] = acronym
+
+                # Check if we've reached the last page
+                if not data.get('next_page'):
+                    break
+
+                page += 1
+
+                # Optional: Log progress every 10 pages
+                if page % 10 == 0:
+                    logger.info(f"Processed {page} pages ({len(datasets)} datasets)")
+
+            except Exception as e:
+                logger.error(f"Error processing page {page}: {str(e)}")
+                break
+
+        logger.success(f"Finished processing {len(datasets)} datasets")
+        return datasets
+
+    def get_datasets_by_id_dict(self, id: str) -> dict:
+        """
+        Paginates through all datasets in the French Government data catalogue
+        and creates a dictionary of dataset IDs and acronyms.
+
+        Note: the id argument is accepted but not currently used to filter
+        the results; the full catalogue is returned.
+
+        Args:
+            id (str): Dataset ID (currently unused)
+
+        Returns:
+            dict: Dictionary with dataset IDs as keys and acronyms as values
+        """
+        datasets = {}
+        page = 1
+        base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS
+
+        while True:
+            try:
+                # Make request with pagination
+                params = {'page': page}
+                response = self.cat_session.session.get(base_url, params=params)
+
+                if response.status_code != 200:
+                    logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
+                    break
+
+                data = response.json()
+
+                # Process datasets on current page
+                for dataset in data['data']:
+                    dataset_id = dataset.get('id', '')
+                    # Handle null or empty acronyms by setting to empty string
+                    acronym = dataset.get('acronym') if dataset.get('acronym') else ''
+                    datasets[dataset_id] = acronym
+
+                # Check if we've reached the last page
+                if not data.get('next_page'):
+                    break
+
+                page += 1
+
+                # Optional: Log progress every 10 pages
+                if page % 10 == 0:
+                    logger.info(f"Processed {page} pages ({len(datasets)} datasets)")
+
+            except Exception as e:
+                logger.error(f"Error processing page {page}: {str(e)}")
+                break
+
+        logger.success(f"Finished processing {len(datasets)} datasets")
+        return datasets
+
+    def get_dataset_by_identifier(self, identifier: str) -> dict:
+        """
+        Fetches a specific dataset using either its ID or slug.
+
+        Args:
+            identifier (str): Dataset ID or slug to fetch
+
+        Returns:
+            dict: Dataset details or empty dict if not found
+
+        Example identifier:
+            ID: "674de63d05a9bbeddc66bdc1"
+        """
+        try:
+            # Construct URL for specific dataset
+            url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS_BY_ID.format(identifier)
+
+            # Make request
+            response = self.cat_session.session.get(url)
+
+            # Handle response
+            if response.status_code == 200:
+                data = response.json()
+                logger.success(f"Successfully retrieved dataset: {identifier}")
+                return data
+            elif response.status_code == 404:
+                logger.warning(f"Dataset not found: {identifier}")
+                return {}
+            else:
+                logger.error(f"Failed to fetch dataset {identifier} with status code {response.status_code}")
+                return {}
+
+        except Exception as e:
+            logger.error(f"Error fetching dataset {identifier}: {str(e)}")
+            return {}
+
+    def get_datasets_by_identifiers(self, identifiers: list) -> dict:
+        """
+        Fetches multiple datasets using a list of IDs or slugs.
+
+        Args:
+            identifiers (list): List of dataset IDs or slugs to fetch
+
+        Returns:
+            dict: Dictionary mapping identifiers to their dataset details
+        """
+        results = {}
+
+        for identifier in identifiers:
+            try:
+                dataset = self.get_dataset_by_identifier(identifier)
+                if dataset:
+                    results[identifier] = dataset
+
+                # Optional: Add a small delay to avoid overwhelming the API
+                time.sleep(0.1)
+
+            except Exception as e:
+                logger.error(f"Error processing identifier {identifier}: {str(e)}")
+                results[identifier] = {}
+
+        logger.success(f"Finished fetching {len(results)} datasets")
+        return results
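For orientation, a minimal usage sketch of the methods added above. How the explorer object and its cat_session are constructed is not shown in this diff, so the explorer variable below is assumed to be an already-initialised instance of the explorer class these methods belong to; only the method names, the example identifier, and the return shapes come from the patch:

    # Assumed: `explorer` is an existing instance of the French Gouv explorer
    # class that owns the methods in this patch (its construction is not shown here).

    # {dataset_id: acronym} for the whole catalogue; paginates until 'next_page' is falsy
    all_datasets = explorer.get_all_datasets()

    # Full metadata for one dataset, looked up by ID or slug ({} on a 404)
    dataset = explorer.get_dataset_by_identifier("674de63d05a9bbeddc66bdc1")

    # Several datasets at once, pausing 0.1 seconds between requests
    many = explorer.get_datasets_by_identifiers(["674de63d05a9bbeddc66bdc1"])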
diff --git a/makefile b/makefile
index 8de9161..381b01e 100644
--- a/makefile
+++ b/makefile
@@ -1,4 +1,5 @@
 DATE := $(shell date +%Y-%m-%d)
+VENV_PATH := .venv
 
 define COMMIT_TYPES
 feat: A new feature
@@ -15,6 +16,32 @@ revert: Reverts a previous commit
 endef
 export COMMIT_TYPES
 
+# Local development commands
+.PHONY: dev ruff-watch dev-kill
+
+dev:
+	@if [ -z "$$TMUX" ]; then \
+		tmux new-session -d -s herding-cats; \
+		tmux send-keys 'cd $(shell pwd)' C-m; \
+		tmux split-window -v -p 15; \
+		tmux send-keys 'cd $(shell pwd) && source .venv/bin/activate && make ruff-watch' C-m; \
+		tmux select-pane -t 0; \
+		tmux send-keys 'source .venv/bin/activate' C-m; \
+		tmux attach-session -t herding-cats; \
+	else \
+		tmux split-window -v -p 15 'source .venv/bin/activate && make ruff-watch'; \
+		tmux select-pane -t 0; \
+	fi
+
+ruff-watch:
+	@echo "Starting Ruff in watch mode..."
+	@ruff check --watch .
+
+dev-kill:
+	@echo "Killing HerdingCats dev session"
+	tmux kill-session -t herding-cats
+
+# Git commands
 update: git-add git-commit git-push
 
 git-add: