Skip to content

Commit

Permalink
feat: added extra french gouv catalogue methods [2024-12-02]
Browse files Browse the repository at this point in the history
  • Loading branch information
CHRISCARLON committed Dec 2, 2024
1 parent ff0c1f5 commit ec508e3
Show file tree
Hide file tree
Showing 3 changed files with 192 additions and 0 deletions.
1 change: 1 addition & 0 deletions HerdingCats/endpoints/api_endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ class DcatApiPaths:
class FrenchGouvApiPaths:
    """Relative URL paths for the French Gouv catalogue API, all rooted at ``/api/1/``."""

    # Template with a single placeholder for the resource path under /api/1/.
    BASE_PATH = "/api/1/{}"

    # Dataset listing endpoint: "/api/1/datasets".
    SHOW_DATASETS = BASE_PATH.format("datasets")

    # Single-dataset endpoint: "/api/1/datasets/{}"; callers fill the
    # remaining placeholder with the dataset identifier via str.format.
    SHOW_DATASETS_BY_ID = SHOW_DATASETS + "/{}"


class FrenchGouvCatalogue(Enum):
Expand Down
164 changes: 164 additions & 0 deletions HerdingCats/explorer/cat_explore.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import polars as pl
import duckdb
import json
import time

from typing import Any, Dict, Optional, Union, Literal, List, Tuple
from loguru import logger
Expand Down Expand Up @@ -1232,3 +1233,166 @@ def check_health_check(self) -> None:

except requests.RequestException as e:
logger.error(f"Health Check Failed: Unable to connect to French Gouv - {str(e)}")

# ----------------------------
# Get datasets available
# ----------------------------
def get_all_datasets(self) -> dict:
    """
    Page through the French Gouv datasets endpoint and collect each
    dataset's acronym, keyed by dataset ID.

    Requests start at page 1 and continue until a response has no truthy
    ``next_page`` value. Pagination stops early, returning whatever has been
    collected so far, when a page answers with a non-200 status code or when
    any exception is raised while fetching or parsing a page.

    Returns:
        dict: Dictionary with dataset IDs as keys and acronyms as values.
            Missing, null or empty acronyms are stored as ''. Datasets
            without an 'id' are stored under the '' key.
    """
    datasets = {}
    page = 1
    base_url = self.cat_session.base_url + FrenchGouvApiPaths.SHOW_DATASETS

    while True:
        try:
            # Make request with pagination
            response = self.cat_session.session.get(base_url, params={'page': page})

            if response.status_code != 200:
                logger.error(f"Failed to fetch page {page} with status code {response.status_code}")
                break

            data = response.json()

            # Process datasets on current page
            for dataset in data['data']:
                # `or ''` normalises both a missing key and a null/empty value
                datasets[dataset.get('id', '')] = dataset.get('acronym') or ''

            # Check if we've reached the last page
            if not data.get('next_page'):
                break

            # Log progress every 10 pages. This runs BEFORE the counter is
            # advanced so the reported number is the count of pages that
            # have actually been processed.
            if page % 10 == 0:
                logger.info(f"Processed {page} pages ({len(datasets)} datasets)")

            page += 1

        except Exception as e:
            # Best-effort crawl: report the failure and return the partial result.
            logger.error(f"Error processing page {page}: {str(e)}")
            break

    logger.success(f"Finished processing {len(datasets)} datasets")
    return datasets

def get_datasets_by_id_dict(self, id: str) -> dict:
    """
    Return the dataset-ID -> acronym mapping for the whole French Gouv catalogue.

    This method used to contain a line-for-line copy of the pagination loop
    in ``get_all_datasets``; it now delegates to that method so the two
    cannot drift apart.

    Args:
        id (str): Accepted for backward compatibility only. The value is not
            used: the result is the same for every ``id``.
            NOTE(review): confirm whether filtering by ``id`` was intended.

    Returns:
        dict: Dictionary with dataset IDs as keys and acronyms as values,
            exactly as returned by ``get_all_datasets``.
    """
    return self.get_all_datasets()

def get_dataset_by_identifier(self, identifier: str) -> dict:
    """
    Retrieve one dataset from the French Gouv catalogue.

    The identifier is substituted into the single-dataset path and the
    resulting URL is requested through the catalogue session.

    Args:
        identifier (str): Dataset ID or slug to fetch

    Returns:
        dict: The decoded JSON body on a 200 response; an empty dict on a
            404, on any other status code, or when an exception is raised.

    Example identifier:
        ID: "674de63d05a9bbeddc66bdc1"
    """
    try:
        root = self.cat_session.base_url
        url = root + FrenchGouvApiPaths.SHOW_DATASETS_BY_ID.format(identifier)
        response = self.cat_session.session.get(url)

        # Failure statuses first; the success path falls through below.
        if response.status_code == 404:
            logger.warning(f"Dataset not found: {identifier}")
            return {}

        if response.status_code != 200:
            logger.error(f"Failed to fetch dataset {identifier} with status code {response.status_code}")
            return {}

        payload = response.json()
        logger.success(f"Successfully retrieved dataset: {identifier}")
        return payload

    except Exception as e:
        logger.error(f"Error fetching dataset {identifier}: {str(e)}")
        return {}

def get_datasets_by_identifiers(self, identifiers: list) -> dict:
    """
    Fetch several datasets one after another via ``get_dataset_by_identifier``.

    Args:
        identifiers (list): List of dataset IDs or slugs to fetch

    Returns:
        dict: Dictionary mapping identifiers to their dataset details.
            Identifiers whose lookup returned an empty result are left out;
            an identifier whose processing raised here is mapped to ``{}``.
    """
    fetched = {}

    for ident in identifiers:
        try:
            details = self.get_dataset_by_identifier(ident)
            if details:
                fetched[ident] = details

            # Short pause between requests to avoid overwhelming the API
            time.sleep(0.1)

        except Exception as e:
            logger.error(f"Error processing identifier {ident}: {str(e)}")
            fetched[ident] = {}

    logger.success(f"Finished fetching {len(fetched)} datasets")
    return fetched
27 changes: 27 additions & 0 deletions makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
DATE := $(shell date +%Y-%m-%d)
VENV_PATH := .venv

define COMMIT_TYPES
feat: A new feature
Expand All @@ -15,6 +16,32 @@ revert: Reverts a previous commit
endef
export COMMIT_TYPES

# Local development commands
.PHONY: dev ruff-watch dev-kill

# Open the development layout in tmux.
# Outside tmux ($TMUX empty): create the detached "herding-cats" session,
# add a small lower pane running `make ruff-watch`, activate the virtualenv
# in pane 0, then attach. Inside tmux: only add the ruff-watch pane to the
# current window. The virtualenv location comes from VENV_PATH.
dev:
	@if [ -z "$$TMUX" ]; then \
		tmux new-session -d -s herding-cats; \
		tmux send-keys 'cd $(shell pwd)' C-m; \
		tmux split-window -v -p 15; \
		tmux send-keys 'cd $(shell pwd) && source $(VENV_PATH)/bin/activate && make ruff-watch' C-m; \
		tmux select-pane -t 0; \
		tmux send-keys 'source $(VENV_PATH)/bin/activate' C-m; \
		tmux attach-session -t herding-cats; \
	else \
		tmux split-window -v -p 15 'source $(VENV_PATH)/bin/activate && make ruff-watch'; \
		tmux select-pane -t 0; \
	fi

# Run `ruff check` over the current directory with --watch so it keeps running.
ruff-watch:
	@echo "Starting Ruff in watch mode..."
	@ruff check --watch .

# Kill the "herding-cats" tmux session.
dev-kill:
	@echo "Killing HerdingCats dev session"
	tmux kill-session -t herding-cats

# Git commands
update: git-add git-commit git-push

git-add:
Expand Down

0 comments on commit ec508e3

Please sign in to comment.