Skip to content

Commit

Permalink
Updates 2024-10-04 - Documentation updates, improving core explorer m…
Browse files Browse the repository at this point in the history
…ethods
  • Loading branch information
CHRISCARLON committed Oct 4, 2024
1 parent 30711a0 commit 506c9b7
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 145 deletions.
6 changes: 3 additions & 3 deletions HerdingCats/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
from .explorer.cat_explore import CkanCatExplorer, OpenDataSoftCatExplorer
from .session.cat_session import CatSession
from .errors.cats_errors import CatSessionError, CatExplorerError
from .endpoints.api_endpoints import CkanApiPaths, OpenDataSoftDataCatalogues
from .endpoints.api_endpoints import CkanDataCatalogues, OpenDataSoftDataCatalogues

__all__ = [
"CkanCatResourceLoader",
"CkanCatExplorer",
"CatSession",
"CatSessionError",
"CatExplorerError",
"CkanApiPaths",
"CkanDataCatalogues",
"OpenDataSoftDataCatalogues",
"OpenDataSoftCatExplorer",
]

__version__ = "0.1.3"
__version__ = "0.1.4"
9 changes: 3 additions & 6 deletions HerdingCats/endpoints/api_endpoints.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from enum import Enum


# CKAN
class CkanApiPathsDocs:
PACKAGE_LIST = "https://docs.ckan.org/en/2.11/api/index.html#ckan.logic.action.get.package_list"
Expand All @@ -10,7 +9,6 @@ class CkanApiPathsDocs:

class CkanApiPaths:
BASE_PATH = "/api/3/action/{}"
SITE_READ = BASE_PATH.format("site_read")
PACKAGE_LIST = BASE_PATH.format("package_list")
PACKAGE_SEARCH = BASE_PATH.format("package_search")
PACKAGE_INFO = BASE_PATH.format("package_show")
Expand All @@ -26,20 +24,19 @@ class CkanDataCatalogues(Enum):
SUBAK = "https://data.subak.org"
HUMANITARIAN = "https://data.humdata.org"
AFRICA = "https://open.africa"
CANADA_GOV = "https://search.open.canada.ca/opendata"
NORTHERN_DATA_MILL = "https://datamillnorth.org"
# TODO: re-enable once investigated - CANADA_GOV = "https://search.open.canada.ca/opendata"
# TODO: re-enable once investigated - NORTHERN_DATA_MILL = "https://datamillnorth.org"
# Add more catalogues as needed...


# OPEN DATASOFT
# Open Datasoft catalogues known to the library; each member's value is the
# catalogue's website URL.
# NOTE(review): presumably these members are passed to CatSession in the same
# way as the CKAN catalogue enum - confirm against CatSession.
class OpenDataSoftDataCatalogues(Enum):
UK_POWER_NETWORKS = "https://ukpowernetworks.opendatasoft.com"
INFRABEL = "https://opendata.infrabel.be"
PARIS = "https://opendata.paris.fr"
TOULOUSE = "https://data.toulouse-metropole.fr"
# Add more catalogues as needed...


# OPEN DATASOFT
class OpenDataSoftApiPaths:
# Normal base paths...
BASE_PATH = "/api/v2/catalog/{}"
Expand Down
50 changes: 27 additions & 23 deletions HerdingCats/explorer/cat_explore.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from numpy._core.multiarray import empty
import requests
import pandas as pd
import polars as pl
Expand Down Expand Up @@ -30,41 +31,41 @@ def __init__(self, cat_session: CatSession):
# Example usage...
if __name__ == "__main__":
with CatSession("data.london.gov.uk") as session:
explore = CatExplorer(session)
explore = CkanCatExplorer(session)
"""
self.cat_session = cat_session

# ----------------------------
# Check CKAN backend health
# Check CKAN site health
# ----------------------------
def check_site_health(self) -> None:
    """
    Make sure the Ckan endpoints are healthy and reachable.

    This calls the Ckan package_list endpoint and logs the outcome.
    Nothing is returned and connection problems are logged, not raised.

    Logged outcomes:
        success: the site answered with status 200 and a non-empty payload
        warning: the site answered with status 200 but the payload was empty
        error:   the request failed, the status code was not 200, the body
                 was not valid JSON, or the response reported "success": false

    # Example usage...
    if __name__ == "__main__":
        with CatSession("data.london.gov.uk") as session:
            explore = CkanCatExplorer(session)
            explore.check_site_health()
    """
    url = self.cat_session.base_url + CkanApiPaths.PACKAGE_LIST

    # Keep the try body to the network call only so unrelated errors are not masked.
    try:
        response = self.cat_session.session.get(url)
    except requests.RequestException as e:
        logger.error(f"Health Check Failed: Unable to connect to CKAN - {str(e)}")
        return

    if response.status_code != 200:
        logger.error(f"Health Check Failed: CKAN responded with status code {response.status_code}")
        return

    # JSON decoding errors are ValueError subclasses in every requests version,
    # whereas only newer versions also derive them from RequestException.
    try:
        data = response.json()
    except ValueError as e:
        logger.error(f"Health Check Failed: CKAN returned a response that is not valid JSON - {str(e)}")
        return

    # A CKAN action response is an envelope such as {"success": ..., "result": ...}.
    # A non-empty envelope is always truthy, so inspect its fields instead of the
    # envelope itself. Responses without these fields fall back to the old check.
    if isinstance(data, dict):
        if data.get("success") is False:
            logger.error(
                "Health Check Failed: Something went wrong and CKAN is currently not available"
            )
            return
        payload = data.get("result", data)
    else:
        payload = data

    if payload:
        logger.success("Health Check Passed: CKAN is running and available")
    else:
        logger.warning("Health Check Warning: CKAN responded with an empty dataset")

# ----------------------------
# Basic Available package lists + metadata
Expand Down Expand Up @@ -99,13 +100,13 @@ def get_package_count(self) -> int:

def package_list_dictionary(self) -> dict:
"""
Explore all packages that are available to query.
Explore all packages that are available to query as a dictionary.
Returns:
Dictionary of all available packages to use for further exploration.
It follows a {"package_name": "package_name"} structure so that you can use the package names for
additional methods
additional methods.
{'--lfb-financial-and-performance-reporting-2021-22': '--lfb-financial-and-performance-reporting-2021-22',
'-ghg-emissions-per-capita-from-food-and-non-alcoholic-drinks-': '-ghg-emissions-per-capita-from-food-and-non-alcoholic-drinks-',
Expand All @@ -120,8 +121,8 @@ def package_list_dictionary(self) -> dict:
# Example usage...
if __name__ == "__main__":
with CatSession("data.london.gov.uk") as session:
explore = CatExplorer(session)
all_packages = explore.package_list_json()
explore = CkanCatExplorer(session)
all_packages = explore.package_list_dictionary()
pprint(all_packages)
"""

Expand All @@ -148,6 +149,9 @@ def package_list_dataframe(
pandas
polars
Returns:
Dataframe with all dataset names
Example output:
shape: (68_995, 1)
┌─────────────────────
Expand All @@ -170,7 +174,7 @@ def package_list_dataframe(
# Example usage...
if __name__ == "__main__":
with CkanCatSession("uk gov") as session:
with CatSession("uk gov") as session:
explorer = CkanCatExplorer(session)
results = explorer.package_list_dataframe('polars')
print(results)
Expand Down
114 changes: 15 additions & 99 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,110 +57,26 @@ This will improve and speed up how people:

## Current Default Open Data Catalogues

**Herding-CATs supports the following catalogues by default**
Herding-CATs supports the following catalogues by default:

### Default

**Default**
| Catalogue Name | Website | Catalogue Endpoint | Comments |
|----------------|---------|-------------------|----------|
| London Datastore | https://data.london.gov.uk | CKAN | Works with all methods |
| Subak Data Catalogue | https://data.subak.org | CKAN | TBC |
| Gov Open Data | https://www.data.gov.uk | CKAN | TBC |
| Subak Data Catalogue | https://data.subak.org | CKAN | Works with all methods |
| Gov Open Data | https://www.data.gov.uk | CKAN | Works with all methods |
| Humanitarian Data Exchange | https://data.humdata.org | CKAN | Works with most methods |
| Data Mill North | https://datamillnorth.org | CKAN | Seems to have a slightly different implementation - may not work with all methods |
| UK Power Networks | https://ukpowernetworks.opendatasoft.com | Open Datasoft | Works with all methods |
| Infrabel | https://opendata.infrabel.be | Open Datasoft | Works with all methods |
| Paris | https://opendata.paris.fr | Open Datasoft | Works with all methods |
| Toulouse | https://data.toulouse-metropole.fr | Open Datasoft | Works with all methods but Endpoint deviates from standard implementation |

**TBC**
| Catalogue Name | Website | Catalogue API Endpoint Definition |
|----------------|---------|-------------------|
| Bristol Open Data | https://opendata.bristol.gov.uk | TBC |
| Icebreaker One | https://ib1.org | TBC |

## Basic usage examples:

```python
# Example usage 0: List all available data packages in the catalogue
if __name__ == "__main__":
with CkanCatSession("data.london.gov.uk") as session:
explore = CkanCatExplorer(session)
package_list = explore.package_list()
pprint(package_list)
```

```python
# Example usage 1: Basic Search: Look for packages with a basic search term
if __name__ == "__main__":
with CkanCatSession("data.london.gov.uk") as session:
explore = CkanCatExplorer(session)
census_package = explore.package_search_json(search_query="census")
pprint(census_package)
```

```python
# Example usage 2: List packages and show package info
if __name__ == "__main__":
with CkanCatSession("data.london.gov.uk") as session:
explore = ckanCatExplorer(session)
packlage_list = explore.package_list_json()
boundary_info = explore.package_show_info_json('2011-boundary-files')
pprint(show_info)
```

```python
# Example usage 3: Condensed package info view with resource info - either packed or unpacked
if __name__ == "__main__":
with CatSession("data.london.gov.uk") as session:
explorer = CatExplorer(session)

condensed_results = explorer.package_search_condense_dataframe_packed("police", 'polars')
print(condensed_results)

condensed_results = explorer.package_search_condense_dataframe_unpacked("police", 'polars')
print(condensed_results)
```

```python
# Example usage 4: Find the data you want, and then load it into a polars df for further processing
if __name__ == "__main__":
with CkanCatSession("data.london.gov.uk") as session:
explore = CkanCatExplorer(session)
all_packages = explore.package_list_dictionary()
data = all_packages.get("violence-reduction-unit")
info = explore.package_show_info_json(data)
dl_link = explore.extract_resource_url(info, "VRU Q1 2023-24 Dataset")

analyser = CkanCatAnalyser()
df = analyser.polars_data_loader(dl_link)
print(df)
# Use it like a normal Polars DF from here

# This works for different data catalogues as well
if __name__ == "__main__":
with CkanCatSession("HUMANITARIAN") as session:
explore = CkanCatExplorer(session)
all_packages = explore.package_list_dictionary()
data = all_packages.get("cameroon-humanitarian-needs")
info = explore.package_show_info_json(data)
dl_link = explore.extract_resource_url(info, "cmr_hpc_needs_2024")

analyser = CkanCatAnalyser()
df = analyser.polars_data_loader(dl_link)
print(df)
```

```python
# Example usage 5: Find the data you want, and then load it into a local duckdb for further processing
if __name__ == "__main__":
with CkanCatSession("humanitarian") as session:
explore = CkanCatExplorer(session)
all_packages = explore.package_list_dictionary()
data = all_packages.get("cameroon-humanitarian-needs")
info = explore.package_show_info_json(data)
dl_link = explore.extract_resource_url(info, "cmr_hpc_needs_2024")

analyser = CkanCatAnalyser()
df = analyser.duckdb_data_loader_persist(dl_link, "test", "test_table")
print(df)
```
| Toulouse | https://data.toulouse-metropole.fr | Open Datasoft | Works with all methods |

### TBC

| Catalogue Name | Website | Catalogue API Endpoint Definition | Comments |
|----------------|---------|-----------------------------------|----------|
| Bristol Open Data | https://opendata.bristol.gov.uk | TBC | Need to figure out how to call the catalogue backend |
| Icebreaker One | https://ib1.org | CKAN | Needs further investigation as authentication with an API key may be required |
| Data Mill North | https://datamillnorth.org | CKAN | Seems to have a slightly different implementation - may not work with all methods |
| Canada Open Data | https://open.canada.ca | CKAN | Needs further investigation due to different implementation |
70 changes: 70 additions & 0 deletions documentation/overview.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# CKAN and OpenDataSoft Explorer Documentation

## Overview

This module provides classes for exploring CKAN and OpenDataSoft data catalogues. It includes two main classes:

1. `CkanCatExplorer`: For exploring CKAN-based data catalogues
2. `OpenDataSoftCatExplorer`: For exploring OpenDataSoft-based data catalogues

Both classes are designed to work with a `CatSession` object, which handles the connection to the data catalogue.

## Usage

### CkanCatExplorer

#### Initialization

```python
import HerdingCats as hc

def main():
with hc.CatSession(hc.CkanDataCatalogues.LONDON_DATA_STORE) as session:
explore = hc.CkanCatExplorer(session)

if __name__ == "__main__":
main()

```

#### Methods

1. `check_site_health()`: Checks the health of the CKAN site.

2. `get_package_count()`: Returns the total number of packages in a catalogue.

3. `package_list_dictionary()`: Returns a dictionary of all available packages.

4. `package_list_dataframe(df_type: Literal["pandas", "polars"])`: Returns a dataframe of all available packages.

5. `package_list_dictionary_extra()`: Returns a dictionary with extra package information.

6. `catalogue_freshness()`: Provides a view of how many resources have been updated in the last 6 months.

7. `package_show_info_json(package_name: Union[str, dict, Any])`: Returns package metadata including resource information.

8. `package_search_json(search_query: str, num_rows: int)`: Searches for packages and returns results as JSON.

9. `package_search_condense_json_unpacked(search_query: str, num_rows: int)`: Returns a condensed view of package information.

10. `package_search_condense_dataframe_packed(search_query: str, num_rows: int, df_type: Literal["pandas", "polars"])`: Returns a condensed view of package information as a dataframe with packed resources.

11. `package_search_condense_dataframe_unpacked(search_query: str, num_rows: int, df_type: Literal["pandas", "polars"])`: Returns a condensed view of package information as a dataframe with unpacked resources.

12. `extract_resource_url(package_info: List[Dict], resource_name: str)`: Extracts the URL and format of a specific resource from a package.

### OpenDataSoftCatExplorer

#### Initialization

```python
from HerdingCats import CatSession, OpenDataSoftCatExplorer

with CatSession("ukpowernetworks.opendatasoft.com") as session:
explorer = OpenDataSoftCatExplorer(session)
```

#### Methods

1. `fetch_all_datasets()`: Retrieves all datasets from the OpenDataSoft catalogue.
Loading

0 comments on commit 506c9b7

Please sign in to comment.