Skip to content

Commit

Permalink
HDXDSYS-1086 Add HNO freeform column to hapi-pipelines (#183)
Browse files Browse the repository at this point in the history
* Read new freeform file

* Update requirements

* Add CHANGELOG

* Populate provider_admin1_name and provider_admin2_name

* Use unspecified for unknown admins
  • Loading branch information
mcarans authored Oct 15, 2024
1 parent c2c8db6 commit efc4c3b
Show file tree
Hide file tree
Showing 9 changed files with 21,853 additions and 75,728 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

## [0.10.11] - 2024-10-16

### Changed

- Use freeform category for humanitarian needs
- Populate provider_admin1_name and provider_admin2_name

## [0.10.10] - 2024-10-16

### Fixed
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,10 @@ classifiers = [
requires-python = ">=3.8"

dependencies = [
"hapi-schema>=0.9.0",
"hapi-schema>= 0.9.2",
"hdx-python-api>= 6.3.4",
"hdx-python-country>= 3.8.1",
"hdx-python-database[postgresql]>= 1.3.1",
"hdx-python-database[postgresql]>= 1.3.4",
"hdx-python-scraper>= 2.5.0",
"hdx-python-utilities>= 3.7.4",
"libhxl",
Expand Down
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ greenlet==3.1.1
# via sqlalchemy
gspread==6.1.3
# via hdx-python-scraper
hapi-schema==0.9.0
hapi-schema==0.9.2
# via hapi-pipelines (pyproject.toml)
hdx-python-api==6.3.4
# via
Expand All @@ -65,7 +65,7 @@ hdx-python-country==3.8.1
# hapi-pipelines (pyproject.toml)
# hdx-python-api
# hdx-python-scraper
hdx-python-database==1.3.3
hdx-python-database==1.3.4
# via hapi-pipelines (pyproject.toml)
hdx-python-scraper==2.5.0
# via hapi-pipelines (pyproject.toml)
Expand Down Expand Up @@ -244,7 +244,7 @@ six==1.16.0
# sphinxcontrib-napoleon
sphinxcontrib-napoleon==0.7
# via defopt
sqlalchemy==2.0.35
sqlalchemy==2.0.36
# via
# hapi-pipelines (pyproject.toml)
# hapi-schema
Expand Down
35 changes: 15 additions & 20 deletions src/hapi/pipelines/database/admins.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def _update_admin_table(
and code in self._orphan_admin2s.keys()
):
parent_ref = self.admin1_data[
_get_admin1_to_location_connector_code(
get_admin1_to_location_connector_code(
location_code=self._orphan_admin2s[code]
)
]
Expand Down Expand Up @@ -120,7 +120,7 @@ def _add_admin1_connector_rows(self):
)
admin_row = DBAdmin1(
location_ref=location_ref,
code=_get_admin1_to_location_connector_code(
code=get_admin1_to_location_connector_code(
location_code=location_code
),
name="UNSPECIFIED",
Expand All @@ -140,7 +140,7 @@ def _add_admin2_connector_rows(self):
)
admin_row = DBAdmin2(
admin1_ref=admin1_ref,
code=_get_admin2_to_admin1_connector_code(
code=get_admin2_to_admin1_connector_code(
admin1_code=admin1_code
),
name="UNSPECIFIED",
Expand Down Expand Up @@ -172,29 +172,24 @@ def get_admin2_ref(
return ref


def _get_admin2_to_admin1_connector_code(admin1_code: str) -> str:
"""Get the code for an unspecified admin2, based on the admin1 code.
Note that if you need to make the connection between admin2 and
location, and only know the location code, you'll need to pass the
output of get_admin1_to_location_connector_code to this function, e.g.
```
location_code = "ABC"
admin1_code = get_admin1_to_location_connector_code(location_code)
admin2_code = get_admin2_to_admin1_connector_code(admin1_code)
```
"""
def get_admin2_to_admin1_connector_code(admin1_code: str) -> str:
    """Get the code for an unspecified admin2, based on the admin1 code.

    The connector code is the admin1 code with ``-XXX`` appended.

    If only a location code is known, first convert it with
    ``get_admin1_to_location_connector_code`` and pass that result here, e.g.
    ```
    location_code = "ABC"
    admin1_code = get_admin1_to_location_connector_code(location_code)
    admin2_code = get_admin2_to_admin1_connector_code(admin1_code)
    ```

    Args:
        admin1_code: Code of the admin1 that the unspecified admin2 hangs off.

    Returns:
        ``admin1_code`` followed by ``-XXX``.
    """
    return f"{admin1_code}-XXX"


def _get_admin1_to_location_connector_code(location_code: str) -> str:
def get_admin2_to_location_connector_code(location_code: str) -> str:
    """Get the code for an unspecified admin2, based on the location code.

    The result is the location code with ``-XXX-XXX`` appended: one ``-XXX``
    for the unspecified admin1 level and one for the unspecified admin2
    level. This is the same string obtained by applying the admin1 connector
    suffix and then the admin2 connector suffix to the location code.

    Args:
        location_code: Code of the location (country) the admin2 belongs to.

    Returns:
        ``location_code`` followed by ``-XXX-XXX``.
    """
    return f"{location_code}-XXX-XXX"


def get_admin1_to_location_connector_code(location_code: str) -> str:
    """Get the code for an unspecified admin1, based on the location code.

    The connector code is the location code with ``-XXX`` appended.

    Args:
        location_code: Code of the location (country) the admin1 belongs to.

    Returns:
        ``location_code`` followed by ``-XXX``.
    """
    return f"{location_code}-XXX"


def get_admin1_code_based_on_level(admin_code: str, admin_level: str) -> str:
if admin_level == "national":
admin1_code = _get_admin1_to_location_connector_code(
admin1_code = get_admin1_to_location_connector_code(
location_code=admin_code
)
elif admin_level == "adminone":
Expand All @@ -209,14 +204,14 @@ def get_admin1_code_based_on_level(admin_code: str, admin_level: str) -> str:

def get_admin2_code_based_on_level(admin_code: str, admin_level: str) -> str:
if admin_level == "national":
admin1_code = _get_admin1_to_location_connector_code(
admin1_code = get_admin1_to_location_connector_code(
location_code=admin_code
)
admin2_code = _get_admin2_to_admin1_connector_code(
admin2_code = get_admin2_to_admin1_connector_code(
admin1_code=admin1_code
)
elif admin_level == "adminone":
admin2_code = _get_admin2_to_admin1_connector_code(
admin2_code = get_admin2_to_admin1_connector_code(
admin1_code=admin_code
)
elif admin_level == "admintwo":
Expand Down
87 changes: 49 additions & 38 deletions src/hapi/pipelines/database/humanitarian_needs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Functions specific to the humanitarian needs theme."""

import re
from datetime import datetime
from logging import getLogger

Expand All @@ -14,7 +15,12 @@
add_missing_value_message,
add_multi_valued_message,
)
from ..utilities.provider_admin_names import get_provider_name
from . import admins
from .admins import (
get_admin1_to_location_connector_code,
get_admin2_to_location_connector_code,
)
from .base_uploader import BaseUploader
from .metadata import Metadata
from .sector import Sector
Expand All @@ -23,6 +29,8 @@


class HumanitarianNeeds(BaseUploader):
admin_name_regex = re.compile(r"Admin (\d) Name")

def __init__(
self,
session: Session,
Expand All @@ -38,21 +46,40 @@ def __init__(
self._configuration = configuration

def get_admin2_ref(self, row, dataset_name, errors):
    """Resolve the admin2 reference for one humanitarian needs row.

    The admin level of the row is taken from its populated "Admin N Name"
    columns: the last such column (in row order) holding a truthy value
    decides the level, and a row with none of them populated is treated as
    national. The matching code column is then looked up through
    ``self._admins.get_admin2_ref``. If an admin1 or admin2 level lookup
    returns ``None``, the lookup is retried with the country's
    "unspecified" connector code.

    Args:
        row: Mapping of column header to value for one input row.
        dataset_name: Dataset name, passed through to the admin lookup.
        errors: Error collection, passed through to the admin lookup.

    Returns:
        The value returned by ``self._admins.get_admin2_ref``, or ``None``
        when the row is the HXL tag row, the detected level is not 0, 1 or
        2, or a national level lookup returns ``None``.
    """
    countryiso3 = row["Country ISO3"]
    if countryiso3 == "#country+code":  # ignore HXL row
        return None

    # "0" (national) unless an "Admin N Name" column is populated.
    provider_level = "0"
    for header in row:
        name_match = self.admin_name_regex.match(header)
        if name_match and row[header]:
            provider_level = name_match.group(1)

    # A plain if/elif chain rather than match/case: pattern matching needs
    # Python 3.10+, but the package declares requires-python >=3.8.
    if provider_level == "0":
        admin_level = "national"
        admin_code = countryiso3
    elif provider_level == "1":
        admin_level = "adminone"
        admin_code = row["Admin 1 PCode"]
    elif provider_level == "2":
        admin_level = "admintwo"
        admin_code = row["Admin 2 PCode"]
    else:
        return None

    admin2_ref = self._admins.get_admin2_ref(
        admin_level, admin_code, dataset_name, errors
    )
    if admin2_ref is None:
        # Unknown code: fall back to the country's unspecified admin unit.
        if admin_level == "adminone":
            admin_code = get_admin1_to_location_connector_code(countryiso3)
        elif admin_level == "admintwo":
            admin_code = get_admin2_to_location_connector_code(countryiso3)
        else:
            return None
        admin2_ref = self._admins.get_admin2_ref(
            admin_level, admin_code, dataset_name, errors
        )
    return admin2_ref

def populate(self) -> None:
logger.info("Populating humanitarian needs table")
Expand All @@ -63,44 +90,33 @@ def populate(self) -> None:
self._metadata.add_dataset(dataset)
dataset_id = dataset["id"]
dataset_name = dataset["name"]
resource = dataset.get_resource(
1
) # assumes second resource is latest!
resource = dataset.get_resource(0) # assumes first resource is latest!
self._metadata.add_resource(dataset_id, resource)
negative_values_by_iso3 = {}
rounded_values_by_iso3 = {}
resource_id = resource["id"]
resource_name = resource["name"]
year = int(resource_name[-15:-11])
year = int(resource_name[-4:])
time_period_start = datetime(year, 1, 1)
time_period_end = datetime(year, 12, 31, 23, 59, 59)
url = resource["url"]
headers, rows = reader.get_tabular_rows(url, dict_form=True)
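# Columns read per row: Country ISO3, Admin 1 PCode, Admin 2 PCode, Admin 1 Name, Admin 2 Name, Sector, Category, Population Group, plus the population value columns (previously listed as Population, In Need, Targeted, Affected, Reached - TODO confirm against the current file)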
for row in rows:
admin2_ref = self.get_admin2_ref(row, dataset_name, errors)
if not admin2_ref:
continue
countryiso3 = row["Country ISO3"]
population_group = row["Population Group"]
if population_group == "ALL":
population_group = "all"
admin2_ref = self.get_admin2_ref(row, dataset_name, errors)
provider_admin1_name = get_provider_name(row, "Admin 1 Name")
provider_admin2_name = get_provider_name(row, "Admin 2 Name")
sector = row["Sector"]
sector_code = self._sector.get_sector_code(sector)
if not sector_code:
add_missing_value_message(
errors, dataset_name, "sector", sector
)
continue
gender = row["Gender"]
if gender == "a":
gender = "all"
age_range = row["Age Range"]
min_age = row["Min Age"]
max_age = row["Max Age"]
disabled_marker = row["Disabled"]
if disabled_marker == "a":
disabled_marker = "all"
category = row["Category"]
if category is None:
category = ""

def create_row(in_col, population_status):
value = row[in_col]
Expand All @@ -120,16 +136,11 @@ def create_row(in_col, population_status):
humanitarian_needs_row = DBHumanitarianNeeds(
resource_hdx_id=resource_id,
admin2_ref=admin2_ref,
provider_admin1_name="",
provider_admin2_name="",
gender=gender,
age_range=age_range,
min_age=min_age,
max_age=max_age,
provider_admin1_name=provider_admin1_name,
provider_admin2_name=provider_admin2_name,
category=category,
sector_code=sector_code,
population_group=population_group,
population_status=population_status,
disabled_marker=disabled_marker,
population=value,
reference_period_start=time_period_start,
reference_period_end=time_period_end,
Expand Down
12 changes: 6 additions & 6 deletions src/hapi/pipelines/utilities/provider_admin_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@

def get_provider_name(
values: Dict | List,
hxl_tag: str,
hxl_tags: Optional[List[str]] = None,
header_or_hxl_tag: str,
headers_or_hxl_tags: Optional[List[str]] = None,
admin_code: Optional[str] = None,
i: Optional[int] = None,
) -> str:
if hxl_tags is None:
provider_name = values.get(hxl_tag, "")
if headers_or_hxl_tags is None:
provider_name = values.get(header_or_hxl_tag, "")
if provider_name is None:
provider_name = ""
return provider_name
if hxl_tag not in hxl_tags:
if header_or_hxl_tag not in headers_or_hxl_tags:
return ""
provider_name = values[hxl_tags.index(hxl_tag)]
provider_name = values[headers_or_hxl_tags.index(header_or_hxl_tag)]
if admin_code is not None:
provider_name = provider_name[admin_code]
if i is not None:
Expand Down
Loading

0 comments on commit efc4c3b

Please sign in to comment.