Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hapi population dataset #229

Merged
merged 2 commits into the base branch on Feb 21, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ attrs==25.1.0
# jsonlines
# jsonschema
# referencing
cachetools==5.5.1
cachetools==5.5.2
# via google-auth
certifi==2025.1.31
# via requests
Expand Down Expand Up @@ -218,7 +218,7 @@ rfc3986==2.0.0
# via frictionless
rich==13.9.4
# via typer
rpds-py==0.22.3
rpds-py==0.23.0
# via
# jsonschema
# referencing
Expand Down
135 changes: 17 additions & 118 deletions src/hapi/pipelines/database/population.py
Original file line number Diff line number Diff line change
@@ -1,131 +1,30 @@
"""Functions specific to the population theme."""

from logging import getLogger
from typing import Dict

from hapi_schema.db_population import DBPopulation
from hdx.api.configuration import Configuration
from hdx.api.utilities.hdx_error_handler import HDXErrorHandler
from hdx.scraper.framework.utilities.reader import Read
from hdx.utilities.dateparse import parse_date_range
from sqlalchemy.orm import Session

from ..utilities.batch_populate import batch_populate
from ..utilities.provider_admin_names import get_provider_name
from . import admins
from .base_uploader import BaseUploader
from .metadata import Metadata
from .hapi_dataset_uploader import HapiDatasetUploader

logger = getLogger(__name__)


class Population(BaseUploader):
def __init__(
self,
session: Session,
metadata: Metadata,
admins: admins.Admins,
configuration: Configuration,
error_handler: HDXErrorHandler,
):
super().__init__(session)
self._metadata = metadata
self._admins = admins
self._configuration = configuration
self._error_handler = error_handler

def get_admin2_ref(self, row, headers, dataset_name, admin_level):
countryiso3 = row[headers.index("#country+code")]
if admin_level == "national":
admin_code = countryiso3
if admin_level == "adminone":
admin_code = row[headers.index("#adm1+code")]
if admin_level == "admintwo":
admin_code = row[headers.index("#adm2+code")]
admin2_code = admins.get_admin2_code_based_on_level(
admin_code=admin_code, admin_level=admin_level
class Population(HapiDatasetUploader):
def populate_row(self, output_row: Dict, row: Dict) -> None:
output_row["gender"] = row["gender"]
output_row["age_range"] = row["age_range"]
output_row["min_age"] = (
int(float(row["min_age"])) if row["min_age"] else None
)
output_row["max_age"] = (
int(float(row["max_age"])) if row["max_age"] else None
)
admin2_ref = self._admins.admin2_data.get(admin2_code)
if admin2_ref is None:
if admin_level == "adminone":
admin_code = admins.get_admin1_to_location_connector_code(
countryiso3
)
elif admin_level == "admintwo":
admin_code = admins.get_admin2_to_location_connector_code(
countryiso3
)
else:
return None
admin2_ref = self._admins.get_admin2_ref(
admin_level,
admin_code,
dataset_name,
"Population",
self._error_handler,
)
return admin2_ref
output_row["population"] = int(row["population"])

def populate(self) -> None:
logger.info("Populating population table")
reader = Read.get_reader("hdx")
dataset = reader.read_dataset("cod-ps-global", self._configuration)
self._metadata.add_dataset(dataset)
dataset_id = dataset["id"]
dataset_name = dataset["name"]
for resource in dataset.get_resources():
resource_id = resource["id"]
resource_name = resource["name"]
admin_level = _get_admin_level(resource_name)
if not admin_level:
continue
self._metadata.add_resource(dataset_id, resource)
url = resource["url"]
headers, rows = reader.get_tabular_rows(url, headers=2)
population_rows = []
for row in rows:
admin2_ref = self.get_admin2_ref(
row, headers, dataset_name, admin_level
)
gender = row[headers.index("#gender")]
age_range = row[headers.index("#age+range")]
min_age = row[headers.index("#age+min")]
max_age = row[headers.index("#age+max")]
population = row[headers.index("#population")]
reference_year = row[headers.index("#date+year")]
time_period_range = parse_date_range(reference_year, "%Y")
provider_admin1_name = get_provider_name(
row,
"#adm1+name",
headers,
)
provider_admin2_name = get_provider_name(
row,
"#adm2+name",
headers,
)
population_row = dict(
resource_hdx_id=resource_id,
admin2_ref=admin2_ref,
provider_admin1_name=provider_admin1_name,
provider_admin2_name=provider_admin2_name,
gender=gender,
age_range=age_range,
min_age=min_age,
max_age=max_age,
population=int(population),
reference_period_start=time_period_range[0],
reference_period_end=time_period_range[1],
)
population_rows.append(population_row)
batch_populate(population_rows, self._session, DBPopulation)


def _get_admin_level(resource_name: str) -> str or None:
admin_level = resource_name.split(".")[0][-1]
if admin_level == "0":
return "national"
if admin_level == "1":
return "adminone"
if admin_level == "2":
return "admintwo"
return None
self.hapi_populate(
"population",
DBPopulation,
end_resource=2,
)

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1 change: 0 additions & 1 deletion tests/fixtures/input/cod-ps-global.json

This file was deleted.

Loading