Skip to content

Commit

Permalink
Add national data
Browse files Browse the repository at this point in the history
  • Loading branch information
mcarans committed Nov 18, 2024
1 parent bb69aa5 commit 819add0
Show file tree
Hide file tree
Showing 12 changed files with 2,881 additions and 2,249 deletions.
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ classifiers = [
requires-python = ">=3.8"

dependencies = [
"hdx-python-api>= 6.3.4",
"hdx-python-country>= 3.8.1",
"hdx-python-api>= 6.3.5",
"hdx-python-country>= 3.8.3",
"hdx-python-utilities>= 3.7.4",
]
dynamic = ["version"]
Expand Down
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ ckanapi==4.8
# via hdx-python-api
click==8.1.7
# via typer
coverage==7.6.4
coverage==7.6.7
# via pytest-cov
defopt==6.4.0
# via hdx-python-api
Expand Down Expand Up @@ -183,7 +183,7 @@ ruamel-yaml==0.18.6
# via hdx-python-utilities
ruamel-yaml-clib==0.2.12
# via ruamel-yaml
setuptools==75.4.0
setuptools==75.5.0
# via ckanapi
shellingham==1.5.4
# via typer
Expand Down
10 changes: 8 additions & 2 deletions src/hdx/scraper/ophi/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,10 +69,16 @@ def update_dataset(dataset):
downloader, folder, "saved_data", folder, save, use_saved
)
pipeline = Pipeline(configuration, retriever)
trend_path, mpi_path = pipeline.process()
mpi_national_path, mpi_subnational_path, trend_path = (
pipeline.process()
)
dataset_generator = DatasetGenerator(
configuration, trend_path, mpi_path
configuration,
mpi_national_path,
mpi_subnational_path,
trend_path,
)
dataset_generator.load_showcase_links(retriever)
standardised_global = pipeline.get_standardised_global()
standardised_global_trend = (
pipeline.get_standardised_global_trend()
Expand Down
30 changes: 23 additions & 7 deletions src/hdx/scraper/ophi/config/project_configuration.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
datasetinfo:
mpi_and_partial_indices: "https://ophi.org.uk/sites/default/files/2024-10/Table%205%20Subnational%20Results%20MPI%202024.xlsx"
trend_over_time: "https://ophi.org.uk/sites/default/files/2024-10/Table%206%20Trends%20Over%20Time%20MPI%202024_3.xlsx"
mpi_and_partial_indices:
national:
url: "https://ophi.org.uk/sites/default/files/2024-10/Table%201%20National%20Results%20MPI%202024.xlsx"
sheet: "1.1 National MPI Results"
subnational:
url: "https://ophi.org.uk/sites/default/files/2024-10/Table%205%20Subnational%20Results%20MPI%202024.xlsx"
sheet: "5.1 MPI Region"
trend_over_time:
url: "https://ophi.org.uk/sites/default/files/2024-10/Table%206%20Trends%20Over%20Time%20MPI%202024_3.xlsx"
national_sheet: "6.1 Harmonised MPI"
subnational_sheet: "6.4 Harmonised MPI Region"

format: "xlsx"
headers:
- 5
- 6
- 7
- 8
- 9
mpi_sheet: "5.1 MPI Region"
trend_sheet: "6.4 Harmonised MPI Region"

showcaseinfo:
# https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/edit?gid=0#gid=0
urls: "https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/export?format=csv&gid=0"
notes: "The visual contains sub-national multidimensional poverty data from the country briefs published by the Oxford Poverty and Human Development Initiative (OPHI), University of Oxford."

hxltags:
"country_code": "#country+code"
Expand All @@ -23,6 +36,9 @@ hxltags:
"reference_period_start": "#date+start"
"reference_period_end": "#date+end"

resource_description: "This resource contains standardised MPI estimates by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
trends_resource_description: "This table shows global mpi harmonized level estimates and their changes over time"
mpi_resource_description: "This table shows the MPI and its partial indices disaggregated by subnational regions"
resource_descriptions:
standardised_mpi: "This resource contains standardised MPI estimates by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
standardised_trends: "This resource contains standardised MPI estimates and their changes over time by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
mpi_national: "This table shows the MPI and its partial indices"
mpi_subnational: "This table shows the MPI and its partial indices disaggregated by subnational regions"
trends: "This table shows global mpi harmonized level estimates and their changes over time"
121 changes: 90 additions & 31 deletions src/hdx/scraper/ophi/dataset_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,50 @@
from hdx.api.configuration import Configuration
from hdx.data.dataset import Dataset
from hdx.data.resource import Resource
from hdx.data.showcase import Showcase
from hdx.utilities.retriever import Retrieve

logger = logging.getLogger(__name__)


class DatasetGenerator:
tags = [
"hxl",
"development",
"education",
"health",
"indicators",
"mortality",
"nutrition",
"poverty",
"socioeconomics",
"sustainable development goals-sdg",
"water sanitation and hygiene-wash",
]

def __init__(
self, configuration: Configuration, trend_path: str, mpi_path: str
self,
configuration: Configuration,
mpi_national_path: str,
mpi_subnational_path: str,
trend_path: str,
) -> None:
self._configuration = configuration
self._showcase_links = {}
self._mpi_national_path = mpi_national_path
self._mpi_subnational_path = mpi_subnational_path
self._trend_path = trend_path
self._mpi_path = mpi_path
self._global_hxltags = configuration["hxltags"]
self._country_hxltags = copy(self._global_hxltags)

def load_showcase_links(self, retriever: Retrieve) -> None:
    """Populate the country code to showcase URL lookup.

    Reads the CSV located at the ``showcaseinfo`` -> ``urls`` configuration
    entry through ``retriever`` and stores each row's ``URL`` value in
    ``self._showcase_links`` keyed by that row's ``Country code`` value.
    A later row with the same country code overwrites an earlier one.

    Args:
        retriever (Retrieve): Object used to read the tabular rows.

    Returns:
        None: The lookup is updated in place; nothing is returned.
    """
    url = self._configuration["showcaseinfo"]["urls"]
    # Only the row iterator is used; the first element of the returned
    # pair is deliberately ignored.
    _, iterator = retriever.get_tabular_rows(
        url, dict_form=True, format="csv"
    )
    for row in iterator:
        self._showcase_links[row["Country code"]] = row["URL"]

def generate_resource(
self,
dataset: Dataset,
Expand All @@ -47,41 +77,43 @@ def generate_resource(
)
return success

def _slugified_name(self, name: str) -> str:
    """Return the slugified, lower-cased form of ``name``.

    Args:
        name (str): Human-readable name to convert.

    Returns:
        str: Result of ``slugify(name)`` converted to lower case.
    """
    slug = slugify(name)
    return slug.lower()

def generate_dataset_metadata(
self,
title: str,
name: str,
) -> Optional[Dataset]:
logger.info(f"Creating dataset: {title}")
slugified_name = slugify(name).lower()
dataset = Dataset(
{
"name": slugified_name,
"name": self._slugified_name(name),
"title": title,
}
)
dataset.set_maintainer("196196be-6037-4488-8b71-d786adf4c081")
dataset.set_organization("00547685-9ded-4d69-9ca5-47d5278ead7c")
dataset.set_expected_update_frequency("Every year")

tags = [
"hxl",
"development",
"education",
"health",
"indicators",
"mortality",
"nutrition",
"poverty",
"socioeconomics",
"sustainable development goals-sdg",
"water sanitation and hygiene-wash",
]
dataset.add_tags(tags)

dataset.add_tags(self.tags)
dataset.set_subnational(True)
return dataset

def generate_showcase(
    self, name: str, title: str, countryiso3: str
) -> Showcase:
    """Build the showcase object for one country.

    The showcase name is the slugified ``name`` with ``-showcase``
    appended, the notes come from the ``showcaseinfo`` -> ``notes``
    configuration entry, and the URL is looked up in
    ``self._showcase_links`` by ``countryiso3``. The class-level ``tags``
    are added to the showcase before it is returned.

    Args:
        name (str): Name from which the showcase name is derived.
        title (str): Title of the showcase.
        countryiso3 (str): Key used to look up the showcase URL.

    Returns:
        Showcase: The populated showcase object.
    """
    showcase_name = f"{self._slugified_name(name)}-showcase"
    notes = self._configuration["showcaseinfo"]["notes"]
    # Plain indexing: a country without a loaded link is not given a default.
    url = self._showcase_links[countryiso3]
    metadata = {
        "name": showcase_name,
        "title": title,
        "notes": notes,
        "url": url,
        "image_url": "",
    }
    result = Showcase(metadata)
    result.add_tags(self.tags)
    return result

def generate_dataset(
self,
folder: str,
Expand All @@ -97,16 +129,24 @@ def generate_dataset(
name = f"{countryname} MPI"
dataset = self.generate_dataset_metadata(title, name)
dataset.set_time_period(date_range["start"], date_range["end"])
resource_description = self._configuration["resource_description"]
resource_descriptions = self._configuration["resource_descriptions"]

resource_name = f"{countryname} MPI and Partial Indices"
filename = f"{countryiso3}_mpi.csv"
success = self.generate_resource(
dataset,
resource_name,
resource_description,
resource_descriptions["standardised_mpi"],
self._country_hxltags,
standardised_rows,
sorted(
standardised_rows,
key=lambda x: (
x["country_code"],
x["admin1_code"] if x["admin1_code"] else "",
x["admin1_name"] if x["admin1_name"] else "",
x["reference_period_end"],
),
),
folder,
filename,
)
Expand All @@ -121,9 +161,17 @@ def generate_dataset(
success = self.generate_resource(
dataset,
resource_name,
resource_description,
resource_descriptions["standardised_trends"],
self._country_hxltags,
standardised_trend_rows,
sorted(
standardised_trend_rows,
key=lambda x: (
x["country_code"],
x["admin1_code"] if x["admin1_code"] else "",
x["admin1_name"] if x["admin1_name"] else "",
x["reference_period_end"],
),
),
folder,
filename,
)
Expand All @@ -147,21 +195,32 @@ def generate_global_dataset(
date_range,
)

resource_descriptions = self._configuration["resource_descriptions"]
resourcedata = {
"name": "Trends Over Time MPI database",
"description": self._configuration["trends_resource_description"],
"name": "MPI and Partial Indices National Database",
"description": resource_descriptions["mpi_national"],
}
resource = Resource(resourcedata)
resource.set_format("xlsx")
resource.set_file_to_upload(self._trend_path)
resource.set_file_to_upload(self._mpi_national_path)
dataset.add_update_resource(resource)

resourcedata = {
"name": "MPI and Partial Indices database",
"description": self._configuration["trends_resource_description"],
"name": "MPI and Partial Indices Subnational Database",
"description": resource_descriptions["mpi_subnational"],
}
resource = Resource(resourcedata)
resource.set_format("xlsx")
resource.set_file_to_upload(self._mpi_path)
resource.set_file_to_upload(self._mpi_subnational_path)
dataset.add_update_resource(resource)

resourcedata = {
"name": "Trends Over Time MPI Database",
"description": resource_descriptions["trends"],
}
resource = Resource(resourcedata)
resource.set_format("xlsx")
resource.set_file_to_upload(self._trend_path)
dataset.add_update_resource(resource)

return dataset
Loading

0 comments on commit 819add0

Please sign in to comment.