Add national data

OCHA-DAP · Nov 18, 2024 · 819add0 · 819add0
1 parent bb69aa5
commit 819add0
Show file tree

Hide file tree

Showing 12 changed files with 2,881 additions and 2,249 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -34,8 +34,8 @@ classifiers = [
 requires-python = ">=3.8"
 
 dependencies = [
-    "hdx-python-api>= 6.3.4",
-    "hdx-python-country>= 3.8.1",
+    "hdx-python-api>= 6.3.5",
+    "hdx-python-country>= 3.8.3",
     "hdx-python-utilities>= 3.7.4",
 ]
 dynamic = ["version"]

diff --git a/requirements.txt b/requirements.txt
@@ -20,7 +20,7 @@ ckanapi==4.8
     # via hdx-python-api
 click==8.1.7
     # via typer
-coverage==7.6.4
+coverage==7.6.7
     # via pytest-cov
 defopt==6.4.0
     # via hdx-python-api
@@ -183,7 +183,7 @@ ruamel-yaml==0.18.6
     # via hdx-python-utilities
 ruamel-yaml-clib==0.2.12
     # via ruamel-yaml
-setuptools==75.4.0
+setuptools==75.5.0
     # via ckanapi
 shellingham==1.5.4
     # via typer

diff --git a/src/hdx/scraper/ophi/__main__.py b/src/hdx/scraper/ophi/__main__.py
@@ -69,10 +69,16 @@ def update_dataset(dataset):
                 downloader, folder, "saved_data", folder, save, use_saved
             )
             pipeline = Pipeline(configuration, retriever)
-            trend_path, mpi_path = pipeline.process()
+            mpi_national_path, mpi_subnational_path, trend_path = (
+                pipeline.process()
+            )
             dataset_generator = DatasetGenerator(
-                configuration, trend_path, mpi_path
+                configuration,
+                mpi_national_path,
+                mpi_subnational_path,
+                trend_path,
             )
+            dataset_generator.load_showcase_links(retriever)
             standardised_global = pipeline.get_standardised_global()
             standardised_global_trend = (
                 pipeline.get_standardised_global_trend()

diff --git a/src/hdx/scraper/ophi/config/project_configuration.yaml b/src/hdx/scraper/ophi/config/project_configuration.yaml
@@ -1,15 +1,28 @@
 datasetinfo:
-  mpi_and_partial_indices: "https://ophi.org.uk/sites/default/files/2024-10/Table%205%20Subnational%20Results%20MPI%202024.xlsx"
-  trend_over_time: "https://ophi.org.uk/sites/default/files/2024-10/Table%206%20Trends%20Over%20Time%20MPI%202024_3.xlsx"
+  mpi_and_partial_indices:
+    national:
+      url: "https://ophi.org.uk/sites/default/files/2024-10/Table%201%20National%20Results%20MPI%202024.xlsx"
+      sheet: "1.1 National MPI Results"
+    subnational:
+      url: "https://ophi.org.uk/sites/default/files/2024-10/Table%205%20Subnational%20Results%20MPI%202024.xlsx"
+      sheet: "5.1 MPI Region"
+  trend_over_time:
+    url: "https://ophi.org.uk/sites/default/files/2024-10/Table%206%20Trends%20Over%20Time%20MPI%202024_3.xlsx"
+    national_sheet: "6.1 Harmonised MPI"
+    subnational_sheet: "6.4 Harmonised MPI Region"
+
   format: "xlsx"
   headers:
     - 5
     - 6
     - 7
     - 8
     - 9
-  mpi_sheet: "5.1 MPI Region"
-  trend_sheet: "6.4 Harmonised MPI Region"
+
+showcaseinfo:
+  # https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/edit?gid=0#gid=0
+  urls: "https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/export?format=csv&gid=0"
+  notes: "The visual contains sub-national multidimensional poverty data from the country briefs published by the Oxford Poverty and Human Development Initiative (OPHI), University of Oxford."
 
 hxltags:
   "country_code": "#country+code"
@@ -23,6 +36,9 @@ hxltags:
   "reference_period_start": "#date+start"
   "reference_period_end": "#date+end"
 
-resource_description: "This resource contains standardised MPI estimates by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
-trends_resource_description: "This table shows global mpi harmonized level estimates and their changes over time"
-mpi_resource_description: "This table shows the MPI and its partial indices disaggregated by subnational regions"
+resource_descriptions:
+  standardised_mpi: "This resource contains standardised MPI estimates by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
+  standardised_trends: "This resource contains standardised MPI estimates and their changes over time by admin one unit and also shows the proportion of people who are MPI poor and experience deprivations in each of the indicators by admin one unit."
+  mpi_national: "This table shows the MPI and its partial indices"
+  mpi_subnational: "This table shows the MPI and its partial indices disaggregated by subnational regions"
+  trends: "This table shows global mpi harmonized level estimates and their changes over time"
diff --git a/src/hdx/scraper/ophi/dataset_generator.py b/src/hdx/scraper/ophi/dataset_generator.py
@@ -7,20 +7,50 @@
 from hdx.api.configuration import Configuration
 from hdx.data.dataset import Dataset
 from hdx.data.resource import Resource
+from hdx.data.showcase import Showcase
+from hdx.utilities.retriever import Retrieve
 
 logger = logging.getLogger(__name__)
 
 
 class DatasetGenerator:
+    # Tags shared by everything this generator creates: the list is passed
+    # to add_tags() by generate_dataset_metadata and generate_showcase.
+    tags = [
+        "hxl",
+        "development",
+        "education",
+        "health",
+        "indicators",
+        "mortality",
+        "nutrition",
+        "poverty",
+        "socioeconomics",
+        "sustainable development goals-sdg",
+        "water sanitation and hygiene-wash",
+    ]
+
     def __init__(
-        self, configuration: Configuration, trend_path: str, mpi_path: str
+        self,
+        configuration: Configuration,
+        mpi_national_path: str,
+        mpi_subnational_path: str,
+        trend_path: str,
     ) -> None:
+        """Store the configuration and the paths of the source files.
+
+        Args:
+            configuration: Project configuration; its "hxltags" entry is
+                read here.
+            mpi_national_path: Path of the national MPI file, later
+                uploaded as a resource by generate_global_dataset.
+            mpi_subnational_path: Path of the subnational MPI file, later
+                uploaded as a resource by generate_global_dataset.
+            trend_path: Path of the trends-over-time file, later uploaded
+                as a resource by generate_global_dataset.
+        """
         self._configuration = configuration
+        # Starts empty; load_showcase_links fills it (country code -> URL).
+        self._showcase_links = {}
+        self._mpi_national_path = mpi_national_path
+        self._mpi_subnational_path = mpi_subnational_path
         self._trend_path = trend_path
-        self._mpi_path = mpi_path
         self._global_hxltags = configuration["hxltags"]
+        # Separate object from the global mapping, so the two can differ.
         self._country_hxltags = copy(self._global_hxltags)
 
+    def load_showcase_links(self, retriever: Retrieve) -> Dict:
+        """Read the showcase spreadsheet and index its URLs by country code.
+
+        The CSV location is taken from the "showcaseinfo" -> "urls" entry
+        of the configuration. For every row, the "Country code" column
+        becomes the key and the "URL" column the value.
+
+        Args:
+            retriever: Retrieve object used to fetch the rows of the CSV.
+
+        Returns:
+            The mapping of country code to showcase URL. The same mapping
+            is kept on the instance, where generate_showcase looks links up.
+        """
+        url = self._configuration["showcaseinfo"]["urls"]
+        _, iterator = retriever.get_tabular_rows(
+            url, dict_form=True, format="csv"
+        )
+        for row in iterator:
+            self._showcase_links[row["Country code"]] = row["URL"]
+        # The signature promises a Dict; without this return the method
+        # fell off the end and handed callers None.
+        return self._showcase_links
+
     def generate_resource(
         self,
         dataset: Dataset,
@@ -47,41 +77,43 @@ def generate_resource(
         )
         return success
 
+    def _slugified_name(self, name: str) -> str:
+        """Return ``name`` passed through slugify and then lower-cased."""
+        slug = slugify(name)
+        return slug.lower()
+
     def generate_dataset_metadata(
         self,
         title: str,
         name: str,
     ) -> Optional[Dataset]:
+        """Create a Dataset carrying the metadata common to all datasets.
+
+        Sets the slugified name, the title, the maintainer, the
+        organization, an expected update frequency of "Every year", the
+        class-level tags and the subnational flag.
+
+        Args:
+            title: Title of the dataset (also written to the log).
+            name: Human-readable name; slugified to form the dataset name.
+
+        Returns:
+            The newly created Dataset. The annotation is Optional, but this
+            method always returns an object.
+        """
         logger.info(f"Creating dataset: {title}")
-        slugified_name = slugify(name).lower()
         dataset = Dataset(
             {
-                "name": slugified_name,
+                "name": self._slugified_name(name),
                 "title": title,
             }
         )
         dataset.set_maintainer("196196be-6037-4488-8b71-d786adf4c081")
         dataset.set_organization("00547685-9ded-4d69-9ca5-47d5278ead7c")
         dataset.set_expected_update_frequency("Every year")
-
-        tags = [
-            "hxl",
-            "development",
-            "education",
-            "health",
-            "indicators",
-            "mortality",
-            "nutrition",
-            "poverty",
-            "socioeconomics",
-            "sustainable development goals-sdg",
-            "water sanitation and hygiene-wash",
-        ]
-        dataset.add_tags(tags)
-
+        dataset.add_tags(self.tags)
         dataset.set_subnational(True)
         return dataset
 
+    def generate_showcase(
+        self, name: str, title: str, countryiso3: str
+    ) -> Showcase:
+        """Build the Showcase object for one country.
+
+        Args:
+            name: Name that is slugified and suffixed with "-showcase".
+            title: Title given to the showcase.
+            countryiso3: Key used to look up the showcase URL among the
+                links held on the instance.
+
+        Returns:
+            A Showcase with the configured notes, the country's URL, an
+            empty image URL and the class-level tags.
+
+        Raises:
+            KeyError: If no link is held for ``countryiso3``.
+        """
+        showcase_name = f"{self._slugified_name(name)}-showcase"
+        notes = self._configuration["showcaseinfo"]["notes"]
+        link = self._showcase_links[countryiso3]
+        metadata = {
+            "name": showcase_name,
+            "title": title,
+            "notes": notes,
+            "url": link,
+            "image_url": "",
+        }
+        showcase = Showcase(metadata)
+        showcase.add_tags(self.tags)
+        return showcase
+
     def generate_dataset(
         self,
         folder: str,
@@ -97,16 +129,24 @@ def generate_dataset(
         name = f"{countryname} MPI"
         dataset = self.generate_dataset_metadata(title, name)
         dataset.set_time_period(date_range["start"], date_range["end"])
-        resource_description = self._configuration["resource_description"]
+        resource_descriptions = self._configuration["resource_descriptions"]
 
         resource_name = f"{countryname} MPI and Partial Indices"
         filename = f"{countryiso3}_mpi.csv"
         success = self.generate_resource(
             dataset,
             resource_name,
-            resource_description,
+            resource_descriptions["standardised_mpi"],
             self._country_hxltags,
-            standardised_rows,
+            sorted(
+                standardised_rows,
+                key=lambda x: (
+                    x["country_code"],
+                    x["admin1_code"] if x["admin1_code"] else "",
+                    x["admin1_name"] if x["admin1_name"] else "",
+                    x["reference_period_end"],
+                ),
+            ),
             folder,
             filename,
         )
@@ -121,9 +161,17 @@ def generate_dataset(
         success = self.generate_resource(
             dataset,
             resource_name,
-            resource_description,
+            resource_descriptions["standardised_trends"],
             self._country_hxltags,
-            standardised_trend_rows,
+            sorted(
+                standardised_trend_rows,
+                key=lambda x: (
+                    x["country_code"],
+                    x["admin1_code"] if x["admin1_code"] else "",
+                    x["admin1_name"] if x["admin1_name"] else "",
+                    x["reference_period_end"],
+                ),
+            ),
             folder,
             filename,
         )
@@ -147,21 +195,32 @@ def generate_global_dataset(
             date_range,
         )
 
+        resource_descriptions = self._configuration["resource_descriptions"]
         resourcedata = {
-            "name": "Trends Over Time MPI database",
-            "description": self._configuration["trends_resource_description"],
+            "name": "MPI and Partial Indices National Database",
+            "description": resource_descriptions["mpi_national"],
         }
         resource = Resource(resourcedata)
         resource.set_format("xlsx")
-        resource.set_file_to_upload(self._trend_path)
+        resource.set_file_to_upload(self._mpi_national_path)
         dataset.add_update_resource(resource)
 
         resourcedata = {
-            "name": "MPI and Partial Indices database",
-            "description": self._configuration["trends_resource_description"],
+            "name": "MPI and Partial Indices Subnational Database",
+            "description": resource_descriptions["mpi_subnational"],
         }
         resource = Resource(resourcedata)
         resource.set_format("xlsx")
-        resource.set_file_to_upload(self._mpi_path)
+        resource.set_file_to_upload(self._mpi_subnational_path)
         dataset.add_update_resource(resource)
+
+        resourcedata = {
+            "name": "Trends Over Time MPI Database",
+            "description": resource_descriptions["trends"],
+        }
+        resource = Resource(resourcedata)
+        resource.set_format("xlsx")
+        resource.set_file_to_upload(self._trend_path)
+        dataset.add_update_resource(resource)
+
         return dataset