
Commit

adding tests
baripembo committed Dec 2, 2024
1 parent 77b9591 commit 247082a
Showing 12 changed files with 280 additions and 86 deletions.
2 changes: 1 addition & 1 deletion .config/ruff.toml
@@ -1,4 +1,4 @@
line-length = 79
line-length = 99
exclude = ["_version.py"]

[lint]
2 changes: 1 addition & 1 deletion .github/workflows/run-python-script.yaml
@@ -41,5 +41,5 @@ jobs:
password: ${{secrets.HDX_PIPELINE_EMAIL_PASSWORD}}
subject: "FAILED: ${{github.repository}} run job"
body: GitHub Actions run job for ${{github.repository}} failed!
to: ${{secrets.HDX_PIPELINE_EMAIL_LIST}}
to: ${{vars.HDX_PIPELINE_EMAIL_LIST}}
from: ${{secrets.HDX_PIPELINE_EMAIL_FROM}}
6 changes: 3 additions & 3 deletions requirements-test.txt
@@ -194,7 +194,7 @@ pyphonetics==0.5.3
# via
# -c requirements.txt
# hdx-python-country
pytest==8.3.3
pytest==8.3.4
# via
# hdx-scraper-dcc (pyproject.toml)
# pytest-cov
@@ -253,7 +253,7 @@ rich==13.9.4
# via
# -c requirements.txt
# typer
rpds-py==0.21.0
rpds-py==0.22.0
# via
# -c requirements.txt
# jsonschema
@@ -321,7 +321,7 @@ typeguard==4.4.1
# via
# -c requirements.txt
# inflect
typer==0.13.1
typer==0.14.0
# via
# -c requirements.txt
# frictionless
4 changes: 2 additions & 2 deletions requirements.txt
@@ -141,7 +141,7 @@ rfc3986==2.0.0
# via frictionless
rich==13.9.4
# via typer
rpds-py==0.21.0
rpds-py==0.22.0
# via
# jsonschema
# referencing
@@ -179,7 +179,7 @@ text-unidecode==1.3
# via python-slugify
typeguard==4.4.1
# via inflect
typer==0.13.1
typer==0.14.0
# via frictionless
typing-extensions==4.12.2
# via
10 changes: 4 additions & 6 deletions src/hdx/scraper/dcc/__main__.py
@@ -28,7 +28,7 @@

def main(
save: bool = False,
use_saved: bool = True,
use_saved: bool = False,
) -> None:
"""Generate datasets and create them in HDX
@@ -58,7 +58,7 @@ def main(

countries = dcc.get_data()

for country in countries[:1]: # for testing
for country in countries:
try:
datasets = dcc.generate_dataset(country)

@@ -92,10 +92,8 @@ def main(
if __name__ == "__main__":
facade(
main,
hdx_site="stage",
hdx_site="demo",
user_agent_config_yaml=join(expanduser("~"), ".useragents.yaml"),
user_agent_lookup=_USER_AGENT_LOOKUP,
project_config_yaml=join(
dirname(__file__), "config", "project_configuration.yaml"
),
project_config_yaml=join(dirname(__file__), "config", "project_configuration.yaml"),
)
6 changes: 3 additions & 3 deletions src/hdx/scraper/dcc/config/hdx_dataset_static.yaml
@@ -2,7 +2,7 @@ license_id: cc-by
caveats: |
The United Nations Office for the Coordination of Humanitarian Affairs (OCHA) Common Operational Datasets (COD) Administration Level 0 boundary polygons were used in instances where geoBoundaries simplified polygons were not available.
Where countries were not included in the health facility data published by Maina et al. (2019) we used data from the Global Healthsites Mapping Project published on Humanitarian Data Exchange (this included: Egypt, Libya, Tunisia, Algeria, Morocco).
Where countries were not included in the health facility data published by Maina et al. (2019) we used data from the Global Health sites Mapping Project published on Humanitarian Data Exchange (this included: Egypt, Libya, Tunisia, Algeria, Morocco).
For each country we removed health sites that were unlikely to offer child focused services and vaccinations. Facilities that were removed included: pharmacy, dentist, veterinary, café/pharmacy, social facility.
@@ -12,7 +12,7 @@ caveats: |
dataset_source: Data for Children Collaborative
package_creator: HDX Data Systems Team
private: False
maintainer: fdbb8e79-f020-4039-ab3a-9adb482273b8 #71421920-fdc8-40fb-ac97-99b85e90b8a7 gwatmough
owner_org: hdx # should be The University of Edinburgh
maintainer: "71421920-fdc8-40fb-ac97-99b85e90b8a7" # "fdbb8e79-f020-4039-ab3a-9adb482273b8" gwatmough
owner_org: "bb1ac1eb-c322-40f0-b0ea-1696792e61df" # "hdx" data-for-children-collaborative
data_update_frequency: -1 # Never
subnational: False
3 changes: 0 additions & 3 deletions src/hdx/scraper/dcc/config/project_configuration.yaml
@@ -3,7 +3,6 @@ data_url: https://raw.githubusercontent.com/Data-Science-Unit/ChildPovetyAccesst
walking:
title: Walking Travel Time to nearest Level IV health centre Map
description: This file is a zip archive containing travel time map in Geotiff format and a thumbnail image in PNG format.
date_of_dataset: August 2024
notes: |
A 100 m spatial resolution geotiff of walking travel time in seconds to nearest health facility in [country]. The data was generated using the Child Poverty and Access to Services (CPAS) software (10.5281/zenodo.4638563) and was created as part of the CPAS project within the Data for Children Collaborative. The travel time is calculated assuming walking speeds on all roads, tracks, paths and land cover types. A full description is available [here](https://doi.org/10.1038/s41597-022-01274-w) a video description of the data is also available [here](https://www.dataforchildrencollaborative.com/outputs/presentation-a-100m-resolution-travel-time-map)
@@ -21,7 +20,6 @@ walking:
motorised:
title: Motorised Travel Time (in seconds) to nearest health centre
description: This file is a zip archive containing travel time map in Geotiff format and a thumbnail image in PNG format.
date_of_dataset: August 2024
notes: |
A 100 m spatial resolution geotiff of motorised travel time in seconds to nearest health facility in [country]. The data was generated using the Child Poverty and Access to Services (CPAS) software (10.5281/zenodo.4638563) and was created as part of the CPAS project within the Data for Children Collaborative. The travel time is calculated assuming driving speeds of local public transport options on all-weather roads/asphalt roads and walking speeds on all other roads, tracks, paths and land cover types. A full description is available [here](https://doi.org/10.1038/s41597-022-01274-w) a video description of the data is also available [here](https://www.dataforchildrencollaborative.com/outputs/presentation-a-100m-resolution-travel-time-map)
@@ -42,5 +40,4 @@ tags:
- transportation
- africa

data_update_frequency: -1
date_of_dataset: August 2024
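For reference, the relocated top-level `date_of_dataset: August 2024` value is consumed in `dcc.py` via `datetime.strptime(..., "%B %Y")` before `dataset.set_time_period()` is called. A minimal sketch of that parsing step:

```python
from datetime import datetime

# "August 2024" matches the "%B %Y" format (full month name, four-digit year)
# that generate_dataset() uses before calling dataset.set_time_period().
time_period = datetime.strptime("August 2024", "%B %Y")
print(time_period)  # 2024-08-01 00:00:00
```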
120 changes: 60 additions & 60 deletions src/hdx/scraper/dcc/dcc.py
@@ -18,16 +18,14 @@


class DCC:
def __init__(
self, configuration: Configuration, retriever: Retrieve, temp_dir: str
):
def __init__(self, configuration: Configuration, retriever: Retrieve, temp_dir: str):
self._configuration = configuration
self._retriever = retriever
self._temp_dir = temp_dir
self.data = {}
self.data_url = self._configuration["data_url"]

def get_location_name(self, country_name) -> str:
def get_location_name(self, country_name: str) -> str:
"""Convert country name from data to HDX location name using
HDX Python Country
@@ -37,9 +35,44 @@ def get_location_name(self, country_name) -> str:
Returns:
str: HDX location name
"""
iso3 = Country.get_iso3_country_code_fuzzy(country_name)
location_name = Country.get_country_name_from_iso3(iso3[0])
return location_name
try:
iso3 = Country.get_iso3_country_code_fuzzy(country_name)
location_name = Country.get_country_name_from_iso3(iso3[0])
return location_name
except HDXError as e:
logger.error(f"Error retrieving location for country '{country_name}': {e}")
return "None"

def _parse_table_content(self, table_content: str) -> Dict:
"""Parse table content into a dictionary
Args:
table_content (str): Content of the table
Returns:
Dict: Parsed country data
"""
data = {}
rows = [row.strip() for row in table_content.split("\n") if row.strip()]
for row in rows:
parts = [col.strip() for col in row.split("|") if col.strip()]
if len(parts) < 3:
logger.warning(f"Incomplete row skipped: {row}")
continue

country, walking, motorised = parts[:3]
if country == "Tanzania_w_zanzibar":
continue

try:
location_name = self.get_location_name(country)
data[location_name] = {
"walking": walking,
"motorised": motorised,
}
except HDXError:
logger.warning(f"Skipping country '{country}' due to location resolution error.")
return data

def get_country_data(self, text: str) -> Dict:
"""Extract TIF file urls for each country from markdown file
@@ -50,43 +83,20 @@ def get_country_data(self, text: str) -> Dict:
Returns:
Dict: Dict containing country name with two urls each
"""
result = {}

# Find the table of country data
separator_pattern = r"\|\:\-+\|(:?-+\|)+"
separator_match = re.search(separator_pattern, text)
if separator_match:
# Country data starts after the separator line and
# ends at next double newline
table_start = separator_match.end()
table_end = text.find("\n\n", table_start)
table_content = text[table_start:table_end].strip()

# Split content by pipe character and strip any
# whitespace and empty strings
parts = [part.strip() for part in table_content.split("|")]
parts = [part for part in parts if part]

# Process parts to extract country, walking, motorised
for i in range(0, len(parts), 3):
if i + 2 < len(parts):
country = parts[i]

# Skip 'Tanzania_w_zanzibar'
if country == "Tanzania_w_zanzibar":
continue

location_name = self.get_location_name(country)
walking = parts[i + 1]
motorised = parts[i + 2]
result[location_name] = {
"walking": walking,
"motorised": motorised,
}
else:
print("Table separator line not found.")

return result
separator_pattern = re.compile(r"\|\:\-+\|(:?-+\|)+")
separator_match = separator_pattern.search(text)
if not separator_match:
logger.error("Table separator line not found.")
return {}

table_start = separator_match.end()
table_end = text.find("\n\n", table_start)
table_content = (
text[table_start:table_end].strip() if table_end != -1 else text[table_start:].strip()
)

return self._parse_table_content(table_content)

def get_data(self) -> List:
"""Download markdown file, get country data
@@ -95,22 +105,20 @@ def get_data(self) -> List:
None
Returns:
List: country names
List: Country names
"""
try:
# Download readme content
content = self._retriever.download_text(
self.data_url, filename="readme"
)
content = self._retriever.download_text(self.data_url, filename="readme")

# Extract country data
self.data = self.get_country_data(content)

except DownloadError:
logger.error(f"Could not get data from {self.data_url}")
return {}
return []

return [country for country in sorted(self.data)]
return sorted(self.data)

def generate_dataset(self, country_name: str) -> List[Dataset]:
datasets = []
@@ -136,30 +144,22 @@ def generate_dataset(self, country_name: str) -> List[Dataset]:
# Add dataset info
dataset.add_tags(self._configuration["tags"])
dataset_country_iso3 = Country.get_iso3_country_code(country_name)
dataset.set_expected_update_frequency(
self._configuration["data_update_frequency"]
)
dataset_time_period = datetime.strptime(
self._configuration["date_of_dataset"], "%B %Y"
)
dataset.set_time_period(dataset_time_period)
dataset.set_subnational(False)
dataset["methodology"] = "Other"
dataset["methodology_other"] = dataset_info["methodology_other"]

try:
dataset.add_country_location(dataset_country_iso3)
except HDXError:
logger.error(
f"Couldn't find country {dataset_country_iso3}, skipping"
)
return
logger.error(f"Couldn't find country {dataset_country_iso3}, skipping")
return []

# Create resource
resource_name = f"service_area_{country_name}_{data_type}.tif"
resource_description = dataset_info["description"].replace(
"[country]", country_name
)
resource_description = dataset_info["description"].replace("[country]", country_name)
resource = {
"name": resource_name,
"description": resource_description,
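To make the refactored parsing flow concrete, here is a minimal, self-contained sketch (not the project's code verbatim) that applies the same separator regex and row-splitting logic used by `get_country_data()` and `_parse_table_content()` to a trimmed copy of the fixture table added below; the HDX location-name lookup is left out so the snippet runs without an HDX configuration:

```python
import re

# Trimmed copy of the markdown table from tests/fixtures/input/readme.
SAMPLE = """| Country | Link to Walking Time Map | Link to Motorised Time Map |
|:--------|:-------------------------|:---------------------------|
| Algeria | https://s3.eidf.ac.uk/eidf158-walkingtraveltimemaps/service_area_Algeria_walking.tif | https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_Algeria_motorised.tif |
| Benin | https://s3.eidf.ac.uk/eidf158-walkingtraveltimemaps/service_area_Benin_walking.tif | https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_Benin_motorised.tif |

"""

# Same separator pattern as get_country_data(): the table body starts after
# the |:---|---| line and ends at the first blank line.
separator = re.compile(r"\|\:\-+\|(:?-+\|)+")
match = separator.search(SAMPLE)
table_start = match.end()
table_end = SAMPLE.find("\n\n", table_start)
table_content = (
    SAMPLE[table_start:table_end].strip() if table_end != -1 else SAMPLE[table_start:].strip()
)

# Row handling mirrors _parse_table_content(), minus the HDX location lookup.
data = {}
for row in (r.strip() for r in table_content.split("\n") if r.strip()):
    parts = [col.strip() for col in row.split("|") if col.strip()]
    if len(parts) < 3:
        continue
    country, walking, motorised = parts[:3]
    data[country] = {"walking": walking, "motorised": motorised}

print(data["Benin"]["motorised"])
# https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_Benin_motorised.tif
```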
Empty file removed tests/fixtures/.deleteme
Empty file.
Empty file removed tests/fixtures/input/.deleteme
Empty file.
50 changes: 50 additions & 0 deletions tests/fixtures/input/readme
@@ -0,0 +1,50 @@
# ChildPovetyAccesstoServicesV2
Datasets for CPASv2 Project

These datasets have been generated following the methodology developed by Watmough et al. (https://www.nature.com/articles/s41597-022-01274-w) but with 100m resolution instead.

Shield: [![CC BY-NC-SA 4.0][cc-by-nc-sa-shield]][cc-by-nc-sa]

This work is licensed under a
[Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International License][cc-by-nc-sa].

[![CC BY-NC-SA 4.0][cc-by-nc-sa-image]][cc-by-nc-sa]

[cc-by-nc-sa]: http://creativecommons.org/licenses/by-nc-sa/4.0/
[cc-by-nc-sa-image]: https://licensebuttons.net/l/by-nc-sa/4.0/88x31.png
[cc-by-nc-sa-shield]: https://img.shields.io/badge/License-CC%20BY--NC--SA%204.0-lightgrey.svg


| Country | Link to Walking Time Map | Link to Motorised Time Map |
|:-------------------------|:------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------|
| Algeria | https://s3.eidf.ac.uk/eidf158-walkingtraveltimemaps/service_area_Algeria_walking.tif | https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_Algeria_motorised.tif |
| Botswana | https://s3.eidf.ac.uk/eidf158-walkingtraveltimemaps/service_area_BWA_walking.tif | https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_BWA_motorised.tif |
| Benin | https://s3.eidf.ac.uk/eidf158-walkingtraveltimemaps/service_area_Benin_walking.tif | https://s3.eidf.ac.uk/eidf158-motorised-travel-time-maps/service_area_Benin_motorised.tif |

The travel speeds used for creating these datasets:
| Road type (per HOT OpenStreetMap Road tags) | Motorised Travel Speed (kmph) | Walking Travel Speed (kmph) |
|-----------------------------------------------|-------------------------------|-----------------------------|
| Bridleway | 4 | 4 |
| Bus Stop | 4 | 4 |
| Cycleway | 10 | 4 |
| Footway | 4 | 4 |
| Motorway | 50 | 4 |
| Motorway Link | 50 | 4 |
| Path | 4 | 4 |
| Pedestrian | 4 | 4 |
| Primary | 50 | 4 |
| Primary Link | 50 | 4 |
| Residential | 30 | 4 |
| Road | 30 | 4 |
| Secondary | 50 | 4 |
| Secondary Link | 50 | 4 |
| Steps | 3 | 3 |
| Tertiary | 25 | 4 |
| Tertiary Link | 25 | 4 |
| Track | 10 | 3 |
| Trunk | 50 | 4 |
| Trunk Link | 50 | 4 |
| Unclassified | 4 | 4 |
| Yes | 50 | 4 |

<br>
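A hypothetical test sketch (not one of the test files added by this commit, which are not shown above) illustrating how this fixture might be exercised: it only checks that the country table can be located with the separator regex used by `get_country_data()`; the fixture path is an assumption, taken relative to the repository root.

```python
import re
from pathlib import Path

FIXTURE = Path("tests/fixtures/input/readme")  # assumed location


def test_fixture_contains_country_table():
    text = FIXTURE.read_text()
    # Only the country table's separator row starts with "|:", so this match
    # lands on the first table, not the travel-speeds table further down.
    match = re.search(r"\|\:\-+\|(:?-+\|)+", text)
    assert match is not None
    table = text[match.end():text.find("\n\n", match.end())]
    for country in ("Algeria", "Botswana", "Benin"):
        assert country in table
```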
