
Commit: Remove duplicate lines
mcarans committed Nov 21, 2024
1 parent fdab0a3 commit 0e01f8d
Showing 6 changed files with 98 additions and 735 deletions.
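The de-duplication strategy visible in the diff below: rows are now accumulated in dictionaries keyed by country, admin unit and parsed date range instead of being appended to lists, so a repeated row is logged and dropped rather than stored twice. A minimal sketch of that pattern, with a simplified key shape and made-up values (the real `add_row` in pipeline.py keys on ISO3, admin1 code/name and parsed start/end dates):

```python
import logging
from typing import Dict, Tuple

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

Key = Tuple[str, str, str]  # simplified: (countryiso3, admin1_code, date_range)


def add_row(key: Key, row: Dict, store: Dict, msg: str) -> None:
    # A keyed store makes duplicates detectable: the second arrival of the
    # same (country, admin unit, period) combination is reported and skipped.
    if key in store:
        logger.error(f"Key {key} already exists in {msg}!")
        return
    store[key] = row


rows: Dict[Key, Dict] = {}
add_row(("AFG", "AF01", "2022-2023"), {"mpi": 0.272}, rows, "mpi_subnational")  # made-up values
add_row(("AFG", "AF01", "2022-2023"), {"mpi": 0.272}, rows, "mpi_subnational")  # duplicate: logged, dropped
assert len(rows) == 1
```

Keying on the parsed start/end dates, as the diff's `add_row` does, lets the same region appear for different periods while exact repeats are dropped.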
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -35,7 +35,7 @@ requires-python = ">=3.8"

dependencies = [
"hdx-python-api>= 6.3.5",
"hdx-python-country>= 3.8.3",
"hdx-python-country>= 3.8.5",
"hdx-python-utilities>= 3.7.4",
]
dynamic = ["version"]
10 changes: 5 additions & 5 deletions requirements.txt
@@ -44,7 +44,7 @@ frictionless==5.18.0
# via hdx-python-utilities
hdx-python-api==6.3.5
# via hdx-scraper-ophi (pyproject.toml)
-hdx-python-country==3.8.4
+hdx-python-country==3.8.5
# via
# hdx-scraper-ophi (pyproject.toml)
# hdx-python-api
@@ -121,9 +121,9 @@ pockets==0.9.1
# via sphinxcontrib-napoleon
pre-commit==4.0.1
# via hdx-scraper-ophi (pyproject.toml)
-pydantic==2.9.2
+pydantic==2.10.0
# via frictionless
-pydantic-core==2.23.4
+pydantic-core==2.27.0
# via pydantic
pygments==2.18.0
# via rich
@@ -183,7 +183,7 @@ ruamel-yaml==0.18.6
# via hdx-python-utilities
ruamel-yaml-clib==0.2.12
# via ruamel-yaml
-setuptools==75.5.0
+setuptools==75.6.0
# via ckanapi
shellingham==1.5.4
# via typer
@@ -240,7 +240,7 @@ xlrd==2.0.1
# via hdx-python-utilities
xlrd3==1.1.0
# via libhxl
-xlsx2csv==0.8.3
+xlsx2csv==0.8.4
# via hdx-python-utilities
xlsxwriter==3.2.0
# via tableschema-to-template
2 changes: 1 addition & 1 deletion src/hdx/scraper/ophi/config/project_configuration.yaml
@@ -21,7 +21,7 @@ datasetinfo:

showcaseinfo:
# https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/edit?gid=0#gid=0
urls: "https://docs.google.com/spreadsheets/d/1mChJ1UhgLtqLD-hqbFxd5eKq-L7Nz6awD2znBcEkASs/export?format=csv&gid=0"
urls: "https://docs.google.com/spreadsheets/d/e/2PACX-1vQPXtof5E54tGcQcDOUVwKMV9Kelkt_KqyiYCfGtSUg1B7EoMe7lfoVIHeaL2ij6fyxytplaJQojxyp/pub?gid=0&single=true&output=csv"
notes: "The visual contains sub-national multidimensional poverty data from the country briefs published by the Oxford Poverty and Human Development Initiative (OPHI), University of Oxford."

hxltags:
10 changes: 5 additions & 5 deletions src/hdx/scraper/ophi/dataset_generator.py
@@ -1,6 +1,6 @@
import logging
from copy import copy
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional

from slugify import slugify

@@ -117,8 +117,8 @@ def generate_showcase(
def generate_dataset(
self,
folder: str,
-        standardised_rows: List[Dict],
-        standardised_trend_rows: List[Dict],
+        standardised_rows: Iterable,
+        standardised_trend_rows: Iterable,
countryiso3: str,
countryname: str,
date_range: Dict,
@@ -180,8 +180,8 @@ def generate_dataset(
def generate_global_dataset(
self,
folder: str,
-        standardised_rows: List[Dict],
-        standardised_trend_rows: List[Dict],
+        standardised_rows: Iterable,
+        standardised_trend_rows: Iterable,
date_range: Dict,
) -> Optional[Dataset]:
if not standardised_rows:
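The switch from `List[Dict]` to `Iterable` in these signatures lines up with the pipeline getters further down, which now return dict views rather than lists; a `dict.values()` view iterates fine but is not a `list`. A small illustration, assuming a hypothetical keyed store and an illustrative `count_rows` helper (not part of the project):

```python
from typing import Dict, Iterable, Tuple


def count_rows(standardised_rows: Iterable) -> int:
    # Accepts lists, dict views, generators -- anything iterable.
    return sum(1 for _ in standardised_rows)


store: Dict[Tuple, Dict] = {("AFG", "", ""): {"mpi": 0.272}}  # hypothetical row store
print(count_rows(store.values()))  # dict_values satisfies Iterable but not List[Dict]
```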
131 changes: 86 additions & 45 deletions src/hdx/scraper/ophi/pipeline.py
@@ -1,11 +1,15 @@
-from typing import Dict, List, Tuple
+import logging
+from datetime import datetime
+from typing import Dict, Iterable, List, Tuple

from hdx.api.configuration import Configuration
from hdx.location.adminlevel import AdminLevel
from hdx.utilities.dateparse import parse_date_range
-from hdx.utilities.dictandlist import dict_of_lists_add
+from hdx.utilities.dictandlist import dict_of_dicts_add
from hdx.utilities.retriever import Retrieve

+logger = logging.getLogger(__name__)


class Pipeline:
def __init__(
@@ -17,15 +21,15 @@ def __init__(
self._retriever = retriever
self._adminone = AdminLevel(admin_level=1, retriever=self._retriever)
self._adminone.setup_from_url()
-        self._standardised_global = []
-        self._standardised_global_trend = []
+        self._standardised_global = {}
+        self._standardised_global_trend = [{}, {}]
self._standardised_countries = {}
-        self._standardised_countries_trend = {}
+        self._standardised_countries_trend = [{}, {}]
self._date_ranges = {}

def process_date(
self, countryiso3: str, date_range: str, row: Dict
-    ) -> None:
+    ) -> Tuple[datetime, datetime]:
date_range = date_range.split("-")
if len(date_range) == 2:
start_date, _ = parse_date_range(date_range[0])
@@ -52,6 +56,26 @@ def update_date_range(countryiso3: str):

update_date_range(countryiso3)
update_date_range("global")
+        return start_date, end_date

+    def add_row(
+        self,
+        countryiso3: str,
+        admin1_code: str,
+        admin1_name: str,
+        date_range: str,
+        row: Dict,
+        global_dict: Dict,
+        country_dict: Dict,
+        msg: str,
+    ) -> None:
+        start_date, end_date = self.process_date(countryiso3, date_range, row)
+        key = (countryiso3, admin1_code, admin1_name, start_date, end_date)
+        if key in global_dict:
+            logger.error(f"Key {key} already exists in {msg}!")
+            return
+        global_dict[key] = row
+        dict_of_dicts_add(country_dict, countryiso3, key, row)

def read_mpi_national_data(
self, path: str, format: str, sheet: str, headers: List[str]
@@ -88,9 +112,16 @@ def read_mpi_national_data(
"Multidimensional poverty In severe poverty (severity 50% or higher) % Population"
]
date_range = inrow["MPI data source Year"]
-            self.process_date(countryiso3, date_range, row)
-            self._standardised_global.append(row)
-            dict_of_lists_add(self._standardised_countries, countryiso3, row)
+            self.add_row(
+                countryiso3,
+                "",
+                "",
+                date_range,
+                row,
+                self._standardised_global,
+                self._standardised_countries,
+                "mpi_national",
+            )

def read_mpi_subnational_data(
self, path: str, format: str, sheet: str, headers: List[str]
@@ -129,11 +160,18 @@ def read_mpi_subnational_data(
"Multidimensional poverty by region In severe poverty % Population"
]
date_range = inrow["MPI data source Year"]
-            self.process_date(countryiso3, date_range, row)
-            self._standardised_global.append(row)
-            dict_of_lists_add(self._standardised_countries, countryiso3, row)
+            self.add_row(
+                countryiso3,
+                admin1_code,
+                admin1_name,
+                date_range,
+                row,
+                self._standardised_global,
+                self._standardised_countries,
+                "mpi_subnational",
+            )

-    def read_national_trends_data(
+    def read_trends_national_data(
self, path: str, format: str, sheet: str, headers: List[str]
) -> None:
_, iterator = self._retriever.downloader.get_tabular_rows(
@@ -147,18 +185,11 @@ def read_national_trends_data(
countryiso3 = inrow["ISO country code"]
if not countryiso3:
continue
admin1_name = inrow.get("Region")
if admin1_name:
admin1_code, _ = self._adminone.get_pcode(
countryiso3, admin1_name
)
else:
admin1_code = ""
for i, timepoint in enumerate(("t0", "t1")):
row = {
"country_code": countryiso3,
"admin1_code": admin1_code,
"admin1_name": admin1_name,
"admin1_code": "",
"admin1_name": "",
}
row["mpi"] = inrow[
f"Multidimensional Poverty Index (MPIT) {timepoint} Range 0 to 1"
Expand All @@ -176,13 +207,18 @@ def read_national_trends_data(
f"In severe poverty {timepoint} % pop."
]
date_range = inrow[f"MPI data source {timepoint} Year"]
-                self.process_date(countryiso3, date_range, row)
-                self._standardised_global_trend.append(row)
-                dict_of_lists_add(
-                    self._standardised_countries_trend, countryiso3, row
+                self.add_row(
+                    countryiso3,
+                    "",
+                    "",
+                    date_range,
+                    row,
+                    self._standardised_global_trend[i],
+                    self._standardised_countries_trend[i],
"trends_subnational",
)

-    def read_subnational_trends_data(
+    def read_trends_subnational_data(
self, path: str, format: str, sheet: str, headers: List[str]
) -> None:
_, iterator = self._retriever.downloader.get_tabular_rows(
@@ -196,13 +232,8 @@ def read_subnational_trends_data(
countryiso3 = inrow["ISO country code"]
if not countryiso3:
continue
admin1_name = inrow.get("Region")
if admin1_name:
admin1_code, _ = self._adminone.get_pcode(
countryiso3, admin1_name
)
else:
admin1_code = ""
admin1_name = inrow["Region"]
admin1_code, _ = self._adminone.get_pcode(countryiso3, admin1_name)
for i, timepoint in enumerate(("t0", "t1")):
row = {
"country_code": countryiso3,
@@ -225,10 +256,15 @@
f"In severe poverty {timepoint} % pop."
]
date_range = inrow[f"MPI data source {timepoint} Year"]
-                self.process_date(countryiso3, date_range, row)
-                self._standardised_global_trend.append(row)
-                dict_of_lists_add(
-                    self._standardised_countries_trend, countryiso3, row
+                self.add_row(
+                    countryiso3,
+                    admin1_code,
+                    admin1_name,
+                    date_range,
+                    row,
+                    self._standardised_global_trend[i],
+                    self._standardised_countries_trend[i],
+                    "trends_subnational",
)

def process(self) -> Tuple[str, str, str]:
@@ -261,23 +297,28 @@ def process(self) -> Tuple[str, str, str]:
url, "trends-over-time-mpi.xlsx"
)
sheet = trend_over_time["national_sheet"]
-        self.read_national_trends_data(trend_path, format, sheet, headers)
+        self.read_trends_national_data(trend_path, format, sheet, headers)
sheet = trend_over_time["subnational_sheet"]
-        self.read_subnational_trends_data(trend_path, format, sheet, headers)
+        self.read_trends_subnational_data(trend_path, format, sheet, headers)

return mpi_national_path, mpi_subnational_path, trend_path

-    def get_standardised_global(self) -> List:
-        return self._standardised_global
+    def get_standardised_global(self) -> Iterable:
+        return self._standardised_global.values()

def get_standardised_countries(self) -> Dict:
return self._standardised_countries

-    def get_standardised_global_trend(self) -> List:
-        return self._standardised_global_trend
+    def get_standardised_global_trend(self) -> Iterable:
+        self._standardised_global_trend[0].update(
+            self._standardised_global_trend[1]
+        )
+        return self._standardised_global_trend[0].values()

def get_standardised_countries_trend(self) -> Dict:
-        return self._standardised_countries_trend
+        for key, value in self._standardised_countries_trend[0].items():
+            value.update(self._standardised_countries_trend[1][key])
+        return self._standardised_countries_trend[0]

def get_date_ranges(self) -> Dict:
return self._date_ranges
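Trend rows are now kept in a pair of keyed stores, one per timepoint (t0, t1), and the getters fold the t1 store into the t0 store before returning, so both timepoints come back as a single de-duplicated collection. A compact sketch of that merge with made-up keys and values:

```python
# One keyed store per timepoint, mirroring _standardised_global_trend = [{}, {}].
t0 = {("AFG", "", "", 2015, 2016): {"mpi": 0.35}}  # hypothetical keys/values
t1 = {("AFG", "", "", 2022, 2023): {"mpi": 0.27}}
global_trend = [t0, t1]

# As in get_standardised_global_trend: fold t1 into t0 and return the row view.
global_trend[0].update(global_trend[1])
for row in global_trend[0].values():
    print(row)  # one row per unique (country, admin, period) key, both timepoints
```

The country-level getter applies the same fold per country before returning its dict.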
