Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EES-4727: Tidying up the data_ingestion code by adding types and spli… #29

Merged
merged 5 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 8 additions & 10 deletions data_ingestion/routers/maintenance.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,24 @@
router = APIRouter(prefix="/api/maintenance")


@router.post("/publications/build")
async def build_publications():
slugs = fetch_publication_slugs()
@router.post(path="/publications/build")
async def build_publications() -> JSONResponse:
try:
data_upsertion(slugs, extract_releases)
data_upsertion(records=extract_releases(slugs=fetch_publication_slugs()))
except Exception as e:
return JSONResponse(status_code=500, content={"Content": e})
return JSONResponse(status_code=200, content={"Content": "Successful"})


@router.post("/methodologies/build")
async def build_methodologies():
slugs = fetch_methodology_slugs()
@router.post(path="/methodologies/build")
async def build_methodologies() -> JSONResponse:
try:
data_upsertion(slugs, extract_methodologies)
data_upsertion(records=extract_methodologies(slugs=fetch_methodology_slugs()))
except Exception as e:
return JSONResponse(status_code=500, content={"Content": e})
return JSONResponse(status_code=200, content={"Content": "Successful"})


@router.delete("/clear", status_code=status.HTTP_204_NO_CONTENT)
async def clear():
@router.delete(path="/clear", status_code=status.HTTP_204_NO_CONTENT)
async def clear() -> None:
recreate_collection()
6 changes: 3 additions & 3 deletions data_ingestion/routers/methodologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
router = APIRouter(prefix="/api/methodologies")


@router.post("/{slug}/update")
def update(slug: str):
@router.post(path="/{slug}/update")
def update(slug: str) -> JSONResponse:
try:
delete_methodology(slug=slug)
data_upsertion([slug], extract_methodologies)
data_upsertion(records=extract_methodologies(slugs=[slug]))
except Exception as e:
return JSONResponse(status_code=500, content={"Content": e})
return JSONResponse(status_code=200, content={"Content": "Succesful"})
6 changes: 3 additions & 3 deletions data_ingestion/routers/publications.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
router = APIRouter(prefix="/api/publications")


@router.post("/{slug}/update")
async def update(slug: str):
@router.post(path="/{slug}/update")
async def update(slug: str) -> JSONResponse:
try:
delete_publication(slug=slug)
data_upsertion([slug], extract_releases)
data_upsertion(records=extract_releases(slugs=[slug]))
except Exception as e:
return JSONResponse(status_code=500, content={"Content": e})
return JSONResponse(status_code=200, content={"Content": "Successful"})
81 changes: 42 additions & 39 deletions data_ingestion/services/methodology_service.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging

import requests
Expand All @@ -10,50 +9,54 @@
logger = logging.getLogger(__name__)


def delete_methodology(slug: str) -> None:
    """Remove the indexed document for the methodology with the given slug.

    NOTE(review): the URL joins "methodology" and the slug with no "/"
    separator, unlike delete_publication's "/find-statistics/{slug}" —
    looks like a missing slash; confirm against the content API routes.
    """
    delete_url(url=f"{settings.ees_url_api_content}/methodology{slug}")


def extract_methodologies(slugs: list[str]) -> list[dict[str, str]]:
    """Fetch the public link and flattened text content for each methodology slug.

    Note: fetch_methodology returns an empty dict for slugs the content API
    cannot find (404), so the result may contain empty entries.
    """
    return [fetch_methodology(slug) for slug in slugs]


def fetch_methodology(slug: str) -> dict[str, str]:
    """Fetch one methodology from the content API and flatten it for indexing.

    Returns:
        A dict with the methodology's public UI "link" and its extracted
        "text" content, or an empty dict when the content API has no
        methodology version for the slug (404).

    Raises:
        requests.exceptions.HTTPError: for any non-404 HTTP failure.
    """
    try:
        response = requests.get(url=f"{settings.ees_url_api_content}/methodologies/{slug}")
        response.raise_for_status()
        response_json = response.json()
        methodology_version_id = response_json["id"]

        logger.debug(f"Processing content for methodology version: {methodology_version_id}")

        return {
            "link": f"{settings.ees_url_public_ui}/methodology/{slug}",
            "text": get_general_content(res=response_json),
        }
    except requests.exceptions.HTTPError as err:
        # TODO Why are some methodologies not found?
        if err.response.status_code == 404:
            logger.error(f"Methodology version for slug {slug} was not found")
            return {}
        raise


def get_general_content(res: dict) -> str:
    """Concatenate the plain text of every HtmlBlock in the response's content sections.

    Non-HtmlBlock content blocks (e.g. data blocks) are skipped.
    """
    result = "Content: "
    # Iterate the sections/blocks directly instead of double range(len(...))
    # indexing — the indices were never used for anything but lookup.
    for content_section in res["content"]:
        for content_block in content_section["content"]:
            if content_block["type"] == "HtmlBlock":
                result += BeautifulSoup(markup=content_block["body"], features="html.parser").get_text()
    return result


def fetch_methodology_slugs() -> list[str]:
response = requests.get(url=f"{settings.ees_url_api_content}/methodology-themes")
response.raise_for_status()
response_json = response.json()
slugs: list[str] = []
for item in response_json:
for topic in item["topics"]:
for publication in topic["publications"]:
for methodology in publication["methodologies"]:
Expand Down
24 changes: 9 additions & 15 deletions data_ingestion/services/publication_service.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import logging

import requests
Expand All @@ -9,20 +8,15 @@
logger = logging.getLogger(__name__)


def delete_publication(slug: str) -> None:
    """Remove the indexed document for the publication with the given slug."""
    delete_url(url=f"{settings.ees_url_public_ui}/find-statistics/{slug}")


def fetch_publication_slugs() -> list[str]:
    """Return the slug of every publication, ordered by publish date ascending.

    Raises:
        requests.HTTPError: if the content API request fails. (Errors are
        deliberately propagated so callers return a 500 rather than silently
        indexing nothing.)
    """
    response = requests.get(
        url=f"{settings.ees_url_api_content}/publications?page=1&pageSize=9999&sort=published&order=asc"
    )
    response.raise_for_status()
    publications = response.json()["results"]
    return [publication["slug"] for publication in publications]
111 changes: 61 additions & 50 deletions data_ingestion/services/release_service.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,71 @@
import logging
from typing import Dict, List

import requests
from bs4 import BeautifulSoup

from ..config import settings
from .tablebuilder_service import fetch_key_stat
from .tablebuilder_service import fetch_data_block

logger = logging.getLogger(__name__)


def extract_releases(slugs: list[str]) -> list[dict[str, str]]:
    """Fetch the public link and flattened text content of the latest release
    for each publication slug."""
    return [fetch_release(slug) for slug in slugs]


def fetch_release(slug: str) -> dict[str, str]:
    """Fetch a publication's latest release and flatten it for indexing.

    Returns:
        A dict with the release's public UI "link" and its combined
        headline / key-statistics / general "text" content.

    Raises:
        requests.HTTPError: if the content API request fails.
    """
    response = requests.get(url=f"{settings.ees_url_api_content}/publications/{slug}/releases/latest")
    response.raise_for_status()
    response_json = response.json()
    release_id = response_json["id"]

    logger.debug(f"Processing content for release id: {release_id}")

    # `or ""`: both helpers return None when their section is absent; without
    # the coalesce, str()/f-string formatting would embed the literal text
    # "None" into the indexed document.
    headlines_content = get_headlines_content(res=response_json) or ""
    key_stats_content = get_key_statistics_content(release_id=release_id, res=response_json) or ""
    general_content = get_general_content(res=response_json)

    return {
        "link": f"{settings.ees_url_public_ui}/find-statistics/{slug}",
        "text": f"{headlines_content}{key_stats_content}{general_content}",
    }


def get_headlines_content(res: dict) -> str | None:
    """Extract the headline text from the first headlines content block.

    Returns None when the headlines section has no content blocks.
    """
    content_blocks = res["headlinesSection"]["content"]
    if not content_blocks:
        return None
    headline_text = BeautifulSoup(markup=content_blocks[0]["body"], features="html.parser").get_text()
    return f"Headline: {headline_text}"


def get_key_statistics_content(release_id: str, res: dict) -> str | None:
    """Render every key statistic of the release and join them into one string.

    Returns None when the release has no key statistics.
    """
    key_statistics = res["keyStatistics"]
    if not key_statistics:
        return None
    rendered = [
        get_key_statistic_content(release_id=release_id, index_and_key_statistic=indexed_statistic)
        for indexed_statistic in enumerate(key_statistics)
    ]
    return "Key statistic ".join(rendered)


def get_key_statistic_content(release_id: str, index_and_key_statistic: tuple[int, dict[str, str]]) -> str:
    """Render a single (index, key statistic) pair via its table-builder data block."""
    index, key_statistic = index_and_key_statistic
    return fetch_data_block(
        release_id=release_id,
        data_block_id=key_statistic["dataBlockId"],
        key_statistic=key_statistic,
        index=index,
    )


def get_general_content(res: dict) -> str:
    """Concatenate the plain text of every HtmlBlock in the response's content sections.

    Non-HtmlBlock content blocks (e.g. data blocks) are skipped.
    """
    result = "Content: "
    # Iterate the sections/blocks directly instead of double range(len(...))
    # indexing — the indices were never used for anything but lookup.
    for content_section in res["content"]:
        for content_block in content_section["content"]:
            if content_block["type"] == "HtmlBlock":
                result += BeautifulSoup(markup=content_block["body"], features="html.parser").get_text()
    return result
22 changes: 12 additions & 10 deletions data_ingestion/services/tablebuilder_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,27 @@
logger = logging.getLogger(__name__)


def fetch_data_block(release_id: str, data_block_id: str, key_statistic: dict[str, str], index: int) -> str:
    """Render one key statistic as "N: label-measure[ unit][ trend].".

    Fetches the statistic's data block from the table-builder API; if the
    request or response parsing fails for any reason, falls back to the
    title/statistic values already present on `key_statistic` (best-effort —
    this must never fail the whole ingestion run).
    """
    try:
        response = requests.get(
            url=f"{settings.ees_url_api_data}/tablebuilder/release/{release_id}/data-block/{data_block_id}"
        )
        response.raise_for_status()
        response_json = response.json()
        label = response_json["subjectMeta"]["indicators"][0]["label"]
        measure = list(response_json["results"][0]["measures"].values())[0]
        try:
            unit = response_json["subjectMeta"]["indicators"][0]["unit"]
            measure = f"{measure}{unit}"
        except KeyError:
            logger.error("No unit found")
    except Exception:
        # Fallback: use the values embedded on the key statistic itself.
        label = key_statistic["title"]
        measure = key_statistic["statistic"]

    # Explicit membership test instead of try/except KeyError control flow.
    if "trend" in key_statistic:
        return f"{index + 1}: {label}-{measure} {key_statistic['trend']}."
    return f"{index + 1}: {label}-{measure}."
Loading