From 8a2e5186552d98882a1e410c0d6b6f759f1fec05 Mon Sep 17 00:00:00 2001 From: Ben Outram Date: Thu, 7 Dec 2023 13:32:15 +0000 Subject: [PATCH] EES-4727 Add content_utils.get_content_block_text and rename utils.py to text_utils.py --- chatbot-prototype.code-workspace | 6 +++- .../services/methodology_service.py | 17 ++--------- data_ingestion/services/release_service.py | 29 ++++++------------- data_ingestion/services/vector_db_client.py | 2 +- data_ingestion/utils/__init__.py | 0 data_ingestion/utils/content_utils.py | 13 +++++++++ .../{utils.py => utils/text_utils.py} | 0 data_ingestion_tests/utils/__init__.py | 0 .../text_utils_test.py} | 2 +- 9 files changed, 31 insertions(+), 38 deletions(-) create mode 100644 data_ingestion/utils/__init__.py create mode 100644 data_ingestion/utils/content_utils.py rename data_ingestion/{utils.py => utils/text_utils.py} (100%) create mode 100644 data_ingestion_tests/utils/__init__.py rename data_ingestion_tests/{utils_test.py => utils/text_utils_test.py} (70%) diff --git a/chatbot-prototype.code-workspace b/chatbot-prototype.code-workspace index 8147958..57e9be5 100644 --- a/chatbot-prototype.code-workspace +++ b/chatbot-prototype.code-workspace @@ -52,7 +52,11 @@ "Response Automater Tests", "Data Ingestion API", "Data Ingestion API Tests" - ] + ], + "python.analysis.autoImportCompletions": true, + "python.analysis.inlayHints.functionReturnTypes": true, + "python.analysis.inlayHints.variableTypes": true, + "python.analysis.inlayHints.callArgumentNames": "partial" }, "extensions": { "recommendations": [ diff --git a/data_ingestion/services/methodology_service.py b/data_ingestion/services/methodology_service.py index 152c1c4..52801a2 100644 --- a/data_ingestion/services/methodology_service.py +++ b/data_ingestion/services/methodology_service.py @@ -1,9 +1,9 @@ import logging import requests -from bs4 import BeautifulSoup from ..config import settings +from ..utils.content_utils import get_content_block_text from .vector_db_client import delete_url logger = logging.getLogger(__name__) @@ -28,10 +28,9 @@ def fetch_methodology(slug: str) -> dict[str, str]: return { "link": f"{settings.ees_url_public_ui}/methodology/{slug}", - "text": get_general_content(res=response_json), + "text": get_content_block_text(res=response_json), } except requests.exceptions.HTTPError as err: - # TODO Why are some methodologies not found? if err.response.status_code == 404: logger.error(f"Methodology version for slug {slug} was not found") return {} @@ -39,18 +38,6 @@ def fetch_methodology(slug: str) -> dict[str, str]: raise -def get_general_content(res: dict) -> str: - content_sections = res["content"] - result = "Content: " - for section_index in range(len(content_sections)): - content_blocks = content_sections[section_index]["content"] - for block_index in range(len(content_blocks)): - content_block = content_blocks[block_index] - if content_block["type"] == "HtmlBlock": - result += BeautifulSoup(markup=content_block["body"], features="html.parser").get_text() - return result - - def fetch_methodology_slugs() -> list[str]: response = requests.get(url=f"{settings.ees_url_api_content}/methodology-themes") response.raise_for_status() diff --git a/data_ingestion/services/release_service.py b/data_ingestion/services/release_service.py index 1859a1d..428da5e 100644 --- a/data_ingestion/services/release_service.py +++ b/data_ingestion/services/release_service.py @@ -4,6 +4,7 @@ from bs4 import BeautifulSoup from ..config import settings +from ..utils.content_utils import get_content_block_text from .tablebuilder_service import fetch_data_block logger = logging.getLogger(__name__) @@ -21,17 +22,17 @@ def fetch_release(slug: str) -> dict[str, str]: logger.debug(f"Processing content for release id: {release_id}") - headlines_content = str(get_headlines_content(res=response_json)) - key_stats_content = get_key_statistics_content(release_id=release_id, res=response_json) - general_content = get_general_content(res=response_json) + headlines_text = str(get_headlines_text(res=response_json)) + key_stats_text = get_key_statistics_text(release_id=release_id, res=response_json) + content_block_text = get_content_block_text(res=response_json) return { "link": f"{settings.ees_url_public_ui}/find-statistics/{slug}", - "text": f"{headlines_content}{key_stats_content}{general_content}", + "text": f"{headlines_text}{key_stats_text}{content_block_text}", } -def get_headlines_content(res: dict) -> str | None: +def get_headlines_text(res: dict) -> str | None: headlines_section = res["headlinesSection"]["content"] if headlines_section: headlines_content_block = headlines_section[0] @@ -39,33 +40,21 @@ def get_headlines_content(res: dict) -> str | None: return f"Headline: {headlines}" -def get_key_statistics_content(release_id: str, res: dict) -> str | None: +def get_key_statistics_text(release_id: str, res: dict) -> str | None: key_statistics = res["keyStatistics"] if key_statistics: key_statistics_content = list( map( - lambda item: get_key_statistic_content(release_id=release_id, index_and_key_statistic=item), + lambda item: get_key_statistic_text(release_id=release_id, index_and_key_statistic=item), enumerate(key_statistics), ) ) return "Key statistic ".join(key_statistics_content) -def get_key_statistic_content(release_id: str, index_and_key_statistic: tuple[int, dict[str, str]]) -> str: +def get_key_statistic_text(release_id: str, index_and_key_statistic: tuple[int, dict[str, str]]) -> str: index, key_statistic = index_and_key_statistic data_block_id = key_statistic["dataBlockId"] return fetch_data_block( release_id=release_id, data_block_id=data_block_id, key_statistic=key_statistic, index=index ) - - -def get_general_content(res: dict) -> str: - content_sections = res["content"] - result = "Content: " - for section_index in range(len(content_sections)): - content_blocks = content_sections[section_index]["content"] - for block_index in range(len(content_blocks)): - content_block = content_blocks[block_index] - if content_block["type"] == "HtmlBlock": - result += BeautifulSoup(markup=content_block["body"], features="html.parser").get_text() - return result diff --git a/data_ingestion/services/vector_db_client.py b/data_ingestion/services/vector_db_client.py index 696ea6f..96b5871 100644 --- a/data_ingestion/services/vector_db_client.py +++ b/data_ingestion/services/vector_db_client.py @@ -7,7 +7,7 @@ from qdrant_client.http.exceptions import UnexpectedResponse from ..config import settings -from ..utils import chunk_text +from ..utils.text_utils import chunk_text logger = logging.getLogger(__name__) diff --git a/data_ingestion/utils/__init__.py b/data_ingestion/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_ingestion/utils/content_utils.py b/data_ingestion/utils/content_utils.py new file mode 100644 index 0000000..27d196c --- /dev/null +++ b/data_ingestion/utils/content_utils.py @@ -0,0 +1,13 @@ +from bs4 import BeautifulSoup + + +def get_content_block_text(res: dict) -> str: + content_sections = res["content"] + result = "Content: " + for section_index in range(len(content_sections)): + content_blocks = content_sections[section_index]["content"] + for block_index in range(len(content_blocks)): + content_block = content_blocks[block_index] + if content_block["type"] == "HtmlBlock": + result += BeautifulSoup(markup=content_block["body"], features="html.parser").get_text() + return result diff --git a/data_ingestion/utils.py b/data_ingestion/utils/text_utils.py similarity index 100% rename from data_ingestion/utils.py rename to data_ingestion/utils/text_utils.py diff --git a/data_ingestion_tests/utils/__init__.py b/data_ingestion_tests/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_ingestion_tests/utils_test.py b/data_ingestion_tests/utils/text_utils_test.py similarity index 70% rename from data_ingestion_tests/utils_test.py rename to data_ingestion_tests/utils/text_utils_test.py index 5f545bd..4bb2c2e 100644 --- a/data_ingestion_tests/utils_test.py +++ b/data_ingestion_tests/utils/text_utils_test.py @@ -1,4 +1,4 @@ -from data_ingestion.utils import temp_method_for_proof_of_concept_tests +from data_ingestion.utils.text_utils import temp_method_for_proof_of_concept_tests def test_that_tests_run():