Skip to content

Commit

Permalink
EES-4727 Add content_utils.get_content_block_text and rename utils.py…
Browse files Browse the repository at this point in the history
… to text_utils.py
  • Loading branch information
benoutram committed Dec 7, 2023
1 parent 0ebfd80 commit 8a2e518
Show file tree
Hide file tree
Showing 9 changed files with 31 additions and 38 deletions.
6 changes: 5 additions & 1 deletion chatbot-prototype.code-workspace
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@
"Response Automater Tests",
"Data Ingestion API",
"Data Ingestion API Tests"
]
],
"python.analysis.autoImportCompletions": true,
"python.analysis.inlayHints.functionReturnTypes": true,
"python.analysis.inlayHints.variableTypes": true,
"python.analysis.inlayHints.callArgumentNames": "partial"
},
"extensions": {
"recommendations": [
Expand Down
17 changes: 2 additions & 15 deletions data_ingestion/services/methodology_service.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import logging

import requests
from bs4 import BeautifulSoup

from ..config import settings
from ..utils.content_utils import get_content_block_text
from .vector_db_client import delete_url

logger = logging.getLogger(__name__)
Expand All @@ -28,29 +28,16 @@ def fetch_methodology(slug: str) -> dict[str, str]:

return {
"link": f"{settings.ees_url_public_ui}/methodology/{slug}",
"text": get_general_content(res=response_json),
"text": get_content_block_text(res=response_json),
}
except requests.exceptions.HTTPError as err:
# TODO Why are some methodologies not found?
if err.response.status_code == 404:
logger.error(f"Methodology version for slug {slug} was not found")
return {}
else:
raise


def get_general_content(res: dict) -> str:
    """Return the plain text of all HTML blocks in a methodology payload.

    Iterates over the sections in ``res["content"]`` and over the blocks in
    each section's ``"content"`` list, in order. Blocks whose ``"type"`` is
    ``"HtmlBlock"`` have their ``"body"`` markup parsed with BeautifulSoup's
    ``html.parser`` and reduced to text; every other block type is skipped.

    Args:
        res: Decoded response payload with a ``"content"`` list of sections,
            each holding a ``"content"`` list of blocks.

    Returns:
        ``"Content: "`` followed by the block texts concatenated with no
        separator. When there are no HTML blocks the result is exactly
        ``"Content: "``.

    Raises:
        KeyError: If ``"content"``, ``"type"`` or (for HTML blocks) ``"body"``
            is missing.
    """
    # Gather fragments and join once instead of growing a string with `+=`.
    parts = ["Content: "]
    for section in res["content"]:
        for block in section["content"]:
            if block["type"] == "HtmlBlock":
                parts.append(BeautifulSoup(markup=block["body"], features="html.parser").get_text())
    return "".join(parts)


def fetch_methodology_slugs() -> list[str]:
response = requests.get(url=f"{settings.ees_url_api_content}/methodology-themes")
response.raise_for_status()
Expand Down
29 changes: 9 additions & 20 deletions data_ingestion/services/release_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from bs4 import BeautifulSoup

from ..config import settings
from ..utils.content_utils import get_content_block_text
from .tablebuilder_service import fetch_data_block

logger = logging.getLogger(__name__)
Expand All @@ -21,51 +22,39 @@ def fetch_release(slug: str) -> dict[str, str]:

logger.debug(f"Processing content for release id: {release_id}")

headlines_content = str(get_headlines_content(res=response_json))
key_stats_content = get_key_statistics_content(release_id=release_id, res=response_json)
general_content = get_general_content(res=response_json)
headlines_text = str(get_headlines_text(res=response_json))
key_stats_text = get_key_statistics_text(release_id=release_id, res=response_json)
content_block_text = get_content_block_text(res=response_json)

return {
"link": f"{settings.ees_url_public_ui}/find-statistics/{slug}",
"text": f"{headlines_content}{key_stats_content}{general_content}",
"text": f"{headlines_text}{key_stats_text}{content_block_text}",
}


# NOTE(review): this is a diff hunk with its +/- markers lost. The two `def`
# lines are presumably the pre-rename (`..._content`) and post-rename
# (`..._text`) signatures of one function; the body below is shared.
def get_headlines_content(res: dict) -> str | None:
def get_headlines_text(res: dict) -> str | None:
# Reads the block list stored under res["headlinesSection"]["content"].
headlines_section = res["headlinesSection"]["content"]
# Only a non-empty block list produces output.
if headlines_section:
# Only the first block is used; any further blocks are ignored.
headlines_content_block = headlines_section[0]
# Reduce the block's "body" markup to plain text with the html.parser backend.
headlines = BeautifulSoup(markup=headlines_content_block["body"], features="html.parser").get_text()
return f"Headline: {headlines}"
# There is no return after the `if`, so a falsy headlines section yields None.


# NOTE(review): this is a diff hunk with its +/- markers lost. The paired `def`
# lines and paired `lambda` lines are presumably the pre-rename
# (`..._content`) and post-rename (`..._text`) versions of the same code.
def get_key_statistics_content(release_id: str, res: dict) -> str | None:
def get_key_statistics_text(release_id: str, res: dict) -> str | None:
# Reads the key statistics collection from the release payload.
key_statistics = res["keyStatistics"]
# Only a non-empty collection produces output.
if key_statistics:
# Each (index, key_statistic) pair from enumerate() is handed to the
# per-statistic helper; the results are collected into a list.
key_statistics_content = list(
map(
lambda item: get_key_statistic_content(release_id=release_id, index_and_key_statistic=item),
lambda item: get_key_statistic_text(release_id=release_id, index_and_key_statistic=item),
enumerate(key_statistics),
)
)
# NOTE(review): str.join uses "Key statistic " as a separator, so it appears
# between items and not before the first one. Confirm this is intended.
return "Key statistic ".join(key_statistics_content)
# There is no return after the `if`, so a falsy "keyStatistics" value yields None.


# NOTE(review): this is a diff hunk with its +/- markers lost. The two `def`
# lines are presumably the pre-rename (`..._content`) and post-rename
# (`..._text`) signatures of one function; the body below is shared.
def get_key_statistic_content(release_id: str, index_and_key_statistic: tuple[int, dict[str, str]]) -> str:
def get_key_statistic_text(release_id: str, index_and_key_statistic: tuple[int, dict[str, str]]) -> str:
# Unpack the (position, key statistic) pair; the annotation declares it as
# tuple[int, dict[str, str]].
index, key_statistic = index_and_key_statistic
# The statistic's "dataBlockId" identifies which data block to fetch.
data_block_id = key_statistic["dataBlockId"]
# Delegates to fetch_data_block and returns its result unchanged. The return
# annotation says str; the helper is not visible here, so verify against it.
return fetch_data_block(
release_id=release_id, data_block_id=data_block_id, key_statistic=key_statistic, index=index
)


def get_general_content(res: dict) -> str:
    """Return the plain text of all HTML blocks in a release payload.

    Iterates over the sections in ``res["content"]`` and over the blocks in
    each section's ``"content"`` list, in order. Blocks whose ``"type"`` is
    ``"HtmlBlock"`` have their ``"body"`` markup parsed with BeautifulSoup's
    ``html.parser`` and reduced to text; every other block type is skipped.

    Args:
        res: Decoded response payload with a ``"content"`` list of sections,
            each holding a ``"content"`` list of blocks.

    Returns:
        ``"Content: "`` followed by the block texts concatenated with no
        separator. When there are no HTML blocks the result is exactly
        ``"Content: "``.

    Raises:
        KeyError: If ``"content"``, ``"type"`` or (for HTML blocks) ``"body"``
            is missing.
    """
    # Gather fragments and join once instead of growing a string with `+=`.
    parts = ["Content: "]
    for section in res["content"]:
        for block in section["content"]:
            if block["type"] == "HtmlBlock":
                parts.append(BeautifulSoup(markup=block["body"], features="html.parser").get_text())
    return "".join(parts)
2 changes: 1 addition & 1 deletion data_ingestion/services/vector_db_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from qdrant_client.http.exceptions import UnexpectedResponse

from ..config import settings
from ..utils import chunk_text
from ..utils.text_utils import chunk_text

logger = logging.getLogger(__name__)

Expand Down
Empty file.
13 changes: 13 additions & 0 deletions data_ingestion/utils/content_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from bs4 import BeautifulSoup


def get_content_block_text(res: dict) -> str:
    """Return the plain text of every HTML content block in a payload.

    Iterates over the sections in ``res["content"]`` and over the blocks in
    each section's ``"content"`` list, in order. Blocks whose ``"type"`` is
    ``"HtmlBlock"`` have their ``"body"`` markup parsed with BeautifulSoup's
    ``html.parser`` and reduced to text; every other block type is skipped.

    Args:
        res: Decoded response payload with a ``"content"`` list of sections,
            each holding a ``"content"`` list of blocks.

    Returns:
        ``"Content: "`` followed by the block texts concatenated with no
        separator. When there are no HTML blocks the result is exactly
        ``"Content: "``.

    Raises:
        KeyError: If ``"content"``, ``"type"`` or (for HTML blocks) ``"body"``
            is missing.
    """
    # Gather fragments and join once instead of growing a string with `+=`.
    parts = ["Content: "]
    for section in res["content"]:
        for block in section["content"]:
            if block["type"] == "HtmlBlock":
                parts.append(BeautifulSoup(markup=block["body"], features="html.parser").get_text())
    return "".join(parts)
File renamed without changes.
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from data_ingestion.utils import temp_method_for_proof_of_concept_tests
from data_ingestion.utils.text_utils import temp_method_for_proof_of_concept_tests


def test_that_tests_run():
Expand Down

0 comments on commit 8a2e518

Please sign in to comment.