Skip to content

Commit

Permalink
Fix get TOC endpoint
Browse files Browse the repository at this point in the history
  • Loading branch information
gabriel-piles committed Oct 12, 2023
1 parent 9e593d0 commit 3a6a090
Show file tree
Hide file tree
Showing 10 changed files with 21 additions and 15 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ start_detached:
docker compose up --build -d

start_for_testing:
docker compose down; docker compose up --build -d
docker compose down; docker compose up --attach pdf_paragraphs_extraction_worker --attach pdf_paragraphs_extraction_api --build

stop:
docker compose stop
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: "3.8"
services:
pdf_paragraphs_extraction_api:
container_name: pdf_paragraphs_extraction_api
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051" ]
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300"]
init: true
restart: unless-stopped
build:
Expand Down
2 changes: 1 addition & 1 deletion local-docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: "3.8"
services:
pdf_paragraphs_extraction_api:
container_name: pdf_paragraphs_extraction_api
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051" ]
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
init: true
restart: unless-stopped
build:
Expand Down
2 changes: 1 addition & 1 deletion mac-docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ version: "3.8"
services:
pdf_paragraphs_extraction_api:
container_name: pdf_paragraphs_extraction_api
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051" ]
entrypoint: [ "gunicorn", "-k", "uvicorn.workers.UvicornWorker", "--chdir", "./src", "app:app", "--bind", "0.0.0.0:5051", "--timeout", "300" ]
init: true
restart: unless-stopped
build:
Expand Down
2 changes: 1 addition & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
import sentry_sdk

from data.Paragrphs import Paragraphs
from data.Paragraphs import Paragraphs
from data.SegmentBox import SegmentBox
from extract_pdf_paragraphs.extract_paragraphs import get_paths, extract_paragraphs
from extract_pdf_paragraphs.pdf_to_xml import pdf_content_to_pdf_path
Expand Down
File renamed without changes.
7 changes: 0 additions & 7 deletions src/paragraph_extraction_trainer/PdfParagraphTokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,3 @@ def get_paragraph_for_token(self, token: PdfToken):

# Report whether two tokens belong to the same paragraph: each token is looked up
# with get_paragraph_for_token and the two lookup results are compared with ==.
def check_same_paragraph(self, token_1: PdfToken, token_2: PdfToken):
return self.get_paragraph_for_token(token_1) == self.get_paragraph_for_token(token_2)


#
# if __name__ == "__main__":
# paragraph_tokens = PdfParagraphTokens.from_labeled_data("one_column_train", "cejil_staging1")
# for paragraph in paragraph_tokens.paragraphs:
# print([token.id for token in paragraph.tokens])
13 changes: 13 additions & 0 deletions src/test_end_to_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,19 @@ def test_one_token_per_page_pdf(self):
self.assertEqual(response_json["paragraphs"][0]["page_number"], 1)
self.assertEqual(response_json["paragraphs"][1]["page_number"], 2)

def test_toc(self):
    """Post the sample TOC PDF to ``/get_toc`` and verify the returned entries.

    The response is expected to be a JSON list of four items, each exposing a
    ``label`` and an ``indentation`` key; the first and last items are checked.
    """
    with open(f"{config.APP_PATH}/test_files/toc-test.pdf", "rb") as stream:
        files = {"file": stream}
        response = requests.post(f"{self.service_url}/get_toc", files=files)

    # Assert the status before decoding the body: an error response may not be
    # JSON, and response.json() would then raise and mask the real failure.
    self.assertEqual(response.status_code, 200)

    response_json = response.json()
    self.assertEqual(len(response_json), 4)

    first_entry = response_json[0]
    self.assertEqual(first_entry["label"], "TEST")
    self.assertEqual(first_entry["indentation"], 0)

    last_entry = response_json[-1]
    self.assertEqual(last_entry["label"], "C. TITLE LONGER")
    self.assertEqual(last_entry["indentation"], 1)

@staticmethod
def get_redis_message() -> ExtractionMessage:
queue = RedisSMQ(host="127.0.0.1", port="6379", qname="segmentation_results", quiet=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,14 +250,14 @@ def from_pdf_segments(pdf_segmentation: PdfSegmentation) -> list["SegmentTwoMode
def get_segments(modes, pdf_segmentation, title_index):
segments: list["SegmentTwoModelsV3SegmentsContext2"] = list()
for index, pdf_segment in enumerate(pdf_segmentation.pdf_segments):
if pdf_segment.token_type not in valid_tag_types:
if pdf_segment.segment_type not in valid_tag_types:
continue

segment_landmarks = SegmentTwoModelsV3SegmentsContext2(
index, pdf_segment, pdf_segmentation.pdf_features, title_index, modes
)

if pdf_segment.token_type == TokenType.TITLE:
if pdf_segment.segment_type == TokenType.TITLE:
title_index += 1

segments.append(segment_landmarks)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def predict(self, pdfs_segmentations: list[PdfSegmentation]) -> list[PdfSegmenta

for pdf_segmentation in pdfs_segmentations:
for index, segment in enumerate(pdf_segmentation.pdf_segments):
if segment.token_type not in valid_tag_types:
if segment.segment_type not in valid_tag_types:
continue

pdf_segmentation.title_predictions[index] = round(100 * predictions[prediction_index])
Expand Down

0 comments on commit 3a6a090

Please sign in to comment.