Skip to content

Commit

Permalink
simplified tests and new folder names
Browse files Browse the repository at this point in the history
  • Loading branch information
aassumpcao committed Nov 27, 2020
1 parent 3aab82f commit 5178830
Show file tree
Hide file tree
Showing 16 changed files with 64 additions and 187 deletions.
Binary file modified .DS_Store
Binary file not shown.
5 changes: 3 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
# htmlcov/
.tox/
.nox/
.coverage
Expand Down Expand Up @@ -131,5 +131,6 @@ dmypy.json

# local files
*misc/
*data/
data/*
*scratch/
tests/text_extraction_tests_aassumpcao.py
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def is_file_type(filepath: str, file_types: List[str]) -> bool:


def write_file_content(
filepath: str, apache_tika_jar: str, metadata: Optional[bool]=None
filepath: str, apache_tika_jar: str, metadata: Optional[bool]=False
) -> str:
"""
Extract the metadata of the original file using the given Apache
Expand Down
7 changes: 4 additions & 3 deletions process/gazette.py → queridodiario_toolbox/gazette.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@ def __init__(
"Either the filepath or content argument must be specified"
)

def extract_content(self, metadata: Optional[bool]=None) -> str:
def extract_content(self, metadata: Optional[bool]=False) -> str:
"""
Extract gazette content, save it to disk, and store the path of
the extracted file in the filepath attribute
"""
self.filepath = write_file_content(
self.filepath, self.tika_jar, metadata
filepath=self.filepath, apache_tika_jar=self.tika_jar,
metadata=metadata
)

def load_content(self) -> None:
Expand All @@ -54,4 +55,4 @@ def load_content(self) -> None:
with open(self.filepath, 'r') as fp:
self.content = json.load(fp)
else:
self.content = load_file_content(self.filepath)
self.content = load_file_content(filepath=self.filepath)
72 changes: 0 additions & 72 deletions test.json

This file was deleted.

1 change: 1 addition & 0 deletions tests/data/fake_content.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Hi this is a document created to test the text extraction for the Querido Diário project.
16 changes: 16 additions & 0 deletions tests/data/fake_gazette.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html>
<head>
<title>Demo</title>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<meta name="description" content="Demo project">
<meta name="viewport" content="width=device-width, initial-scale=1">
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta.3/css/bootstrap.min.css" integrity="sha384-Zug+QiDoJOrZ5t4lssLdxGhVrurbmBWopoEl+M6BdEfwnCJZtKxi1KgxUyJq13dy" crossorigin="anonymous">
<style type="text/css"></style>
</head>
<body>
<p>Hi this is a document created to test the text extraction for the Querido Diário project.</p>
<script type="text/javascript"></script>
</body>
</html>
Binary file added tests/data/fake_gazette.jpeg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions tests/data/fake_gazette.json

Large diffs are not rendered by default.

Binary file added tests/data/fake_gazette.m4a
Binary file not shown.
Binary file added tests/data/fake_gazette.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/data/fake_gazette.tiff
Binary file not shown.
1 change: 1 addition & 0 deletions tests/data/fake_metadata.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"gazette": "Hi this is a document created to test the text extraction for the Querido Diário project."}
146 changes: 37 additions & 109 deletions tests/text_extraction_tests.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
from unittest import TestCase
import os

from process.etl.file_transform import *
from process import Gazette
from queridodiario_toolbox.etl.file_transform import *
from queridodiario_toolbox import Gazette

class TextExtractionTests(TestCase):

# APACHE_TIKA_JAR_PATH = "/tika-app.jar"
TIKA_PATH = "/usr/local/Cellar/tika/1.24.1_1/libexec/tika-app-1.24.1.jar"
TIKA_PATH = "/tika-app.jar"

def tearDown(self):
self.clean_txt_file_generated_during_tests()
Expand All @@ -22,9 +21,26 @@ def clean_txt_file_generated_during_tests(self):

def get_files_generated_during_tests(self, root, files):
for f in files:
if ".txt" in f and f not in ["fake_content.txt", "fake_gazette.txt"]:
if ".txt" in f and f not in [
"fake_content.txt", "fake_gazette.txt"
]:
yield f"{root}{f}"

def validate_basic_extract_content(self, gazette, metadata=False):
    """
    Run the extract/load round trip on a gazette and check the result.

    Extracts the gazette content (passing `metadata` through to
    `extract_content`), checks that the gazette now points at the
    expected output file, loads that file back and checks that the
    loaded content is not empty. When `metadata` is falsy it also
    checks that the loaded content contains the word "Querido".
    """
    expected_path = (
        "tests/data/fake_gazette.json" if metadata
        else "tests/data/fake_gazette.txt"
    )

    gazette.extract_content(metadata=metadata)
    self.assertEqual(gazette.filepath, expected_path)

    gazette.load_content()
    self.assertNotEqual(0, len(gazette.content))

    if metadata:
        # The sample-text check only applies to plain-text extraction.
        return
    self.assertIn("Querido", gazette.content, "Extraction Failed")

def test_extract_text_from_invalid_file(self):
with self.assertRaisesRegex(Exception, "No such file"):
gazette = Gazette("file/does/not/exist", self.TIKA_PATH)
Expand Down Expand Up @@ -116,154 +132,66 @@ def test_class_instantiation_with_all_arguments(self):

def test_extract_text_from_doc_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.doc", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_docx_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.docx", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_odt_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.odt", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_html_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.html", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_pdf_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.pdf", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_jpeg_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.jpeg", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_png_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.png", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_text_from_tiff_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.tiff", self.TIKA_PATH)

gazette.extract_content()
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.txt")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.assertIn("Querido", gazette.content, "Extraction Failed")
self.validate_basic_extract_content(gazette)

def test_extract_metadata_from_doc_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.doc", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_docx_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.docx", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_odt_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.odt", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_html_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.html", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_pdf_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.pdf", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_jpeg_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.jpeg", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_png_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.png", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)

def test_extract_metadata_from_tiff_should_return_content(self):
gazette = Gazette("tests/data/fake_gazette.tiff", self.TIKA_PATH)

gazette.extract_content(metadata=True)
self.assertEqual(gazette.filepath, "tests/data/fake_gazette.json")

gazette.load_content()
self.assertNotEqual(0, len(gazette.content))
self.validate_basic_extract_content(gazette, metadata=True)


0 comments on commit 5178830

Please sign in to comment.