Skip to content

Commit

Permalink
Handle multiple numbers and add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
asaltveit committed Dec 1, 2024
1 parent 124b018 commit ea2cf64
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 22 deletions.
22 changes: 11 additions & 11 deletions create_biblio.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,14 +213,14 @@ def main():
print("Updated: Finished")


main()
# file_path = "../test/Itineraries/Hulbert 1923 Some Medieval Advertisements of Rome.pdf"
# doc = fitz.open(file_path)
# page = doc[0]
# print("page: ", page)
# i, _ = getInfoFromFileName(file_path)
# print("file-name-info: ", i)
# info = getInfoGeneral(page)
# print("infoLines: ", info)
# output = parseInfoGeneral(info, i)
# print("output: ", output)
# main()
# Ad-hoc debugging harness: runs the extraction steps on one sample PDF and
# prints every intermediate value.
# NOTE(review): these statements execute at module level, so they run whenever
# this file is executed or imported — confirm this is intended to be temporary.
file_path = "../test/Itineraries/Zurli 1998 Il cod Vindobonensis Palatinus 9401 asterisk dell Anthologia Latina.pdf"
doc = fitz.open(file_path)
# Only the first page of the document is inspected.
page = doc[0]
print("page: ", page)
# getInfoFromFileName returns a pair; only its first element is used here.
i, _ = getInfoFromFileName(file_path)
print("file-name-info: ", i)
# Collect the info lines from the first page.
info = getInfoGeneral(page)
print("infoLines: ", info)
# Combine the page lines with the info already taken from the file name.
output = parseInfoGeneral(info, i)
print("output: ", output)
33 changes: 27 additions & 6 deletions parse_info_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,39 @@
import re
import os


def collectYearManuscriptCode(file_name, output):
    """Record the year and any additional number found in a file name.

    The name is expected to look like ``"Author 1998 Title ... 9401 ..."``.
    The first 4-digit run counts as the year only when it is greater than 0,
    less than 2050, and is exactly the second space-separated token.

    Args:
        file_name: File name without directory or extension.
        output: Dict to update in place; existing keys are kept.

    Returns:
        The same ``output`` dict, with these keys possibly added (values are
        always the matched digit strings, never ints):

        * ``"year"`` - the 4-digit run, when it sits in the year position.
        * ``"number_of_volumes"`` - the first other run of 3-9 digits when a
          year was found; otherwise the first run of 3-9 digits in the name.

        When the name contains no run of 3 or more digits, ``output`` is
        returned unchanged.
    """
    numbers4digits = re.findall(r"[0-9]{4}", file_name)
    numbersAllDigits = re.findall(r"[0-9]{3,9}", file_name)
    if not numbersAllDigits:
        # Nothing numeric to record; also implies there is no 4-digit run.
        return output

    space_sections = file_name.split(" ")
    # Guard the lookups: the name may have no 4-digit run or no second token.
    yearCandidate = numbers4digits[0] if numbers4digits else None
    yearInSecondSlot = (
        yearCandidate is not None
        and 0 < int(yearCandidate) < 2050
        and len(space_sections) > 1
        and space_sections[1] == yearCandidate
    )

    if yearInSecondSlot:
        output["year"] = yearCandidate
        # Any other number in the name is kept separately from the year.
        additionalNums = [x for x in numbersAllDigits if x != yearCandidate]
        if additionalNums:
            output["number_of_volumes"] = additionalNums[0]
    else:
        # NOTE(review): a number outside the year position (e.g. a
        # dash-separated name ending in a year) lands here — confirm that
        # storing it as "number_of_volumes" is the desired outcome.
        output["number_of_volumes"] = numbersAllDigits[0]
    return output


# Has tests
def getInfoFromFileName(file_path):
print("Update: Collecting info from file name")
output = {}
file_name = os.path.basename(os.path.normpath(file_path))
# Remove .pdf
file_name = file_name.split(".")[0]
# Split on year

# Numbers
output = collectYearManuscriptCode(file_name, output)

textSections = re.split(r"(?<!\d)\d{4}(?!\d)", file_name)
if len(textSections) == 2 and textSections[1]:
author, title = textSections
Expand All @@ -21,6 +46,7 @@ def getInfoFromFileName(file_path):
elif len(textSections) == 2:
title = textSections[0]
output["title"] = title.strip()
print("Update: Article title found")
elif len(textSections) > 2:
author = textSections[0]
title = textSections[1]
Expand All @@ -33,11 +59,6 @@ def getInfoFromFileName(file_path):
output["title"] = file_name.strip()
print("Update: Article title found")

year = re.findall(r"[0-9]{4}", file_name)
if len(year) >= 1:
output["year"] = year[0]
print("Update: Year published found")

return output, 2


Expand Down
43 changes: 38 additions & 5 deletions tests/test_parse_info_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,33 @@
from parse_info_functions import (
getInfoFromFileName,
parseInfoGeneral,
findInfoJSTOR,
collectYearManuscriptCode,
)

fake = Faker()

# collectYearManuscriptCode
@pytest.mark.parametrize(
    "file_name,output,expected",
    [
        (
            "Ammannati 2023 Lupus in fabula - Sulla vera mano di Lupo di Ferrières",
            {},
            {"year": "2023"},
        ),
        (
            "Zurli 1998 Il cod Vindobonensis Palatinus 9401 asterisk dell Anthologia Latina",
            {},
            # Values come from re.findall, so they are strings, not ints.
            {"year": "1998", "number_of_volumes": "9401"},
        ),
        # Author-flagged as failing: the year is not the second
        # space-separated token in a dash-separated name.
        ("Levitan-DancingEndRope-1985", {}, {"year": "1985"}),
        # Author-flagged as failing: the name contains no digits at all.
        ("Les manuscrits de Loup de Ferrières", {}, {}),
    ],
)
def test_collectYearManuscriptCode(file_name, output, expected):
    """Check the year/number extraction against representative file names."""
    assert collectYearManuscriptCode(file_name, output) == expected


# getInfoFromFileName
def test_author_year_title_format():
assert getInfoFromFileName(
Expand Down Expand Up @@ -37,7 +59,7 @@ def test_dashes():
)


def test_multiple_years_format():
def test_multiple_numbers_format():
assert getInfoFromFileName(
"Zurli 1998 Il cod Vindobonensis Palatinus 9401 asterisk dell Anthologia Latina"
) == (
Expand Down Expand Up @@ -89,7 +111,11 @@ def test_no_year():
{"authors": ["Zurli"], "title": "Il cod Vindobonensis Palatinus", "year": "1998"},
infolines_middlebury_all_caps_expected,
)
infolines_jstor = []
infolines_jstor = [
"Some Medieval Advertisements of Rome ",
"Author(s): J. R. Hulbert ",
"Source: Modern Philology , May, 1923, Vol. 20, No. 4 (May, 1923), pp. 403-424 Published by: The University of Chicago Press Stable URL: https://www.jstor.org/stable/433697JSTOR is a not-for-profit service that helps scholars, researchers, and students discover, use, and build upon a wide range of content in a trusted digital archive. We use information technology and tools to increase productivity and facilitate new forms of scholarship. For more information about JSTOR, please contact [email protected]. Your use of the JSTOR archive indicates your acceptance of the Terms & Conditions of Use, available at ",
]
no_infolines = ([], {"title": "blah, blah, blah"}, {"title": "blah, blah, blah"})
# Persee - not working
# infolines_persee_output = {'authors': ['Pellegrin'], 'title': 'Les manuscrits de Loup de Ferrières', 'year': '1957'}
Expand Down Expand Up @@ -133,8 +159,8 @@ def test_findInfoJSTOR(tmp_path):
f1.parent.mkdir() # create a directory "mydir" in temp folder (which is the parent directory of "myfile"
f1.touch() # create a file "myfile" in "mydir"
# JSTOR_no_title = ((fitz.open('mydir/myfile')[0], ({'title': 'myfile'}, 2)))
doc = fitz.open("mydir/myfile.pdf")
assert findInfoJSTOR(doc[0], "mydir/myfile.pdf") == ({"title": "myfile"}, 2)
# doc = fitz.open("mydir/myfile.pdf")
# assert findInfoJSTOR(doc[0], "mydir/myfile.pdf") == ({"title": "myfile"}, 2)


# Mock tools:
Expand Down Expand Up @@ -163,3 +189,10 @@ def test_findInfoJSTOR(tmp_path):
# fake = Faker()

# pip install Faker

# Might be more for backend objects:
# https://factoryboy.readthedocs.io/en/stable/

# tmp_dir: https://stackoverflow.com/questions/36070031/creating-a-temporary-directory-in-pytest
# fitz open - https://github.com/pymupdf/PyMuPDF/issues/612
# mocking issues - https://stackoverflow.com/questions/65728499/python-pytest-mocking-three-functions

0 comments on commit ea2cf64

Please sign in to comment.