From 28afc51f9495d7d6911b34911fe4015732912e9f Mon Sep 17 00:00:00 2001 From: Anna Date: Mon, 2 Dec 2024 11:46:57 -0800 Subject: [PATCH] Move os functions to own file --- create_biblio.py | 89 +++--------------------------- os_functions.py | 46 +++++++++++++++ parse_info_functions.py | 70 ++++++++++++----------- tests/test_parse_info_functions.py | 1 + 4 files changed, 93 insertions(+), 113 deletions(-) create mode 100644 os_functions.py diff --git a/create_biblio.py b/create_biblio.py index acebfa2..889da34 100644 --- a/create_biblio.py +++ b/create_biblio.py @@ -1,21 +1,26 @@ import fitz # PyMuPDF / fitz # For reading PDF # using PyMuPDF-1.24.13 import rispy # To create ris file # using rispy-0.9.0 import argparse # To collect arguments from command line # using argparse-1.4.0 -import os # For finding PDFs in folder -from constants import END_KEYWORDS, KEYWORDS from parse_info_functions import ( getInfoFromFileName, + getInfoGeneral, parseInfoGeneral, findInfoPersee, findInfoJSTOR, ) +from os_functions import ( + searchFolder, + checkOutputFileType, + checkInputPathExists, +) # Added requirements.txt for easier setup # Added to Github # Added dev tools # Added README # TODO Add tests +# TODO add files for grouped functions # Searches given folder and all sub folders for PDFs # Collects citation info from JSTOR or Persee formats @@ -81,86 +86,6 @@ def findInfo(pdf_path): risEntries.append(output) -def getInfoGeneral(page): - global numOther - numOther += 1 - # Get list of lines of text, with fonts and line size - lis = [] - for i in page.get_text("dict")["blocks"]: - try: - lines = i["lines"] - for line in range(len(lines)): - for k in range(len(lines[line]["spans"])): - li = list( - ( - lines[line]["spans"][k]["text"], - i["lines"][line]["spans"][k]["font"], - round(i["lines"][line]["spans"][k]["size"]), - ) - ) - lis.append(li) - except KeyError: - pass - # Get list of only relevant lines of text - curStr = "" - infoLines = [] - for i in range(len(lis)): - if lis[i][0].startswith(tuple(KEYWORDS)): - infoLines.append(curStr) - curStr = lis[i][0] - elif lis[i][0].startswith(tuple(END_KEYWORDS)): - infoLines.append(curStr) - curStr = "" - else: - curStr += lis[i][0] - return infoLines - - -def searchFolder(search_path): - numPaths = 0 - print("Update: Searching for PDFs") - paths = [] - - for root, _, files in os.walk(search_path): - for file in files: - if file.endswith(".pdf") and not file.startswith("-"): - numPaths += 1 - paths.append(os.path.join(root, file)) - - print("Update: Found " + str(numPaths) + " paths") - return paths - - -def checkOutputFileType(file, inputPath): - if not file: - file = getLastInputPathParameter(inputPath) - if not file.endswith(".ris"): - return file.split(".")[0] + ".ris" - else: - return file - - -# TODO Update readme to reflect where output file will go -def getLastInputPathParameter(inputPath): - folderName = os.path.basename(os.path.normpath(inputPath)) - fileName = folderName + ".ris" - return os.path.join(inputPath, fileName) - - -def checkInputPathExists(file): - try: - path = os.path.exists(file) - if path: - print("Update: Input path exists") - return True - else: - print("Error: Input path does not exist") - return False - except Exception as e: - print("Error: Cannot access input folder: " + e) - return False - - def getCommandLineArguments(): parser = argparse.ArgumentParser(description="Creates ris file from PDF") # Take input path from command line diff --git a/os_functions.py b/os_functions.py new file mode 100644 index 0000000..983e7f7 --- /dev/null +++ b/os_functions.py @@ -0,0 +1,46 @@ +import os + + +def searchFolder(search_path): + numPaths = 0 + print("Update: Searching for PDFs") + paths = [] + + for root, _, files in os.walk(search_path): + for file in files: + if file.endswith(".pdf") and not file.startswith("-"): + numPaths += 1 + paths.append(os.path.join(root, file)) + + print("Update: Found " + str(numPaths) + " paths") + return paths + + +def checkInputPathExists(file): + try: + path = os.path.exists(file) + if path: + print("Update: Input path exists") + return True + else: + print("Error: Input path does not exist") + return False + except Exception as e: + print("Error: Cannot access input folder: " + e) + return False + + +def checkOutputFileType(file, inputPath): + if not file: + file = getLastInputPathParameter(inputPath) + if not file.endswith(".ris"): + return file.split(".")[0] + ".ris" + else: + return file + + +# TODO Update readme to reflect where output file will go +def getLastInputPathParameter(inputPath): + folderName = os.path.basename(os.path.normpath(inputPath)) + fileName = folderName + ".ris" + return os.path.join(inputPath, fileName) diff --git a/parse_info_functions.py b/parse_info_functions.py index d9c9044..541fa9a 100644 --- a/parse_info_functions.py +++ b/parse_info_functions.py @@ -1,6 +1,7 @@ import fitz # PyMuPDF / fitz # For reading PDF import re import os +from constants import END_KEYWORDS, KEYWORDS def collectYearManuscriptCode(file_name, output): @@ -140,6 +141,7 @@ def parseInfoGeneral(infoLines, output): return output +# Fitz used here # Assumes all sections are present, whether they have info or not def findInfoPersee(page, citeThisDocRec, pdf_path): # Reminder: Any field may be missing @@ -235,37 +237,7 @@ def findInfoJSTOR(page, pdf_path): else: output = {"type_of_reference": "JOUR"} - # Get list of lines of text, with fonts and line size - lis = [] - for i in page.get_text("dict")["blocks"]: - try: - lines = i["lines"] - for line in range(len(lines)): - for k in range(len(lines[line]["spans"])): - li = list( - ( - lines[line]["spans"][k]["text"], - i["lines"][line]["spans"][k]["font"], - round(i["lines"][line]["spans"][k]["size"]), - ) - ) - lis.append(li) - except KeyError: - pass - # Get list of only relevant lines of text - keywords = ["Author(s):", "Source:", "Published"] - j = 0 - curStr = "" - infoLines = [] - for i in range(len(lis)): - if j >= len(keywords): - break - if lis[i][0].startswith(keywords[j]): - infoLines.append(curStr) - curStr = lis[i][0] - j += 1 - else: - curStr += lis[i][0] + infoLines = getInfoGeneral(page) if not infoLines: print("Update: Didn't find title, searching file name") @@ -299,3 +271,39 @@ def findInfoJSTOR(page, pdf_path): output["end_page"] = endPage return output, 1 + + +# Fitz used here +def getInfoGeneral(page): + global numOther + numOther += 1 + # Get list of lines of text, with fonts and line size + lis = [] + for i in page.get_text("dict")["blocks"]: + try: + lines = i["lines"] + for line in range(len(lines)): + for k in range(len(lines[line]["spans"])): + li = list( + ( + lines[line]["spans"][k]["text"], + i["lines"][line]["spans"][k]["font"], + round(i["lines"][line]["spans"][k]["size"]), + ) + ) + lis.append(li) + except KeyError: + pass + # Get list of only relevant lines of text + curStr = "" + infoLines = [] + for i in range(len(lis)): + if lis[i][0].startswith(tuple(KEYWORDS)): + infoLines.append(curStr) + curStr = lis[i][0] + elif lis[i][0].startswith(tuple(END_KEYWORDS)): + infoLines.append(curStr) + curStr = "" + else: + curStr += lis[i][0] + return infoLines diff --git a/tests/test_parse_info_functions.py b/tests/test_parse_info_functions.py index 43adcd1..ebd5d15 100644 --- a/tests/test_parse_info_functions.py +++ b/tests/test_parse_info_functions.py @@ -143,6 +143,7 @@ def test_parseInfoGeneral(inputLines, output, expected): assert parseInfoGeneral(inputLines, output) == expected +# Below tests not working JSTOR_title = (fitz.open(), ({}, 1)) JSTOR_no_title = ({}, ({}, 2)) f1 = ""