Skip to content

Commit

Permalink
Move os functions to own file
Browse files Browse the repository at this point in the history
  • Loading branch information
asaltveit committed Dec 2, 2024
1 parent 1c08988 commit 28afc51
Show file tree
Hide file tree
Showing 4 changed files with 93 additions and 113 deletions.
89 changes: 7 additions & 82 deletions create_biblio.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
import fitz # PyMuPDF / fitz # For reading PDF # using PyMuPDF-1.24.13
import rispy # To create ris file # using rispy-0.9.0
import argparse # To collect arguments from command line # using argparse-1.4.0
import os # For finding PDFs in folder

from constants import END_KEYWORDS, KEYWORDS
from parse_info_functions import (
getInfoFromFileName,
getInfoGeneral,
parseInfoGeneral,
findInfoPersee,
findInfoJSTOR,
)
from os_functions import (
searchFolder,
checkOutputFileType,
checkInputPathExists,
)

# Added requirements.txt for easier setup
# Added to GitHub
# Added dev tools
# Added README
# TODO Add tests
# TODO add files for grouped functions

# Searches given folder and all sub folders for PDFs
# Collects citation info from JSTOR or Persee formats
Expand Down Expand Up @@ -81,86 +86,6 @@ def findInfo(pdf_path):
risEntries.append(output)


def getInfoGeneral(page):
    """Collect the keyword-delimited text sections found on a page.

    Increments the module-level ``numOther`` counter, flattens every text
    span of the page into ``[text, font, rounded size]`` entries, then groups
    the span texts into sections:

    * a span starting with one of ``KEYWORDS`` closes the running section and
      starts a new one with that span's text;
    * a span starting with one of ``END_KEYWORDS`` closes the running section
      and resets it to an empty string;
    * any other span is appended to the running section.

    Args:
        page: Object whose ``get_text("dict")`` returns a dict with a
            "blocks" list (presumably a PyMuPDF page -- confirm with callers).

    Returns:
        list[str]: The closed sections, in order. Text accumulated after the
        last keyword / end keyword is never closed and is not returned.
    """
    # NOTE(review): numOther must already exist at module level -- confirm.
    global numOther
    numOther += 1

    # Flatten the page into one entry per span: [text, font, rounded size].
    spans = []
    for block in page.get_text("dict")["blocks"]:
        try:
            for line in block["lines"]:
                for span in line["spans"]:
                    spans.append(
                        [span["text"], span["font"], round(span["size"])]
                    )
        except KeyError:
            # A block missing an expected key is abandoned; spans already
            # collected from it are kept.
            continue

    # Keep only the sections delimited by the known keywords.
    starts = tuple(KEYWORDS)
    stops = tuple(END_KEYWORDS)
    infoLines = []
    current = ""
    for text, _font, _size in spans:
        if text.startswith(starts):
            infoLines.append(current)
            current = text
        elif text.startswith(stops):
            infoLines.append(current)
            current = ""
        else:
            current += text
    return infoLines


def searchFolder(search_path):
    """Recursively collect the paths of all PDF files under ``search_path``.

    Files whose name starts with "-" are skipped. The ".pdf" extension is
    matched case-insensitively so that files such as "SCAN.PDF" are found.

    Args:
        search_path: Root folder to walk; all sub folders are included.

    Returns:
        list[str]: One ``os.path.join(root, file)`` path per matching file,
        in ``os.walk`` order. Empty if nothing matches or the folder does
        not exist.
    """
    print("Update: Searching for PDFs")
    paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(search_path)
        for file in files
        if file.lower().endswith(".pdf") and not file.startswith("-")
    ]
    # len(paths) replaces a separately maintained counter.
    print("Update: Found " + str(len(paths)) + " paths")
    return paths


def checkOutputFileType(file, inputPath):
    """Return an output file path that ends in ".ris".

    Args:
        file: Requested output path. If falsy, a default path derived from
            ``inputPath`` via ``getLastInputPathParameter`` is used.
        inputPath: Input folder path; only used to build the default.

    Returns:
        str: ``file`` unchanged if it already ends in ".ris"; otherwise the
        same path with its final extension (if any) replaced by ".ris".
    """
    if not file:
        file = getLastInputPathParameter(inputPath)
    if file.endswith(".ris"):
        return file
    # splitext strips only the extension of the last path component.
    # Splitting on the first "." mangled paths such as "./out/refs.txt"
    # (-> ".ris") or "my.folder/refs" (-> "my.ris").
    return os.path.splitext(file)[0] + ".ris"


# TODO: Update the README to document where the output file is written
def getLastInputPathParameter(inputPath):
    """Build the default output path ``<inputPath>/<last folder name>.ris``.

    Args:
        inputPath: Path of the input folder; a trailing separator is allowed.

    Returns:
        str: ``inputPath`` joined with the name of its last component plus
        the ".ris" extension.
    """
    # normpath drops any trailing separator so basename is not empty.
    normalized = os.path.normpath(inputPath)
    return os.path.join(inputPath, os.path.basename(normalized) + ".ris")


def checkInputPathExists(file):
    """Report whether the input path exists, printing a status line.

    Args:
        file: Path of the input folder (or file) to check.

    Returns:
        bool: True if ``os.path.exists(file)`` is true; False if the path
        does not exist or the check itself raised an exception.
    """
    try:
        exists = os.path.exists(file)
    except Exception as e:
        # The exception must be converted with str(): concatenating a str
        # with an Exception raises TypeError inside this handler.
        print("Error: Cannot access input folder: " + str(e))
        return False
    if exists:
        print("Update: Input path exists")
        return True
    print("Error: Input path does not exist")
    return False


def getCommandLineArguments():
parser = argparse.ArgumentParser(description="Creates ris file from PDF")
# Take input path from command line
Expand Down
46 changes: 46 additions & 0 deletions os_functions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os


def searchFolder(search_path):
    """Recursively collect the paths of all PDF files under ``search_path``.

    Files whose name starts with "-" are skipped. The ".pdf" extension is
    matched case-insensitively so that files such as "SCAN.PDF" are found.

    Args:
        search_path: Root folder to walk; all sub folders are included.

    Returns:
        list[str]: One ``os.path.join(root, file)`` path per matching file,
        in ``os.walk`` order. Empty if nothing matches or the folder does
        not exist.
    """
    print("Update: Searching for PDFs")
    paths = [
        os.path.join(root, file)
        for root, _, files in os.walk(search_path)
        for file in files
        if file.lower().endswith(".pdf") and not file.startswith("-")
    ]
    # len(paths) replaces a separately maintained counter.
    print("Update: Found " + str(len(paths)) + " paths")
    return paths


def checkInputPathExists(file):
    """Report whether the input path exists, printing a status line.

    Args:
        file: Path of the input folder (or file) to check.

    Returns:
        bool: True if ``os.path.exists(file)`` is true; False if the path
        does not exist or the check itself raised an exception.
    """
    try:
        exists = os.path.exists(file)
    except Exception as e:
        # The exception must be converted with str(): concatenating a str
        # with an Exception raises TypeError inside this handler.
        print("Error: Cannot access input folder: " + str(e))
        return False
    if exists:
        print("Update: Input path exists")
        return True
    print("Error: Input path does not exist")
    return False


def checkOutputFileType(file, inputPath):
    """Return an output file path that ends in ".ris".

    Args:
        file: Requested output path. If falsy, a default path derived from
            ``inputPath`` via ``getLastInputPathParameter`` is used.
        inputPath: Input folder path; only used to build the default.

    Returns:
        str: ``file`` unchanged if it already ends in ".ris"; otherwise the
        same path with its final extension (if any) replaced by ".ris".
    """
    if not file:
        file = getLastInputPathParameter(inputPath)
    if file.endswith(".ris"):
        return file
    # splitext strips only the extension of the last path component.
    # Splitting on the first "." mangled paths such as "./out/refs.txt"
    # (-> ".ris") or "my.folder/refs" (-> "my.ris").
    return os.path.splitext(file)[0] + ".ris"


# TODO: Update the README to document where the output file is written
def getLastInputPathParameter(inputPath):
    """Build the default output path ``<inputPath>/<last folder name>.ris``.

    Args:
        inputPath: Path of the input folder; a trailing separator is allowed.

    Returns:
        str: ``inputPath`` joined with the name of its last component plus
        the ".ris" extension.
    """
    # normpath drops any trailing separator so basename is not empty.
    normalized = os.path.normpath(inputPath)
    return os.path.join(inputPath, os.path.basename(normalized) + ".ris")
70 changes: 39 additions & 31 deletions parse_info_functions.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fitz # PyMuPDF / fitz # For reading PDF
import re
import os
from constants import END_KEYWORDS, KEYWORDS


def collectYearManuscriptCode(file_name, output):
Expand Down Expand Up @@ -140,6 +141,7 @@ def parseInfoGeneral(infoLines, output):
return output


# Fitz used here
# Assumes all sections are present, whether they have info or not
def findInfoPersee(page, citeThisDocRec, pdf_path):
# Reminder: Any field may be missing
Expand Down Expand Up @@ -235,37 +237,7 @@ def findInfoJSTOR(page, pdf_path):
else:
output = {"type_of_reference": "JOUR"}

# Get list of lines of text, with fonts and line size
lis = []
for i in page.get_text("dict")["blocks"]:
try:
lines = i["lines"]
for line in range(len(lines)):
for k in range(len(lines[line]["spans"])):
li = list(
(
lines[line]["spans"][k]["text"],
i["lines"][line]["spans"][k]["font"],
round(i["lines"][line]["spans"][k]["size"]),
)
)
lis.append(li)
except KeyError:
pass
# Get list of only relevant lines of text
keywords = ["Author(s):", "Source:", "Published"]
j = 0
curStr = ""
infoLines = []
for i in range(len(lis)):
if j >= len(keywords):
break
if lis[i][0].startswith(keywords[j]):
infoLines.append(curStr)
curStr = lis[i][0]
j += 1
else:
curStr += lis[i][0]
infoLines = getInfoGeneral(page)

if not infoLines:
print("Update: Didn't find title, searching file name")
Expand Down Expand Up @@ -299,3 +271,39 @@ def findInfoJSTOR(page, pdf_path):
output["end_page"] = endPage

return output, 1


# Fitz used here
def getInfoGeneral(page):
    """Collect the keyword-delimited text sections found on a page.

    Increments the module-level ``numOther`` counter, flattens every text
    span of the page into ``[text, font, rounded size]`` entries, then groups
    the span texts into sections:

    * a span starting with one of ``KEYWORDS`` closes the running section and
      starts a new one with that span's text;
    * a span starting with one of ``END_KEYWORDS`` closes the running section
      and resets it to an empty string;
    * any other span is appended to the running section.

    Args:
        page: Object whose ``get_text("dict")`` returns a dict with a
            "blocks" list (presumably a PyMuPDF page -- confirm with callers).

    Returns:
        list[str]: The closed sections, in order. Text accumulated after the
        last keyword / end keyword is never closed and is not returned.
    """
    # NOTE(review): numOther must already exist at module level -- confirm.
    global numOther
    numOther += 1

    # Flatten the page into one entry per span: [text, font, rounded size].
    spans = []
    for block in page.get_text("dict")["blocks"]:
        try:
            for line in block["lines"]:
                for span in line["spans"]:
                    spans.append(
                        [span["text"], span["font"], round(span["size"])]
                    )
        except KeyError:
            # A block missing an expected key is abandoned; spans already
            # collected from it are kept.
            continue

    # Keep only the sections delimited by the known keywords.
    starts = tuple(KEYWORDS)
    stops = tuple(END_KEYWORDS)
    infoLines = []
    current = ""
    for text, _font, _size in spans:
        if text.startswith(starts):
            infoLines.append(current)
            current = text
        elif text.startswith(stops):
            infoLines.append(current)
            current = ""
        else:
            current += text
    return infoLines
1 change: 1 addition & 0 deletions tests/test_parse_info_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,7 @@ def test_parseInfoGeneral(inputLines, output, expected):
assert parseInfoGeneral(inputLines, output) == expected


# NOTE: The tests below are currently not working
JSTOR_title = (fitz.open(), ({}, 1))
JSTOR_no_title = ({}, ({}, 2))
f1 = ""
Expand Down

0 comments on commit 28afc51

Please sign in to comment.