From 28afc51f9495d7d6911b34911fe4015732912e9f Mon Sep 17 00:00:00 2001
From: Anna <annasaltveit@gmail.com>
Date: Mon, 2 Dec 2024 11:46:57 -0800
Subject: [PATCH] Move os functions to own file

---
 create_biblio.py                   | 89 +++---------------------------
 os_functions.py                    | 46 +++++++++++++++
 parse_info_functions.py            | 70 ++++++++++++-----------
 tests/test_parse_info_functions.py |  1 +
 4 files changed, 93 insertions(+), 113 deletions(-)
 create mode 100644 os_functions.py

diff --git a/create_biblio.py b/create_biblio.py
index acebfa2..889da34 100644
--- a/create_biblio.py
+++ b/create_biblio.py
@@ -1,21 +1,26 @@
 import fitz  # PyMuPDF / fitz # For reading PDF # using PyMuPDF-1.24.13
 import rispy  # To create ris file # using rispy-0.9.0
 import argparse  # To collect arguments from command line # using argparse-1.4.0
-import os  # For finding PDFs in folder
 
-from constants import END_KEYWORDS, KEYWORDS
 from parse_info_functions import (
     getInfoFromFileName,
+    getInfoGeneral,
     parseInfoGeneral,
     findInfoPersee,
     findInfoJSTOR,
 )
+from os_functions import (
+    searchFolder,
+    checkOutputFileType,
+    checkInputPathExists,
+)
 
 # Added requirements.txt for easier setup
 # Added to Github
 # Added dev tools
 # Added README
 # TODO Add tests
+# TODO add files for grouped functions
 
 # Searches given folder and all sub folders for PDFs
 # Collects citation info from JSTOR or Persee formats
@@ -81,86 +86,6 @@ def findInfo(pdf_path):
     risEntries.append(output)
 
 
-def getInfoGeneral(page):
-    global numOther
-    numOther += 1
-    # Get list of lines of text, with fonts and line size
-    lis = []
-    for i in page.get_text("dict")["blocks"]:
-        try:
-            lines = i["lines"]
-            for line in range(len(lines)):
-                for k in range(len(lines[line]["spans"])):
-                    li = list(
-                        (
-                            lines[line]["spans"][k]["text"],
-                            i["lines"][line]["spans"][k]["font"],
-                            round(i["lines"][line]["spans"][k]["size"]),
-                        )
-                    )
-                    lis.append(li)
-        except KeyError:
-            pass
-    # Get list of only relevant lines of text
-    curStr = ""
-    infoLines = []
-    for i in range(len(lis)):
-        if lis[i][0].startswith(tuple(KEYWORDS)):
-            infoLines.append(curStr)
-            curStr = lis[i][0]
-        elif lis[i][0].startswith(tuple(END_KEYWORDS)):
-            infoLines.append(curStr)
-            curStr = ""
-        else:
-            curStr += lis[i][0]
-    return infoLines
-
-
-def searchFolder(search_path):
-    numPaths = 0
-    print("Update: Searching for PDFs")
-    paths = []
-
-    for root, _, files in os.walk(search_path):
-        for file in files:
-            if file.endswith(".pdf") and not file.startswith("-"):
-                numPaths += 1
-                paths.append(os.path.join(root, file))
-
-    print("Update: Found " + str(numPaths) + " paths")
-    return paths
-
-
-def checkOutputFileType(file, inputPath):
-    if not file:
-        file = getLastInputPathParameter(inputPath)
-    if not file.endswith(".ris"):
-        return file.split(".")[0] + ".ris"
-    else:
-        return file
-
-
-# TODO Update readme to reflect where output file will go
-def getLastInputPathParameter(inputPath):
-    folderName = os.path.basename(os.path.normpath(inputPath))
-    fileName = folderName + ".ris"
-    return os.path.join(inputPath, fileName)
-
-
-def checkInputPathExists(file):
-    try:
-        path = os.path.exists(file)
-        if path:
-            print("Update: Input path exists")
-            return True
-        else:
-            print("Error: Input path does not exist")
-            return False
-    except Exception as e:
-        print("Error: Cannot access input folder: " + e)
-        return False
-
-
 def getCommandLineArguments():
     parser = argparse.ArgumentParser(description="Creates ris file from PDF")
     # Take input path from command line
diff --git a/os_functions.py b/os_functions.py
new file mode 100644
index 0000000..983e7f7
--- /dev/null
+++ b/os_functions.py
@@ -0,0 +1,46 @@
+import os
+
+
+def searchFolder(search_path):
+    numPaths = 0
+    print("Update: Searching for PDFs")
+    paths = []
+
+    for root, _, files in os.walk(search_path):
+        for file in files:
+            if file.endswith(".pdf") and not file.startswith("-"):
+                numPaths += 1
+                paths.append(os.path.join(root, file))
+
+    print("Update: Found " + str(numPaths) + " paths")
+    return paths
+
+
+def checkInputPathExists(file):
+    try:
+        path = os.path.exists(file)
+        if path:
+            print("Update: Input path exists")
+            return True
+        else:
+            print("Error: Input path does not exist")
+            return False
+    except Exception as e:
+        print("Error: Cannot access input folder: " + e)
+        return False
+
+
+def checkOutputFileType(file, inputPath):
+    if not file:
+        file = getLastInputPathParameter(inputPath)
+    if not file.endswith(".ris"):
+        return file.split(".")[0] + ".ris"
+    else:
+        return file
+
+
+# TODO Update readme to reflect where output file will go
+def getLastInputPathParameter(inputPath):
+    folderName = os.path.basename(os.path.normpath(inputPath))
+    fileName = folderName + ".ris"
+    return os.path.join(inputPath, fileName)
diff --git a/parse_info_functions.py b/parse_info_functions.py
index d9c9044..541fa9a 100644
--- a/parse_info_functions.py
+++ b/parse_info_functions.py
@@ -1,6 +1,7 @@
 import fitz  # PyMuPDF / fitz # For reading PDF
 import re
 import os
+from constants import END_KEYWORDS, KEYWORDS
 
 
 def collectYearManuscriptCode(file_name, output):
@@ -140,6 +141,7 @@ def parseInfoGeneral(infoLines, output):
     return output
 
 
+# Fitz used here
 # Assumes all sections are present, whether they have info or not
 def findInfoPersee(page, citeThisDocRec, pdf_path):
     # Reminder: Any field may be missing
@@ -235,37 +237,7 @@ def findInfoJSTOR(page, pdf_path):
     else:
         output = {"type_of_reference": "JOUR"}
 
-    # Get list of lines of text, with fonts and line size
-    lis = []
-    for i in page.get_text("dict")["blocks"]:
-        try:
-            lines = i["lines"]
-            for line in range(len(lines)):
-                for k in range(len(lines[line]["spans"])):
-                    li = list(
-                        (
-                            lines[line]["spans"][k]["text"],
-                            i["lines"][line]["spans"][k]["font"],
-                            round(i["lines"][line]["spans"][k]["size"]),
-                        )
-                    )
-                    lis.append(li)
-        except KeyError:
-            pass
-    # Get list of only relevant lines of text
-    keywords = ["Author(s):", "Source:", "Published"]
-    j = 0
-    curStr = ""
-    infoLines = []
-    for i in range(len(lis)):
-        if j >= len(keywords):
-            break
-        if lis[i][0].startswith(keywords[j]):
-            infoLines.append(curStr)
-            curStr = lis[i][0]
-            j += 1
-        else:
-            curStr += lis[i][0]
+    infoLines = getInfoGeneral(page)
 
     if not infoLines:
         print("Update: Didn't find title, searching file name")
@@ -299,3 +271,39 @@ def findInfoJSTOR(page, pdf_path):
                     output["end_page"] = endPage
 
     return output, 1
+
+
+# Fitz used here
+def getInfoGeneral(page):
+    global numOther
+    numOther += 1
+    # Get list of lines of text, with fonts and line size
+    lis = []
+    for i in page.get_text("dict")["blocks"]:
+        try:
+            lines = i["lines"]
+            for line in range(len(lines)):
+                for k in range(len(lines[line]["spans"])):
+                    li = list(
+                        (
+                            lines[line]["spans"][k]["text"],
+                            i["lines"][line]["spans"][k]["font"],
+                            round(i["lines"][line]["spans"][k]["size"]),
+                        )
+                    )
+                    lis.append(li)
+        except KeyError:
+            pass
+    # Get list of only relevant lines of text
+    curStr = ""
+    infoLines = []
+    for i in range(len(lis)):
+        if lis[i][0].startswith(tuple(KEYWORDS)):
+            infoLines.append(curStr)
+            curStr = lis[i][0]
+        elif lis[i][0].startswith(tuple(END_KEYWORDS)):
+            infoLines.append(curStr)
+            curStr = ""
+        else:
+            curStr += lis[i][0]
+    return infoLines
diff --git a/tests/test_parse_info_functions.py b/tests/test_parse_info_functions.py
index 43adcd1..ebd5d15 100644
--- a/tests/test_parse_info_functions.py
+++ b/tests/test_parse_info_functions.py
@@ -143,6 +143,7 @@ def test_parseInfoGeneral(inputLines, output, expected):
     assert parseInfoGeneral(inputLines, output) == expected
 
 
+# Below tests not working
 JSTOR_title = (fitz.open(), ({}, 1))
 JSTOR_no_title = ({}, ({}, 2))
 f1 = ""