From 57fc217ca8d9f3d84f59156414c43e798a76fee8 Mon Sep 17 00:00:00 2001
From: JSIV <scrudato@umich.edu>
Date: Sun, 5 Feb 2023 20:03:35 -0500
Subject: [PATCH] Line 43 of cli.pawls.preprocessors.tesseract in
 extract_page_tokens() fails when the underlying text datatype is not actually
 text. I assume this is rare but is dependent on the original source PDF
 authoring tool. I have a pdf where one page only has a number on it and it
 appears the data type that is extracted to the dataframe is float64. This
 fails with the extract_page_tokens() function as written. Added .astype(str)
 to line 43 to force conversion to string, which should cover these kinds of
 corner cases. Working for me at least on the pdf that was crashing the
 parser. (#199)

---
 cli/pawls/preprocessors/tesseract.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cli/pawls/preprocessors/tesseract.py b/cli/pawls/preprocessors/tesseract.py
index 0c48f257..d0bd49b2 100644
--- a/cli/pawls/preprocessors/tesseract.py
+++ b/cli/pawls/preprocessors/tesseract.py
@@ -40,7 +40,7 @@ def extract_page_tokens(
                     gp["width"].max(),
                     gp["height"].max(),
                     gp["conf"].mean(),
-                    gp["text"].str.cat(sep=" "),
+                    gp["text"].astype(str).str.cat(sep=" "),
                 ]
             )
         )