From 26aa28a487009725e25c900c52456e30030f414a Mon Sep 17 00:00:00 2001
From: bosd <c5e2fd43-d292-4c90-9d1f-74ff3436329a@anonaddy.me>
Date: Tue, 7 Feb 2023 06:24:03 +0100
Subject: [PATCH] tesseract: [FIX] pdf pre-processing

Detect the MIME type of the input file (instead of checking for a literal ".pdf" suffix) so that PDF pre-processing also works when invoice2data is used as a library.
---
 src/invoice2data/input/tesseract.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py
index 3883f85b..c7023d57 100644
--- a/src/invoice2data/input/tesseract.py
+++ b/src/invoice2data/input/tesseract.py
@@ -2,6 +2,7 @@
 
 from distutils import spawn
 import tempfile
+import mimetypes
 
 from subprocess import Popen, PIPE, STDOUT, CalledProcessError, TimeoutExpired
 from subprocess import run
@@ -54,8 +55,10 @@ def to_text(path):
         "-append",
         "png:-",
     ]
-    if path.endswith(".pdf"):
-        # pre processing pdf file by converting to png
+    mt = mimetypes.guess_type(path)
+    if mt[0] == "application/pdf":
+        # tesseract does not support pdf files, pre-processing is needed.
+        logger.debug("PDF file detected, start pre-processing by converting to png")
         p1 = Popen(convert, stdout=PIPE)
         tess_input = "stdin"
         stdin = p1.stdout