From 26aa28a487009725e25c900c52456e30030f414a Mon Sep 17 00:00:00 2001 From: bosd Date: Tue, 7 Feb 2023 06:24:03 +0100 Subject: [PATCH] tesseract: [FIX] PDF pre-processing: detect the mimetype of the input, also when using invoice2data as a library --- src/invoice2data/input/tesseract.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/invoice2data/input/tesseract.py b/src/invoice2data/input/tesseract.py index 3883f85b..c7023d57 100644 --- a/src/invoice2data/input/tesseract.py +++ b/src/invoice2data/input/tesseract.py @@ -2,6 +2,7 @@ from distutils import spawn import tempfile +import mimetypes from subprocess import Popen, PIPE, STDOUT, CalledProcessError, TimeoutExpired from subprocess import run @@ -54,8 +55,10 @@ def to_text(path): "-append", "png:-", ] - if path.endswith(".pdf"): - # pre processing pdf file by converting to png + mt = mimetypes.guess_type(path) + if mt[0] == "application/pdf": + # tesseract does not support pdf files, pre-processing is needed. + logger.debug("PDF file detected, start pre-processing by converting to png") p1 = Popen(convert, stdout=PIPE) tess_input = "stdin" stdin = p1.stdout