Skip to content

Commit

Permalink
tesseract: [FIX] pdf pre-processing
Browse files Browse the repository at this point in the history
Detect the MIME type of the input file, including when invoice2data is used as a library.
  • Loading branch information
bosd committed Feb 11, 2023
1 parent c0cae5a commit 26aa28a
Showing 1 changed file with 5 additions and 2 deletions.
7 changes: 5 additions & 2 deletions src/invoice2data/input/tesseract.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from distutils import spawn
import tempfile
import mimetypes

from subprocess import Popen, PIPE, STDOUT, CalledProcessError, TimeoutExpired
from subprocess import run
Expand Down Expand Up @@ -54,8 +55,10 @@ def to_text(path):
"-append",
"png:-",
]
if path.endswith(".pdf"):
# pre processing pdf file by converting to png
mt = mimetypes.guess_type(path)
if mt[0] == "application/pdf":
# tesseract does not support pdf files, pre-processing is needed.
logger.debug("PDF file detected, start pre-processing by converting to png")
p1 = Popen(convert, stdout=PIPE)
tess_input = "stdin"
stdin = p1.stdout
Expand Down

0 comments on commit 26aa28a

Please sign in to comment.