Skip to content

Commit

Permalink
Update makePDF.py for OCR
Browse files: browse the repository at this point in the history
  • Loading branch information
minamotorin authored Sep 28, 2021
1 parent 6a2942f commit 5fe2bde
Showing 1 changed file with 18 additions and 1 deletion.
19 changes: 18 additions & 1 deletion makePDF.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,6 +2,9 @@
import os
from PIL import Image
from tqdm import tqdm
from pytesseract import image_to_pdf_or_hocr
import PyPDF2
from io import BytesIO

class createBook:

Expand All @@ -22,4 +25,18 @@ def makePdf(self):
os.mkdir(os.path.join(self.path,'Output'))
name = str(self.name[:min(10,len(self.name))]).replace(" ","")
name = ''.join(ch for ch in name if ch.isalnum()) + ".pdf"
pdf.output(os.path.join(self.path,'Output',name),"F")
pdf.output(os.path.join(self.path,'Output',name),"F")

def ocrPdf(self, lang=None):
    """Build a single OCR'd PDF from the book's page images.

    Every path in ``self.imageNameList`` is opened as an image and handed
    to ``image_to_pdf_or_hocr``; the first page of each returned document
    is appended to one combined PDF, which is then written to
    ``<self.path>/Output/<name>.pdf``.

    The file name is the first 10 characters of ``self.name`` with spaces
    and every other non-alphanumeric character removed.
    NOTE(review): this is the same naming recipe used at the end of
    ``makePdf``, so the two methods presumably write to the same file --
    confirm that overwriting is intended.

    Args:
        lang: Language argument forwarded unchanged to
            ``image_to_pdf_or_hocr``. ``None`` (the default) leaves the
            choice to pytesseract.
    """
    writer = PyPDF2.PdfFileWriter()
    for pagePath in tqdm(self.imageNameList):
        # Run OCR while the image's file handle is still open.
        with open(pagePath, 'rb') as imageFile:
            image = Image.open(imageFile)
            pageBytes = image_to_pdf_or_hocr(image, lang=lang)
        # Each OCR result is parsed as its own document; keep its first page.
        ocrPage = PyPDF2.PdfFileReader(BytesIO(pageBytes)).getPage(0)
        writer.addPage(ocrPage)

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    outputDir = os.path.join(self.path, 'Output')
    os.makedirs(outputDir, exist_ok=True)

    name = str(self.name[:10]).replace(" ", "")
    name = ''.join(ch for ch in name if ch.isalnum()) + ".pdf"
    with open(os.path.join(outputDir, name), 'wb') as outputFile:
        writer.write(outputFile)

0 comments on commit 5fe2bde

Please sign in to comment.