Skip to content

Commit

Permalink
tesseract support added
Browse files Browse the repository at this point in the history
  • Loading branch information
vaibhavk97 committed Feb 26, 2020
1 parent a009b47 commit 886ef7c
Show file tree
Hide file tree
Showing 7 changed files with 31 additions and 66 deletions.
4 changes: 2 additions & 2 deletions GoBooDo.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def getInitialData(self):
stringResponse = ("["+scripts[-4].text.split("_OC_Run")[1][1:-2]+"]")
jsonResponse = json.loads(stringResponse)
self.createPageDict(jsonResponse)
print(f'The total pages available for fetching are {len(self.pageList)}')
print(f'Pages to be fetched in the current iteration are : {len(self.pageList)}')
for elem in jsonResponse[3]['page']:
page = elem['pid']
self.pageList.remove(page)
Expand Down Expand Up @@ -140,7 +140,7 @@ def fetchPageLinks(self,proxy=None):

def processBook(self):
print('------------------- Fetching Images -------------------')
downloadService = StoreImages(self.path,settings['proxy_images'],settings['page_resolution'],settings['empty_image_size'])
downloadService = StoreImages(self.path,settings['proxy_images'],settings['page_resolution'],settings['tesseract_path'])
downloadService.getImages(settings['max_retry_images']+1)
print('------------------- Creating PDF -------------------')
service = createBook(self.name, self.path)
Expand Down
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ The configuration can be done in the settings.json and the description is as fol
{
"country":"co.in", // The TLD for the service that is being used for example books.google.co.in or books.google.de
"page_resolution": 1500, // The resolution of the page in dpi.
"empty_image_size": 98670, // The size of empty image, it refers to the size of "Image not available pages".
"tesseract_path": "C:\\program Files\\Tesseract-OCR\\tesseract.exe", // The path to the Tesseract engine, used if it is not available via environment variables.
"proxy_links":0, // 0 for disabling proxy when fetching page links upon reaching the limit.
"proxy_images":0, // 0 for disabling proxy when fetching page images upon reaching the limit.
"max_retry_links":1, // Max retries for fetching a link using proxies.
Expand All @@ -42,8 +42,7 @@ The configuration can be done in the settings.json and the description is as fol
The output will be saved as a folder named the 'id' of the book which was given as input. The final PDF will be in the output folder inside it along with a folder containing the images.
Proxies may be added in proxies.txt (a sample proxy has been added already).

For changing "empty_image_size" according to the resolution of the page set in page_resolution, run setEmptyImageSize.py. The resolution is set with respect to books scanned in **portrait** orientation,
which is the case for most of them. However, it can still be the case that this page is present in the final PDF.
GoBooDo now uses Tesseract to identify empty images fetched from valid links. Please install and configure Tesseract beforehand to avoid any errors related to it. The path in the settings is for a Windows installation; on a Linux distribution, configure Tesseract accordingly.

The breakup of the files downloaded is as follows:
~~~
Expand All @@ -62,6 +61,8 @@ bs4
Pillow
fpdf
html5lib
tqdm
pytesseract
~~~

# Features
Expand Down
4 changes: 3 additions & 1 deletion makePDF.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,6 @@ def makePdf(self):
pdf.image(pagePath,0,0)
if not os.path.exists(os.path.join(self.path,'Output')):
os.mkdir(os.path.join(self.path,'Output'))
pdf.output(os.path.join(self.path,'Output',str(self.name[:min(10,len(self.name))])+".pdf"),"F")
name = str(self.name[:min(10,len(self.name))]).replace(" ","")
name = ''.join(ch for ch in name if ch.isalnum()) + ".pdf"
pdf.output(os.path.join(self.path,'Output',name),"F")
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ bs4
Pillow
fpdf
html5lib
tqdm
pytesseract
55 changes: 0 additions & 55 deletions setEmptyImageSize.py

This file was deleted.

4 changes: 2 additions & 2 deletions settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"country": "co.in",
"page_resolution": 1500,
"empty_image_size": 98670,
"page_resolution": 500,
"tesseract_path": "C:\\program Files\\Tesseract-OCR\\tesseract.exe",
"proxy_links": 0,
"proxy_images": 0,
"max_retry_links": 1,
Expand Down
21 changes: 18 additions & 3 deletions storeImages.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytesseract
from PIL import Image
import requests
import pickle
Expand All @@ -7,11 +8,11 @@

class StoreImages:

def __init__(self,bookpath,proxyflag,emptyImageSize,resolution):
self.emptyImageSize = emptyImageSize
def __init__(self,bookpath,proxyflag,resolution,tesserPath):
self.pageResolution = resolution
self.proxyFlag = proxyflag
self.bookPath = bookpath
self.tesserPath = tesserPath
self.imagePath = os.path.join(self.bookPath,'Images')
self.pagesFetched = {}
self.PageLinkDict = {}
Expand Down Expand Up @@ -53,6 +54,20 @@ def resethead(self):
except:
pass

def pageEmpty(self,image):
    """Return True when the fetched page is the "image not available" placeholder.

    The raw image bytes are decoded, shrunk to one fifth of their size,
    binarised and passed through Tesseract OCR; the page counts as empty
    when the recognised text is exactly ``image not available``.

    Args:
        image: Raw bytes of the downloaded page image.

    Returns:
        bool: True if the OCR text matches the placeholder, False otherwise.

    Raises:
        pytesseract.TesseractNotFoundError: If Tesseract cannot be located
            through the default command and no usable ``self.tesserPath``
            is configured (or the configured path is also invalid).
    """
    # Close the decoded source image as soon as the downscaled copy exists.
    with Image.open(BytesIO(image)) as im:
        width, height = im.size
        # Work on a 1/5-size copy (presumably to keep OCR fast); clamp to 1px
        # so very small images do not request a zero-sized resize.
        small = im.resize((max(1, width // 5), max(1, height // 5)))
    gray = small.convert('L')
    # Everything that is not near-white becomes black before OCR.
    bw = gray.point(lambda x: 0 if x < 250 else 255, '1')
    try:
        text = pytesseract.image_to_string(bw)
    except pytesseract.TesseractNotFoundError:
        # Default lookup failed: fall back to the path from the settings.
        # Only this error is handled so that unrelated failures (and
        # KeyboardInterrupt) are not masked by a pointless retry.
        if not self.tesserPath:
            raise
        pytesseract.pytesseract.tesseract_cmd = self.tesserPath
        text = pytesseract.image_to_string(bw)
    # Collapse every whitespace run (newlines, trailing form feed, padding)
    # so the comparison does not depend on how Tesseract terminates output.
    return ' '.join(text.split()) == 'image not available'


def getImages(self,retries):
self.resethead()
for pageData in self.PageLinkDict.keys():
Expand All @@ -79,7 +94,7 @@ def getImages(self,retries):
proxyFailed = True
else:
pageImage = requests.get(link + '&w=' + str(self.pageResolution), headers=self.head, verify=False)
if len(pageImage.content) == self.emptyImageSize or proxyFailed:
if self.pageEmpty(pageImage.content) or proxyFailed:
self.resethead()
checkIfPageFetched -= 1
else:
Expand Down

0 comments on commit 886ef7c

Please sign in to comment.