tesseract support added

minamotorin · Feb 26, 2020 · 886ef7c · 886ef7c
1 parent a009b47
commit 886ef7c
Show file tree

Hide file tree

Showing 7 changed files with 31 additions and 66 deletions.
diff --git a/GoBooDo.py b/GoBooDo.py
@@ -85,7 +85,7 @@ def getInitialData(self):
                 stringResponse = ("["+scripts[-4].text.split("_OC_Run")[1][1:-2]+"]")
             jsonResponse = json.loads(stringResponse)
             self.createPageDict(jsonResponse)
-            print(f'The total pages available for fetching are {len(self.pageList)}')
+            print(f'Pages to be fetched in the current iteration are : {len(self.pageList)}')
             for elem in jsonResponse[3]['page']:
                 page = elem['pid']
                 self.pageList.remove(page)
@@ -140,7 +140,7 @@ def fetchPageLinks(self,proxy=None):
 
     def processBook(self):
         print('------------------- Fetching Images -------------------')
-        downloadService = StoreImages(self.path,settings['proxy_images'],settings['page_resolution'],settings['empty_image_size'])
+        downloadService = StoreImages(self.path,settings['proxy_images'],settings['page_resolution'],settings['tesseract_path'])
         downloadService.getImages(settings['max_retry_images']+1)
         print('------------------- Creating PDF -------------------')
         service = createBook(self.name, self.path)

diff --git a/README.md b/README.md
@@ -30,7 +30,7 @@ The configuration can be done in the settings.json and the description is as fol
 {
   "country":"co.in", // The TLD for the service that is being used for example books.google.co.in or books.google.de
   "page_resolution": 1500, // The resoution of page in dpi.
-  "empty_image_size": 98670, // The size of empty image, it refers to the size of "Image not available pages".
+  "tesseract_path": "C:\\program Files\\Tesseract-OCR\\tesseract.exe", // The path to the Tesseract executable, used when Tesseract is not available via environment variables.
   "proxy_links":0,   // 0 for disabling proxy when fetching page links upon reaching the limit.
   "proxy_images":0,  // 0 for disabling proxy when fetching  page images upon reaching the limit.
   "max_retry_links":1, // Max retries for fetching a link using proxies.
@@ -42,8 +42,7 @@ The configuration can be done in the settings.json and the description is as fol
 The output will be saved as a folder named the 'id' of the book which was given as input. The final PDF will be in the output folder inside it along with a folder containing the images.
 Proxies may be added in proxies.txt (a sample proxy has been added already).
 
-For changing "empty_image_size" according to the resolution of the page set in page_resolution, run setEmptyImageSize.py. The resolution is set with respect to books scanned in **potrait** manner
-which is the case for most of them. However, it can still be the case that this page is present in the final PDF.
+GoBooDo now uses Tesseract to identify empty images fetched from valid links. Please configure Tesseract beforehand to avoid any errors related to it. The path in the settings is for a Windows installation; on a Linux distribution, please configure Tesseract accordingly.
 
 The breakup of the files downloaded is as follows:
 ~~~
@@ -62,6 +61,8 @@ bs4
 Pillow
 fpdf
 html5lib
+tqdm
+pytesseract
 ~~~
 
 # Features 

diff --git a/makePDF.py b/makePDF.py
@@ -20,4 +20,6 @@ def makePdf(self):
             pdf.image(pagePath,0,0)
         if not os.path.exists(os.path.join(self.path,'Output')):
             os.mkdir(os.path.join(self.path,'Output'))
-        pdf.output(os.path.join(self.path,'Output',str(self.name[:min(10,len(self.name))])+".pdf"),"F")
+        name = str(self.name[:min(10,len(self.name))]).replace(" ","")
+        name = ''.join(ch for ch in name if ch.isalnum()) + ".pdf"
+        pdf.output(os.path.join(self.path,'Output',name),"F")
diff --git a/requirements.txt b/requirements.txt
@@ -3,3 +3,5 @@ bs4
 Pillow
 fpdf
 html5lib
+tqdm
+pytesseract
diff --git a/setEmptyImageSize.py b/setEmptyImageSize.py
diff --git a/settings.json b/settings.json
@@ -1,7 +1,7 @@
 {
     "country": "co.in",
-    "page_resolution": 1500,
-    "empty_image_size": 98670,
+    "page_resolution": 500,
+    "tesseract_path": "C:\\program Files\\Tesseract-OCR\\tesseract.exe",
     "proxy_links": 0,
     "proxy_images": 0,
     "max_retry_links": 1,

diff --git a/storeImages.py b/storeImages.py
@@ -1,3 +1,4 @@
+import pytesseract
 from PIL import Image
 import requests
 import pickle
@@ -7,11 +8,11 @@
 
 class StoreImages:
 
-    def __init__(self,bookpath,proxyflag,emptyImageSize,resolution):
-        self.emptyImageSize = emptyImageSize
+    def __init__(self,bookpath,proxyflag,resolution,tesserPath):
         self.pageResolution = resolution
         self.proxyFlag = proxyflag
         self.bookPath = bookpath
+        self.tesserPath = tesserPath
         self.imagePath = os.path.join(self.bookPath,'Images')
         self.pagesFetched = {}
         self.PageLinkDict = {}
@@ -53,6 +54,20 @@ def resethead(self):
         except:
             pass
 
+    def pageEmpty(self,image):
+        """Report whether a fetched page is the "image not available" placeholder.
+
+        The raw image bytes are decoded, shrunk to a fifth of their size,
+        binarised and passed to Tesseract; the page counts as empty when the
+        recognised text is exactly the placeholder message.
+
+        :param image: raw bytes of the downloaded page image.
+        :return: True if the OCR text reads 'image not available', else False.
+        """
+        im = Image.open(BytesIO(image))
+        width, height = im.size
+        # Downscale before OCR; clamp to 1px so a tiny image cannot request
+        # a zero-sized resize.
+        im = im.resize((max(1, width // 5), max(1, height // 5)))
+        gray = im.convert('L')
+        # Keep only near-white pixels as white; everything else becomes black.
+        bw = gray.point(lambda x: 0 if x < 250 else 255, '1')
+        try:
+            text = pytesseract.image_to_string(bw)
+        except Exception:
+            # The first attempt uses the default tesseract command; on failure
+            # fall back to the configured path and retry once. Catching
+            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
+            pytesseract.pytesseract.tesseract_cmd = self.tesserPath
+            text = pytesseract.image_to_string(bw)
+        # Collapse all whitespace and ignore case so trailing newlines or form
+        # feeds in the OCR output do not defeat the comparison.
+        return ' '.join(text.split()).lower() == 'image not available'
+
+
     def getImages(self,retries):
         self.resethead()
         for pageData in self.PageLinkDict.keys():
@@ -79,7 +94,7 @@ def getImages(self,retries):
                                 proxyFailed = True
                         else:
                             pageImage = requests.get(link + '&w=' + str(self.pageResolution), headers=self.head, verify=False)
-                        if len(pageImage.content) == self.emptyImageSize or proxyFailed:
+                        if self.pageEmpty(pageImage.content) or proxyFailed:
                             self.resethead()
                             checkIfPageFetched -= 1
                         else:
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,5 @@ bs4 @@
     Pillow
     fpdf
     html5lib
+    tqdm
+    pytesseract