Better exception handling
vaibhavk97 committed Feb 5, 2020
1 parent 39f953e commit 73db9ae
Showing 5 changed files with 69 additions and 56 deletions.
46 changes: 24 additions & 22 deletions GoBooDo.py
@@ -12,7 +12,7 @@
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-parser = argparse.ArgumentParser(description='A tutorial of argparse!')
+parser = argparse.ArgumentParser(description='Welcome to GoBooDo')

parser.add_argument("--id")
args = parser.parse_args()
@@ -68,8 +68,8 @@ def createPageDict(self,jsonResponse):

def getInitialData(self):
initUrl = "https://books.google." + self.country + "/books?id=" + self.id + "&printsec=frontcover"
-page_data = requests.get(initUrl, headers=self.head, verify=False)
-soup = BeautifulSoup(page_data.content, "html5lib")
+pageData = requests.get(initUrl, headers=self.head, verify=False)
+soup = BeautifulSoup(pageData.content, "html5lib")
self.name = soup.findAll("title")[0].contents[0]
print(f'Downloading {self.name[:-15]}')
if self.found == False:
@@ -89,7 +89,7 @@ def getInitialData(self):
with open(os.path.join(self.dataPath,'pageLinkDict.pkl'),'rb') as ofile:
self.pageLinkDict = pickle.load(ofile)
except:
-print('Please delete the corresponding folder and start again')
+print('Please delete the corresponding folder and start again or the book is not available for preview.')
exit(0)

def insertIntoPageDict(self, subsequentJsonData):
@@ -118,15 +118,15 @@ def fetchPageLinks(self,proxy=None):
try:
self.b_url = "https://books.google."+self.country+"/books?id=" + str(self.id) + "&pg=" +\
str(self.pageList[0]) + "&jscmd=click3"
-page_data = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
+pageData = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
except Exception as e:
-print(e)
-return page_data.json()
+print('Could not connect with this proxy')
+return pageData.json()
else:
self.b_url = "https://books.google."+self.country+"/books?id="+str(self.id)+"&pg="+ str(self.pageList[0]) \
+"&jscmd=click3"
-page_data = requests.get(self.b_url, headers=self.head,verify=False)
-return page_data.json()
+pageData = requests.get(self.b_url, headers=self.head,verify=False)
+return pageData.json()

def processBook(self):
downloadService = StoreImages(self.path,settings['proxy_images'])
@@ -135,14 +135,17 @@ def processBook(self):
service.makePdf()

def start(self):
-self.getInitialData()
+try:
+self.getInitialData()
+except:
+exit(0)
try:
lastFirstList = self.pageList[0]
except:
print('There appears to be no page links to be fetched, fetching the images for downloaded links')
return self.processBook()
maxPageLimit = 0
-maxPageLimitHIT = settings['max_retry_links']+1
+maxPageLimitHit = settings['max_retry_links']+2
proxyFlag = 0
while True:
try:
@@ -156,9 +159,9 @@
self.insertIntoPageDict(interimData)
try:
if (maxPageLimit == self.pageList[0]):
-maxPageLimitHIT -= 1
-if (maxPageLimitHIT == 1):
-maxPageLimitHIT = settings['max_retry_links']+1
+maxPageLimitHit -= 1
+if (maxPageLimitHit == 1):
+maxPageLimitHit = settings['max_retry_links']+2
print(f'Could not fetch link for page {self.pageList[0]}')
self.obstinatePages.append(self.pageList[0])
self.pageList = self.pageList[1:]
@@ -178,15 +181,14 @@
self.processBook()

if __name__ == "__main__":
-
print('''
- .88888.           dP                               dP
-d8'   `88          88                               88
-88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
- `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+ .88888.            888888ba                    888888ba
+d8'   `88           88    `8b                   88    `8b
+88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+ `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
''')
book_id = args.id
if(book_id==None or len(book_id)!=12):
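A note on the `+1` → `+2` change visible above: below is a standalone sketch of the countdown, under the assumption (taken from the visible hunks) that a retry is an iteration in which `pageList[0]` did not change. It shows how many consecutive failures a page now gets before being added to `obstinatePages`:
~~~
# Sketch only: models the retry countdown in GoBooDo.start() after this commit.
max_retry_links = 1                      # value from settings.json
maxPageLimitHit = max_retry_links + 2    # now starts at 3 instead of 2

consecutive_failures = 0
while True:
    maxPageLimitHit -= 1                 # one more failed attempt on the same page
    consecutive_failures += 1
    if maxPageLimitHit == 1:             # same check as in the diff above
        break                            # the page would now join obstinatePages

print(consecutive_failures)              # 2, i.e. max_retry_links + 1 attempts
~~~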
33 changes: 18 additions & 15 deletions README.md
@@ -2,17 +2,17 @@
### A google books downloader with proxy support.


- .88888.           dP                               dP
-d8'   `88          88                               88
-88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
- `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+ .88888.            888888ba                    888888ba
+d8'   `88           88    `8b                   88    `8b
+88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+ `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo


-GoBooDo is a python3 program which downloads the books which are available for preview at Google Books. It downloads high resolution image of those pages and combines them to save the file as a PDF. Since Google limits the amount of pages accessible to each individual majorly on the basis of IP address, GoBooDo uses proxies for circumventing that limit and maximizing the number of pages that can be accessed in the preview.

+GoBooDo is a **python3** program for downloading **previewable** books from Google Books. It downloads high-resolution images (1500 px wide) of the pages and combines them into a PDF.
# Usage
To download a book, GoBooDo requires the book id, which can be taken from the URL of the book. For instance, consider the example below:
~~~
@@ -28,17 +28,16 @@
python GoBooDo.py --id=XUwOtdcIWdkC
~~~
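As an illustrative aside: the book id is the value of the `id` query parameter in the book's preview URL. The URL below is a hypothetical example, assembled the same way `getInitialData` builds its initial request in GoBooDo.py:
~~~
https://books.google.co.in/books?id=XUwOtdcIWdkC&printsec=frontcover
~~~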
Configuration is done in settings.json; the fields are described below:
~~~
{
"country":"co.in", //The TLD for the service that is being used.
"proxy_links":0, //0 to not allow proxy while fetching page links when current ip is banned otherwise 1.
"proxy_images":0, //0 to not allow proxy while fetching page images when current ip is banned otherwise 1.
"country":"co.in", //The TLD for the service that is being used for example books.google.co.in or books.google.de
"proxy_links":0, //0 for disabling proxy when fetching page links upon reaching the limit.
"proxy_images":0, //0 for disabling proxy when fetching page images upon reaching the limit.
"max_retry_links":1, // max retries for fetching a link using proxies.
"max_retry_images":1 // max retries for a fetching a image using proxies.
}
~~~

-The output will be saved as a folder named the 'id' of the book which was given as input. The final PDF will be in the output folder inside it.
-Proxies may be added in proxies.txt (a sample proxy has been added already). After the program has finished running once, it will keep the state of the book saved and any subsequent runs will resume from the previous state of links and images.

+The output will be saved in a folder named after the 'id' of the book that was given as input. The final PDF will be in the output folder inside it, along with a folder containing the images.
+Proxies may be added in proxies.txt (a sample proxy has already been added).
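For reference, proxies.txt expects one `ip:port` entry per line, matching the sample entries added to proxies.txt later in this commit:
~~~
203.190.53.45:53927
85.90.215.111:3128
~~~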
# Dependencies
~~~
requests
@@ -48,6 +47,10 @@
fpdf
html5lib
~~~
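An aside: the names above are import/package names. Assuming a standard Python 3 setup, and assuming Pillow (which provides the `PIL` import used in storeImages.py) sits in the collapsed part of this list, they can typically be installed with:
~~~
pip install requests Pillow fpdf html5lib
~~~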

+# Features
+1. Stateful: GoBooDo keeps track of the pages it has already downloaded; on each subsequent run, only the links and images that were not fetched earlier are requested (see the sketch after this list).
+2. Proxy support: since Google limits the number of preview pages accessible to each individual, mainly on the basis of IP address, GoBooDo uses proxies to circumvent that limit and maximize the number of pages that can be accessed.
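A minimal sketch of the stateful resume in feature 1, assuming the pickle files seen in storeImages.py (pageLinkDict.pkl for all known page links, pagesFetched.pkl for completed pages); every other name here is illustrative:
~~~
import os
import pickle

def load_state(data_path, name):
    # Return previously saved progress, or an empty dict on a first run.
    path = os.path.join(data_path, name)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return {}

# The two files GoBooDo keeps under <book_id>/data (per storeImages.py).
page_links = load_state('XUwOtdcIWdkC/data', 'pageLinkDict.pkl')
fetched = load_state('XUwOtdcIWdkC/data', 'pagesFetched.pkl')

# Re-runs only touch what is still missing.
remaining = {k: v for k, v in page_links.items() if k not in fetched}
~~~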

# Todo
1. Add proxy integration with a checker.
2. Make the system more robust against detection by Google.
6 changes: 5 additions & 1 deletion proxies.txt
@@ -1 +1,5 @@
-94.20.21.37:3128
+203.190.53.45:53927
+85.90.215.111:3128
+46.130.117.84:8080
+1.10.189.156:34899
+115.124.86.107:37600
10 changes: 5 additions & 5 deletions settings.json
@@ -1,7 +1,7 @@
{
"country":"co.in",
"proxy_links":0,
"proxy_images":0,
"max_retry_links":1,
"max_retry_images":1
"country": "de",
"proxy_links": 0,
"proxy_images": 0,
"max_retry_links": 1,
"max_retry_images": 1
}
30 changes: 17 additions & 13 deletions storeImages.py
@@ -8,7 +8,7 @@
class StoreImages:

def __init__(self,bookpath,proxyflag):
-self.proxyflag = proxyflag
+self.proxyFlag = proxyflag
self.bookPath = bookpath
self.imagePath = os.path.join(self.bookPath,'Images')
self.pagesFetched = {}
@@ -21,7 +21,6 @@ def __init__(self,bookpath,proxyflag):
path = os.path.join(bookpath, 'data', "pagesFetched.pkl")
with open(path, 'rb') as ofile:
self.pagesFetched = pickle.load(ofile)
-
path = os.path.join(bookpath, 'data', "pageLinkDict.pkl")
with open(path, 'rb') as ofile:
allPages = pickle.load(ofile)
@@ -54,36 +53,41 @@ def resethead(self):

def getImages(self,retries):
self.resethead()
-for page_data in self.PageLinkDict.keys():
+for pageData in self.PageLinkDict.keys():
try:
-link = self.PageLinkDict[page_data]['src']
+link = self.PageLinkDict[pageData]['src']
if not link:
continue
-page_number = self.PageLinkDict[page_data]['order']
+pageNumber = self.PageLinkDict[pageData]['order']
checkIfPageFetched = retries
while checkIfPageFetched > 0:
-if checkIfPageFetched != retries and self.proxyflag :
+proxyFailed = False
+if checkIfPageFetched != retries and self.proxyFlag :
proxy = self.getProxy()
proxyDict = {
"http": 'http://' + proxy,
"https": 'https://' + proxy,
}
-print(f'Using {proxy} for the image of page {page_number}')
-pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+print(f'Using {proxy} for the image of page {pageNumber}')
+proxyFailed = False
+try:
+pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+except:
+proxyFailed = True
else:
pageImage = requests.get(link + '&w=1500', headers=self.head,verify=False)
-if len(pageImage.content) == 98670:
+if len(pageImage.content) == 98670 or proxyFailed:
self.resethead()
checkIfPageFetched -= 1
else:
checkIfPageFetched = -1
-print(f'Fetched image for page {page_number}')
-self.pagesFetched[page_data]=self.PageLinkDict[page_data]
+print(f'Fetched image for page {pageNumber}')
+self.pagesFetched[pageData]=self.PageLinkDict[pageData]
im = Image.open(BytesIO(pageImage.content))
-im.save(os.path.join(self.imagePath,str(page_number)+".png"))
+im.save(os.path.join(self.imagePath,str(pageNumber)+".png"))
else:
if(checkIfPageFetched==0):
print("Could not fetch the image of " + str(page_number))
print("Could not fetch the image for page " + str(pageNumber))
except Exception as e:
print(e)
with open(os.path.join(self.bookPath,'data','pagesFetched.pkl'),'wb+') as ofile:
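The heart of the "better exception handling" in this file, restated as a hedged standalone sketch: a failed proxied request is now caught and treated like the placeholder response, so the loop rotates identity and retries instead of crashing. The function name, the narrowed exception type, and the reading of 98670 bytes as the "page unavailable" placeholder image are assumptions:
~~~
import requests

def fetch_page_image(link, head, proxy_dict=None):
    # Sketch of the retry-relevant core of StoreImages.getImages.
    proxy_failed = False
    try:
        page_image = requests.get(link + '&w=1500', headers=head,
                                  proxies=proxy_dict, verify=False)
    except requests.RequestException:    # dead proxy, timeout, refused connection
        proxy_failed = True
        page_image = None
    # A 98670-byte body is taken to be the "page unavailable" placeholder image;
    # a failed proxy request is now handled the same way: signal a retry.
    if proxy_failed or len(page_image.content) == 98670:
        return None                      # caller decrements its retry counter
    return page_image.content
~~~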
