diff --git a/GoBooDo.py b/GoBooDo.py
index 5dcebfd..8af2eb3 100644
--- a/GoBooDo.py
+++ b/GoBooDo.py
@@ -12,7 +12,7 @@ import urllib3
 
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
-parser = argparse.ArgumentParser(description='A tutorial of argparse!')
+parser = argparse.ArgumentParser(description='Welcome to GoBooDo')
 parser.add_argument("--id")
 args = parser.parse_args()
 
@@ -68,8 +68,8 @@ def createPageDict(self,jsonResponse):
 
     def getInitialData(self):
         initUrl = "https://books.google." + self.country + "/books?id=" + self.id + "&printsec=frontcover"
-        page_data = requests.get(initUrl, headers=self.head, verify=False)
-        soup = BeautifulSoup(page_data.content, "html5lib")
+        pageData = requests.get(initUrl, headers=self.head, verify=False)
+        soup = BeautifulSoup(pageData.content, "html5lib")
         self.name = soup.findAll("title")[0].contents[0]
         print(f'Downloading {self.name[:-15]}')
         if self.found == False:
@@ -89,7 +89,7 @@ def getInitialData(self):
                 with open(os.path.join(self.dataPath,'pageLinkDict.pkl'),'rb') as ofile:
                     self.pageLinkDict = pickle.load(ofile)
             except:
-                print('Please delete the corresponding folder and start again')
+                print('Please delete the corresponding folder and start again, or the book is not available for preview.')
                 exit(0)
 
     def insertIntoPageDict(self, subsequentJsonData):
@@ -118,15 +118,15 @@ def fetchPageLinks(self,proxy=None):
             try:
                 self.b_url = "https://books.google."+self.country+"/books?id=" + str(self.id) + "&pg=" +\
                              str(self.pageList[0]) + "&jscmd=click3"
-                page_data = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
+                pageData = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
             except Exception as e:
-                print(e)
-            return page_data.json()
+                print('Could not connect with this proxy')
+            return pageData.json()
         else:
             self.b_url = "https://books.google."+self.country+"/books?id="+str(self.id)+"&pg="+ str(self.pageList[0]) \
                          +"&jscmd=click3"
-            page_data = requests.get(self.b_url, headers=self.head,verify=False)
-            return page_data.json()
+            pageData = requests.get(self.b_url, headers=self.head,verify=False)
+            return pageData.json()
 
     def processBook(self):
         downloadService = StoreImages(self.path,settings['proxy_images'])
@@ -135,14 +135,17 @@ def processBook(self):
         service.makePdf()
 
     def start(self):
-        self.getInitialData()
+        try:
+            self.getInitialData()
+        except:
+            exit(0)
         try:
             lastFirstList = self.pageList[0]
         except:
            print('There appears to be no page links to be fetched, fetching the images for downloaded links')
            return self.processBook()
         maxPageLimit = 0
-        maxPageLimitHIT = settings['max_retry_links']+1
+        maxPageLimitHit = settings['max_retry_links']+2
         proxyFlag = 0
         while True:
             try:
@@ -156,9 +159,9 @@ def start(self):
                 self.insertIntoPageDict(interimData)
             try:
                 if (maxPageLimit == self.pageList[0]):
-                    maxPageLimitHIT -= 1
-                    if (maxPageLimitHIT == 1):
-                        maxPageLimitHIT = settings['max_retry_links']+1
+                    maxPageLimitHit -= 1
+                    if (maxPageLimitHit == 1):
+                        maxPageLimitHit = settings['max_retry_links']+2
                         print(f'Could not fetch link for page {self.pageList[0]}')
                         self.obstinatePages.append(self.pageList[0])
                         self.pageList = self.pageList[1:]
@@ -178,15 +181,14 @@ def start(self):
         self.processBook()
 
 if __name__ == "__main__":
-    print('''
- .88888.            dP                         dP
-d8'   `88           88                         88
-88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
- `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
-
+    print('''
+ .88888.            888888ba                    888888ba
+d8'   `88           88    `8b                   88    `8b
+88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+ `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
     ''')
     book_id = args.id
     if(book_id==None or len(book_id)!=12):
diff --git a/README.md b/README.md
index 5fb40cf..2aa9848 100644
--- a/README.md
+++ b/README.md
@@ -2,17 +2,17 @@
 
 ### A google books downloader with proxy support.
 
-     .88888.            dP                         dP
-    d8'   `88           88                         88
-    88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-    88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-    Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
-     `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-    oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+     .88888.            888888ba                    888888ba
+    d8'   `88           88    `8b                   88    `8b
+    88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+    88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+    Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+     `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+    ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+
-GoBooDo is a python3 program which downloads the books which are available for preview at Google Books. It downloads high resolution image of those pages and combines them to save the file as a PDF. Since Google limits the amount of pages accessible to each individual majorly on the basis of IP address, GoBooDo uses proxies for circumventing that limit and maximizing the number of pages that can be accessed in the preview.
-
+GoBooDo is a **python3** program for downloading **previewable** books on Google Books. It downloads high-resolution images (1500 pixels wide) of the pages and combines them to save the file as a PDF.
 # Usage
 For downloading a book GoBooDo requires the book id which can be fetched from the url of the book. For instance consider the example below:
 ~~~
@@ -28,17 +28,16 @@ python GoBooDo.py --id=XUwOtdcIWdkC
 The configuration can be done in the settings.json and the description is as follows:
 ~~~
 {
-    "country":"co.in", //The TLD for the service that is being used.
-    "proxy_links":0, //0 to not allow proxy while fetching page links when current ip is banned otherwise 1.
-    "proxy_images":0, //0 to not allow proxy while fetching page images when current ip is banned otherwise 1.
+    "country":"co.in", //The TLD for the service that is being used, for example books.google.co.in or books.google.de.
+    "proxy_links":0, //0 for disabling proxy when fetching page links upon reaching the limit, 1 for enabling it.
+    "proxy_images":0, //0 for disabling proxy when fetching page images upon reaching the limit, 1 for enabling it.
     "max_retry_links":1, // max retries for fetching a link using proxies.
     "max_retry_images":1 // max retries for a fetching a image using proxies.
 }
 ~~~
 
-The output will be saved as a folder named the 'id' of the book which was given as input. The final PDF will be in the output folder inside it.
-Proxies may be added in proxies.txt (a sample proxy has been added already). After the program has finished running once, it will keep the state of the book saved and any subsequent runs will resume from the previous state of links and images.
-
+The output will be saved in a folder named after the 'id' of the book that was given as input. The final PDF will be in the output folder inside it, along with a folder containing the images.
+Proxies may be added in proxies.txt (a sample proxy has been added already).
 # Dependencies
 ~~~
 requests
@@ -48,6 +47,10 @@ fpdf
 html5lib
 ~~~
 
+# Features
+1. Stateful: GoBooDo keeps track of the books it downloads. On each subsequent run, only those links and images which were not fetched earlier are downloaded.
+2. Proxy support: Since Google limits the number of pages accessible to each individual, chiefly on the basis of IP address, GoBooDo uses proxies to circumvent that limit and maximize the number of pages that can be accessed in the preview.
+
 # Todo
 1. Add proxy integration with a checker.
 2. Make the system more robust from being detected by google.
diff --git a/proxies.txt b/proxies.txt
index 4a1b67b..dc5126d 100644
--- a/proxies.txt
+++ b/proxies.txt
@@ -1 +1,5 @@
-94.20.21.37:3128
\ No newline at end of file
+203.190.53.45:53927
+85.90.215.111:3128
+46.130.117.84:8080
+1.10.189.156:34899
+115.124.86.107:37600
\ No newline at end of file
diff --git a/settings.json b/settings.json
index 056f18a..b6b4b30 100644
--- a/settings.json
+++ b/settings.json
@@ -1,7 +1,7 @@
 {
-    "country":"co.in",
-    "proxy_links":0,
-    "proxy_images":0,
-    "max_retry_links":1,
-    "max_retry_images":1
+    "country": "de",
+    "proxy_links": 0,
+    "proxy_images": 0,
+    "max_retry_links": 1,
+    "max_retry_images": 1
 }
\ No newline at end of file
diff --git a/storeImages.py b/storeImages.py
index 6caf9d5..975d2ef 100644
--- a/storeImages.py
+++ b/storeImages.py
@@ -8,7 +8,7 @@
 
 class StoreImages:
     def __init__(self,bookpath,proxyflag):
-        self.proxyflag = proxyflag
+        self.proxyFlag = proxyflag
         self.bookPath = bookpath
         self.imagePath = os.path.join(self.bookPath,'Images')
         self.pagesFetched = {}
@@ -21,7 +20,6 @@ def __init__(self,bookpath,proxyflag):
 
             path = os.path.join(bookpath, 'data', "pagesFetched.pkl")
             with open(path, 'rb') as ofile:
                 self.pagesFetched = pickle.load(ofile)
-            path = os.path.join(bookpath, 'data', "pageLinkDict.pkl")
         with open(path, 'rb') as ofile:
             allPages = pickle.load(ofile)
@@ -54,36 +53,41 @@ def resethead(self):
 
     def getImages(self,retries):
         self.resethead()
-        for page_data in self.PageLinkDict.keys():
+        for pageData in self.PageLinkDict.keys():
             try:
-                link = self.PageLinkDict[page_data]['src']
+                link = self.PageLinkDict[pageData]['src']
                 if not link:
                     continue
-                page_number = self.PageLinkDict[page_data]['order']
+                pageNumber = self.PageLinkDict[pageData]['order']
                 checkIfPageFetched = retries
                 while checkIfPageFetched > 0:
-                    if checkIfPageFetched != retries and self.proxyflag :
+                    proxyFailed = False
+                    if checkIfPageFetched != retries and self.proxyFlag :
                         proxy = self.getProxy()
                         proxyDict = {
                             "http": 'http://' + proxy,
                             "https": 'https://' + proxy,
                         }
-                        print(f'Using {proxy} for the image of page {page_number}')
-                        pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+                        print(f'Using {proxy} for the image of page {pageNumber}')
+                        proxyFailed = False
+                        try:
+                            pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+                        except:
+                            proxyFailed = True
                     else:
                         pageImage = requests.get(link + '&w=1500', headers=self.head,verify=False)
-                    if len(pageImage.content) == 98670:
+                    if len(pageImage.content) == 98670 or proxyFailed:
                         self.resethead()
                         checkIfPageFetched -= 1
                     else:
                         checkIfPageFetched = -1
-                        print(f'Fetched image for page {page_number}')
-                        self.pagesFetched[page_data]=self.PageLinkDict[page_data]
+                        print(f'Fetched image for page {pageNumber}')
+                        self.pagesFetched[pageData]=self.PageLinkDict[pageData]
                         im = Image.open(BytesIO(pageImage.content))
-                        im.save(os.path.join(self.imagePath,str(page_number)+".png"))
+                        im.save(os.path.join(self.imagePath,str(pageNumber)+".png"))
                 else:
                     if(checkIfPageFetched==0):
-                        print("Could not fetch the image of " + str(page_number))
+                        print("Could not fetch the image for page " + str(pageNumber))
             except Exception as e:
                 print(e)
         with open(os.path.join(self.bookPath,'data','pagesFetched.pkl'),'wb+') as ofile:
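
Note: the retry flow that the storeImages.py hunk implements can be sketched in isolation roughly as follows. This is a minimal illustrative sketch, not the project's code: fetch_page_image and the get_proxy callable are hypothetical names, and the only details taken from the patch are that a direct request is tried before any proxy, a proxy connection error counts as a failed attempt, and a 98670-byte body is the placeholder image Google returns for a blocked page.

import requests

PLACEHOLDER_SIZE = 98670  # byte size of Google's "page unavailable" placeholder, as checked in the patch

def fetch_page_image(link, headers, get_proxy, retries):
    """Fetch one page image, falling back to proxies after a direct attempt."""
    for attempt in range(retries):
        proxies = None
        if attempt > 0:  # first attempt is direct; later attempts go through a proxy
            proxy = get_proxy()
            proxies = {'http': 'http://' + proxy, 'https': 'https://' + proxy}
        try:
            response = requests.get(link + '&w=1500', headers=headers,
                                    proxies=proxies, verify=False, timeout=10)
        except requests.RequestException:
            continue  # dead proxy or network error counts as a failed attempt
        if len(response.content) != PLACEHOLDER_SIZE:
            return response.content  # a real page image
    return None  # every attempt was blocked or failed

Comparing the response size rather than the HTTP status appears to be deliberate here, since Google returns a 200 response with a placeholder image once the preview quota for an IP is exhausted.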