Better exception handling
vaibhavk97 committed Feb 5, 2020
1 parent 39f953e commit 73db9ae
Showing 5 changed files with 69 additions and 56 deletions.
46 changes: 24 additions & 22 deletions GoBooDo.py
@@ -12,7 +12,7 @@
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

-parser = argparse.ArgumentParser(description='A tutorial of argparse!')
+parser = argparse.ArgumentParser(description='Welcome to GoBooDo')

parser.add_argument("--id")
args = parser.parse_args()
@@ -68,8 +68,8 @@ def createPageDict(self,jsonResponse):

def getInitialData(self):
initUrl = "https://books.google." + self.country + "/books?id=" + self.id + "&printsec=frontcover"
-page_data = requests.get(initUrl, headers=self.head, verify=False)
-soup = BeautifulSoup(page_data.content, "html5lib")
+pageData = requests.get(initUrl, headers=self.head, verify=False)
+soup = BeautifulSoup(pageData.content, "html5lib")
self.name = soup.findAll("title")[0].contents[0]
print(f'Downloading {self.name[:-15]}')
if self.found == False:
@@ -89,7 +89,7 @@ def getInitialData(self):
with open(os.path.join(self.dataPath,'pageLinkDict.pkl'),'rb') as ofile:
self.pageLinkDict = pickle.load(ofile)
except:
-print('Please delete the corresponding folder and start again')
+print('Please delete the corresponding folder and start again or the book is not available for preview.')
exit(0)

def insertIntoPageDict(self, subsequentJsonData):
@@ -118,15 +118,15 @@ def fetchPageLinks(self,proxy=None):
try:
self.b_url = "https://books.google."+self.country+"/books?id=" + str(self.id) + "&pg=" +\
str(self.pageList[0]) + "&jscmd=click3"
-page_data = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
+pageData = requests.get(self.b_url, headers=self.head,proxies=proxyDict,verify=False)
except Exception as e:
-print(e)
-return page_data.json()
+print('Could not connect with this proxy')
+return pageData.json()
else:
self.b_url = "https://books.google."+self.country+"/books?id="+str(self.id)+"&pg="+ str(self.pageList[0]) \
+"&jscmd=click3"
-page_data = requests.get(self.b_url, headers=self.head,verify=False)
-return page_data.json()
+pageData = requests.get(self.b_url, headers=self.head,verify=False)
+return pageData.json()

def processBook(self):
downloadService = StoreImages(self.path,settings['proxy_images'])
@@ -135,14 +135,17 @@ def processBook(self):
service.makePdf()

def start(self):
-self.getInitialData()
+try:
+self.getInitialData()
+except:
+exit(0)
try:
lastFirstList = self.pageList[0]
except:
print('There appears to be no page links to be fetched, fetching the images for downloaded links')
return self.processBook()
maxPageLimit = 0
-maxPageLimitHIT = settings['max_retry_links']+1
+maxPageLimitHit = settings['max_retry_links']+2
proxyFlag = 0
while True:
try:
@@ -156,9 +159,9 @@
self.insertIntoPageDict(interimData)
try:
if (maxPageLimit == self.pageList[0]):
-maxPageLimitHIT -= 1
-if (maxPageLimitHIT == 1):
-maxPageLimitHIT = settings['max_retry_links']+1
+maxPageLimitHit -= 1
+if (maxPageLimitHit == 1):
+maxPageLimitHit = settings['max_retry_links']+2
print(f'Could not fetch link for page {self.pageList[0]}')
self.obstinatePages.append(self.pageList[0])
self.pageList = self.pageList[1:]
@@ -178,15 +181,14 @@
self.processBook()

if __name__ == "__main__":
-
print('''
- .88888.           dP                               dP
-d8'   `88          88                               88
-88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
- `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+ .88888.            888888ba                    888888ba
+d8'   `88           88    `8b                   88    `8b
+88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+ `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
''')
book_id = args.id
if(book_id==None or len(book_id)!=12):
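A note on the `+1` → `+2` change visible above: below is a standalone sketch of the countdown, under the assumption (taken from the visible hunks) that a retry is an iteration in which `pageList[0]` did not change. It shows how many consecutive failures a page now gets before being added to `obstinatePages`:
~~~
# Sketch only: models the retry countdown in GoBooDo.start() after this commit.
max_retry_links = 1                      # value from settings.json
maxPageLimitHit = max_retry_links + 2    # now starts at 3 instead of 2

consecutive_failures = 0
while True:
    maxPageLimitHit -= 1                 # one more failed attempt on the same page
    consecutive_failures += 1
    if maxPageLimitHit == 1:             # same check as in the diff above
        break                            # the page would now join obstinatePages

print(consecutive_failures)              # 2, i.e. max_retry_links + 1 attempts
~~~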
33 changes: 18 additions & 15 deletions README.md
@@ -2,17 +2,17 @@
### A google books downloader with proxy support.


- .88888.           dP                               dP
-d8'   `88          88                               88
-88        .d8888b. 88d888b. .d8888b. .d8888b. .d888b88 .d8888b.
-88   YP88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88 88'  `88
-Y8.   .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88 88.  .88
- `88888'  `88888P' 88Y8888' `88888P' `88888P' `88888P8 `88888P'
-oooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo
+ .88888.            888888ba                    888888ba
+d8'   `88           88    `8b                   88    `8b
+88        .d8888b. a88aaaa8P' .d8888b. .d8888b. 88     88 .d8888b.
+88   YP88 88'  `88  88   `8b. 88'  `88 88'  `88 88     88 88'  `88
+Y8.   .88 88.  .88  88    .88 88.  .88 88.  .88 88    .8P 88.  .88
+ `88888'  `88888P'  88888888P `88888P' `88888P' 8888888P  `88888P'
+ooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooooo


-GoBooDo is a python3 program which downloads the books which are available for preview at Google Books. It downloads high resolution image of those pages and combines them to save the file as a PDF. Since Google limits the amount of pages accessible to each individual majorly on the basis of IP address, GoBooDo uses proxies for circumventing that limit and maximizing the number of pages that can be accessed in the preview.

+GoBooDo is a **python3** program for downloading **previewable** books from Google Books. It downloads high-resolution images (1500 px wide) of the pages and combines them into a PDF.
# Usage
To download a book, GoBooDo requires the book id, which can be taken from the URL of the book. For instance, consider the example below:
~~~
@@ -28,17 +28,16 @@
python GoBooDo.py --id=XUwOtdcIWdkC
~~~
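As an illustrative aside: the book id is the value of the `id` query parameter in the book's preview URL. The URL below is a hypothetical example, assembled the same way `getInitialData` builds its initial request in GoBooDo.py:
~~~
https://books.google.co.in/books?id=XUwOtdcIWdkC&printsec=frontcover
~~~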
Configuration is done in settings.json; the fields are described below:
~~~
{
"country":"co.in", //The TLD for the service that is being used.
"proxy_links":0, //0 to not allow proxy while fetching page links when current ip is banned otherwise 1.
"proxy_images":0, //0 to not allow proxy while fetching page images when current ip is banned otherwise 1.
"country":"co.in", //The TLD for the service that is being used for example books.google.co.in or books.google.de
"proxy_links":0, //0 for disabling proxy when fetching page links upon reaching the limit.
"proxy_images":0, //0 for disabling proxy when fetching page images upon reaching the limit.
"max_retry_links":1, // max retries for fetching a link using proxies.
"max_retry_images":1 // max retries for a fetching a image using proxies.
}
~~~

-The output will be saved as a folder named the 'id' of the book which was given as input. The final PDF will be in the output folder inside it.
-Proxies may be added in proxies.txt (a sample proxy has been added already). After the program has finished running once, it will keep the state of the book saved and any subsequent runs will resume from the previous state of links and images.

+The output will be saved in a folder named after the 'id' of the book that was given as input. The final PDF will be in the output folder inside it, along with a folder containing the images.
+Proxies may be added in proxies.txt (a sample proxy has already been added).
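For reference, proxies.txt expects one `ip:port` entry per line, matching the sample entries added to proxies.txt later in this commit:
~~~
203.190.53.45:53927
85.90.215.111:3128
~~~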
# Dependencies
~~~
requests
@@ -48,6 +47,10 @@
fpdf
html5lib
~~~
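An aside: the names above are import/package names. Assuming a standard Python 3 setup, and assuming Pillow (which provides the `PIL` import used in storeImages.py) sits in the collapsed part of this list, they can typically be installed with:
~~~
pip install requests Pillow fpdf html5lib
~~~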

+# Features
+1. Stateful: GoBooDo keeps track of the pages it has already downloaded; on each subsequent run, only the links and images that were not fetched earlier are requested (see the sketch after this list).
+2. Proxy support: since Google limits the number of preview pages accessible to each individual, mainly on the basis of IP address, GoBooDo uses proxies to circumvent that limit and maximize the number of pages that can be accessed.
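A minimal sketch of the stateful resume in feature 1, assuming the pickle files seen in storeImages.py (pageLinkDict.pkl for all known page links, pagesFetched.pkl for completed pages); every other name here is illustrative:
~~~
import os
import pickle

def load_state(data_path, name):
    # Return previously saved progress, or an empty dict on a first run.
    path = os.path.join(data_path, name)
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    return {}

# The two files GoBooDo keeps under <book_id>/data (per storeImages.py).
page_links = load_state('XUwOtdcIWdkC/data', 'pageLinkDict.pkl')
fetched = load_state('XUwOtdcIWdkC/data', 'pagesFetched.pkl')

# Re-runs only touch what is still missing.
remaining = {k: v for k, v in page_links.items() if k not in fetched}
~~~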

# Todo
1. Add proxy integration with a checker.
2. Make the system more robust against detection by Google.
6 changes: 5 additions & 1 deletion proxies.txt
@@ -1 +1,5 @@
-94.20.21.37:3128
+203.190.53.45:53927
+85.90.215.111:3128
+46.130.117.84:8080
+1.10.189.156:34899
+115.124.86.107:37600
10 changes: 5 additions & 5 deletions settings.json
@@ -1,7 +1,7 @@
{
"country":"co.in",
"proxy_links":0,
"proxy_images":0,
"max_retry_links":1,
"max_retry_images":1
"country": "de",
"proxy_links": 0,
"proxy_images": 0,
"max_retry_links": 1,
"max_retry_images": 1
}
30 changes: 17 additions & 13 deletions storeImages.py
@@ -8,7 +8,7 @@
class StoreImages:

def __init__(self,bookpath,proxyflag):
-self.proxyflag = proxyflag
+self.proxyFlag = proxyflag
self.bookPath = bookpath
self.imagePath = os.path.join(self.bookPath,'Images')
self.pagesFetched = {}
@@ -21,7 +21,6 @@ def __init__(self,bookpath,proxyflag):
path = os.path.join(bookpath, 'data', "pagesFetched.pkl")
with open(path, 'rb') as ofile:
self.pagesFetched = pickle.load(ofile)
-
path = os.path.join(bookpath, 'data', "pageLinkDict.pkl")
with open(path, 'rb') as ofile:
allPages = pickle.load(ofile)
@@ -54,36 +53,41 @@ def resethead(self):

def getImages(self,retries):
self.resethead()
-for page_data in self.PageLinkDict.keys():
+for pageData in self.PageLinkDict.keys():
try:
-link = self.PageLinkDict[page_data]['src']
+link = self.PageLinkDict[pageData]['src']
if not link:
continue
-page_number = self.PageLinkDict[page_data]['order']
+pageNumber = self.PageLinkDict[pageData]['order']
checkIfPageFetched = retries
while checkIfPageFetched > 0:
-if checkIfPageFetched != retries and self.proxyflag :
+proxyFailed = False
+if checkIfPageFetched != retries and self.proxyFlag :
proxy = self.getProxy()
proxyDict = {
"http": 'http://' + proxy,
"https": 'https://' + proxy,
}
-print(f'Using {proxy} for the image of page {page_number}')
-pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+print(f'Using {proxy} for the image of page {pageNumber}')
+proxyFailed = False
+try:
+pageImage = requests.get(link + '&w=1500', headers=self.head,proxies=proxyDict,verify=False)
+except:
+proxyFailed = True
else:
pageImage = requests.get(link + '&w=1500', headers=self.head,verify=False)
-if len(pageImage.content) == 98670:
+if len(pageImage.content) == 98670 or proxyFailed:
self.resethead()
checkIfPageFetched -= 1
else:
checkIfPageFetched = -1
-print(f'Fetched image for page {page_number}')
-self.pagesFetched[page_data]=self.PageLinkDict[page_data]
+print(f'Fetched image for page {pageNumber}')
+self.pagesFetched[pageData]=self.PageLinkDict[pageData]
im = Image.open(BytesIO(pageImage.content))
-im.save(os.path.join(self.imagePath,str(page_number)+".png"))
+im.save(os.path.join(self.imagePath,str(pageNumber)+".png"))
else:
if(checkIfPageFetched==0):
print("Could not fetch the image of " + str(page_number))
print("Could not fetch the image for page " + str(pageNumber))
except Exception as e:
print(e)
with open(os.path.join(self.bookPath,'data','pagesFetched.pkl'),'wb+') as ofile:
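The heart of the "better exception handling" in this file, restated as a hedged standalone sketch: a failed proxied request is now caught and treated like the placeholder response, so the loop rotates identity and retries instead of crashing. The function name, the narrowed exception type, and the reading of 98670 bytes as the "page unavailable" placeholder image are assumptions:
~~~
import requests

def fetch_page_image(link, head, proxy_dict=None):
    # Sketch of the retry-relevant core of StoreImages.getImages.
    proxy_failed = False
    try:
        page_image = requests.get(link + '&w=1500', headers=head,
                                  proxies=proxy_dict, verify=False)
    except requests.RequestException:    # dead proxy, timeout, refused connection
        proxy_failed = True
        page_image = None
    # A 98670-byte body is taken to be the "page unavailable" placeholder image;
    # a failed proxy request is now handled the same way: signal a retry.
    if proxy_failed or len(page_image.content) == 98670:
        return None                      # caller decrements its retry counter
    return page_image.content
~~~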
