-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
103 lines (66 loc) · 2.95 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import logging
import requests
import bs4
import utils
# Configure the root logging handler once at import time so log records
# are emitted even if the importing application configures nothing itself.
logging.basicConfig()
# Module-wide logger. NOTE(review): the logger is named "mangastream" but
# the scraper below targets mangatown.com — possibly a leftover name.
logger = logging.getLogger("mangastream")
# Verbose by default so each page/image fetch is traced during a scrape.
logger.setLevel("DEBUG")
def get_url_image(soup_page):
    """Extract the page-image URL from a parsed reader page.

    Parameters
    ----------
    soup_page : bs4.BeautifulSoup
        Parsed HTML of a single manga reader page.

    Returns
    -------
    str or None
        Absolute URL of the page image, or ``None`` when the expected
        ``div.read_img`` / ``img`` structure (or its ``src``) is absent.
    """
    div_page = soup_page.find("div", attrs={"class": "read_img"})
    if not div_page:
        return None
    # Bug fix: the <img> tag or its "src" attribute can be missing on
    # malformed pages; previously this raised TypeError/KeyError instead
    # of signalling "no image" like the missing-div case does.
    img_tag = div_page.find("img")
    if not img_tag:
        return None
    url_image = img_tag.get("src")
    if not url_image:
        return None
    # Protocol-relative URLs ("//host/path") need an explicit scheme.
    if url_image.startswith("//"):
        url_image = "http:" + url_image
    return url_image
def get_url_next(soup_page):
    """Extract the next-page link from a parsed reader page.

    Parameters
    ----------
    soup_page : bs4.BeautifulSoup
        Parsed HTML of a single manga reader page.

    Returns
    -------
    str or None
        The ``href`` of the first anchor inside ``div.read_img``, or
        ``None`` when the div, the anchor, or its ``href`` is absent.
    """
    div_page = soup_page.find("div", attrs={"class": "read_img"})
    if not div_page:
        return None
    # Bug fix: the anchor or its "href" attribute can be missing;
    # previously this raised TypeError/KeyError instead of returning
    # None like the missing-div case does.
    a_tag = div_page.find("a")
    if not a_tag:
        return None
    return a_tag.get("href")
def get_chapter(url_chapter):
    """Download every page image of a chapter into an in-memory zip.

    Starting from ``url_chapter``, follows each page's next-link until the
    next-link no longer points at a ``.html`` page (i.e. we were handed the
    next chapter instead of the next page) or a page contains no image.

    Parameters
    ----------
    url_chapter : str
        URL of the first page of the chapter. May be site-relative
        (``/manga/...``) or protocol-relative (``//www...``).

    Returns
    -------
    utils.InMemoryZip
        Zip archive holding one image per page, named ``001.<ext>``,
        ``002.<ext>``, ... with the extension taken from the response's
        ``Content-Type`` mimetype.
    """
    logger.info("Downloading chapter under URL '{0}'".format(url_chapter))
    url_next = url_chapter
    # create an in-memory zip-file to store the retrieved images
    fzip = utils.InMemoryZip()
    counter_page = 1
    while True:
        logger.info("Getting page '{0}'".format(url_next))
        # Normalize site-relative and protocol-relative URLs to absolute.
        if url_next.startswith("/manga"):
            url_next = "https://www.mangatown.com" + url_next
        elif url_next.startswith("//www"):
            url_next = "https:" + url_next
        # get the page and parse it with `BeautifulSoup`
        # NOTE(review): verify=False disables TLS certificate validation;
        # presumably a workaround for the site's certificate chain —
        # confirm and remove if the site presents a valid certificate.
        response = requests.get(url_next, timeout=60, verify=False)
        soup_page = bs4.BeautifulSoup(response.content, "lxml")
        # retrieve the image URL
        url_image = get_url_image(soup_page=soup_page)
        # stop if no image was found
        if not url_image:
            break
        logger.info("Getting image '{0}'".format(url_image))
        # retrieve the actual image
        response_image = requests.get(url_image, timeout=60)
        # retrieve the `Content-Type` header-value and define the extension
        content_type = response_image.headers["Content-Type"]
        image_extension = content_type.split("/")[-1]
        # name the image through the counter and the mimetype-extension
        fname_image = "{0}.{1}".format(str(counter_page).zfill(3), image_extension)
        logger.info("Writing image '{0}'".format(fname_image))
        # write the retrieved image into the in-memory zip-file
        fzip = fzip.append(fname_image, response_image.content)
        # get the next URL
        url_next_candidate = get_url_next(soup_page=soup_page)
        logger.info("Evaluating URL '{0}' for continuation".format(url_next_candidate))
        # Bug fix: the candidate can be None (no anchor on the page);
        # previously None.endswith(...) raised AttributeError. A ".html"
        # candidate is the next page of this chapter; anything else means
        # we were redirected to the next chapter, so stop.
        if url_next_candidate and url_next_candidate.endswith(".html"):
            logger.info("URL '{0}' considered part of the chapter. Continuing".format(url_next_candidate))
            url_next = url_next_candidate
        else:
            logger.info("URLs '{0}' and '{1}' are too disimilar. Stopping".format(url_next, url_next_candidate))
            break
        counter_page += 1
    # return the in-memory zip-file
    return fzip