-
Notifications
You must be signed in to change notification settings - Fork 3
/
imdb.py
69 lines (55 loc) · 2.2 KB
/
imdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from acrawler import Crawler, Request, ParselItem, Handler, register, get_logger
class MovieItem(ParselItem):
    """Item describing one movie page, filled in from CSS selector rules.

    Each key of ``css`` is a field name on the item and each value is the
    rule used to extract that field from the selector the item is built with.
    """

    # NOTE(review): presumably asks the framework to log each parsed item;
    # confirm against the acrawler documentation.
    log = True
    css = {
        # Plain CSS rules.
        "date": ".subtext a[href*=releaseinfo]::text",
        "time": ".subtext time::text",
        "rating": "span[itemprop=ratingValue]::text",
        "rating_count": "span[itemprop=ratingCount]::text",
        "metascore": ".metacriticScore span::text",
        # When a rule is given as a list, the extra entries after the
        # selector are treated as field processor functions.
        "title": ["h1::text", str.strip],
        # The following four rules collect ALL matching values: unlike the
        # plain rules above, such a rule starts with "[" and ends with "]".
        "genres": "[.subtext a[href*=genres]::text]",
        "director": "[h4:contains(Director) ~ a[href*=name]::text]",
        "writers": "[h4:contains(Writer) ~ a[href*=name]::text]",
        "stars": "[h4:contains(Star) ~ a[href*=name]::text]",
    }
class IMDBCrawler(Crawler):
    """Crawl the IMDb "moviemeter" chart and emit a ``MovieItem`` per title.

    The crawl starts from the chart page, follows each title link found
    there, and builds one item from every followed page.
    """

    config = {"MAX_REQUESTS": 4, "DOWNLOAD_DELAY": 1}

    async def start_requests(self):
        """Yield the single seed request for the chart page."""
        chart_request = Request("https://www.imdb.com/chart/moviemeter")
        yield chart_request

    def parse(self, response):
        """Follow every title link in the chart table to ``parse_movie``."""
        title_link_rule = ".lister-list tr .titleColumn a::attr(href)"
        yield from response.follow(title_link_rule, callback=self.parse_movie)

    def parse_movie(self, response):
        """Yield a ``MovieItem`` for the page, tagged with its URL.

        Everything from the first ``?`` onwards is dropped from the URL
        before it is stored under the ``"url"`` extra key.
        """
        bare_url, _, _ = response.url_str.partition("?")
        yield MovieItem(response.sel, extra={"url": bare_url})
@register()
class HorrorHandler(Handler):
    """Handler for the ``"MovieItem"`` family that flags horror movies."""

    family = "MovieItem"
    logger = get_logger("horrorlog")

    async def handle_after(self, item):
        """Log a warning when the item's genres include ``"Horror"``.

        Items whose ``genres`` field is empty or missing a "Horror" entry
        are ignored.
        """
        genres = item["genres"]
        if not genres or "Horror" not in genres:
            return
        self.logger.warning(f"({item['title']}) is a horror movie!!!!")
@MovieItem.bind()
def process_time(value):
    """Convert a runtime string such as ``'3h 1min'`` into total minutes.

    This is a self-defined field processing function registered on
    ``MovieItem`` through ``MovieItem.bind()``.

    Args:
        value: Runtime text made of whitespace-separated segments, each
            ending in ``h`` (hours) or ``min`` (minutes), e.g.
            ``'3h 1min'``. Segments with any other ending are ignored.

    Returns:
        The total number of minutes as an ``int`` (``'3h 1min'`` -> ``181``).
        A falsy ``value`` (``None``, ``''``) is returned unchanged.

    Raises:
        ValueError: If an ``h``/``min`` segment does not carry an integer,
            e.g. ``'xmin'``.
    """
    if not value:
        return value

    total_minutes = 0
    # split() with no argument splits on any run of whitespace, so newlines,
    # tabs or padding around the segments (common in extracted element text)
    # no longer hide a segment such as "10min\n" from the suffix checks.
    for seg in value.split():
        if seg.endswith("min"):
            total_minutes += int(seg.replace("min", ""))
        elif seg.endswith("h"):
            total_minutes += 60 * int(seg.replace("h", ""))
    return total_minutes
if __name__ == "__main__":
    # Start the crawl only when this file is executed as a script.
    crawler = IMDBCrawler()
    crawler.run()