#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Luiz Felipe Fronchetti
# Special thanks to Luiz H. Susin, who contributed a similar implementation in Java.
import os
from datetime import datetime

import scrapy


class IssueSpider(scrapy.Spider):
    """
    Spiders are classes that define how a certain site (or group of sites) will be scraped: how to perform the crawl
    (i.e. follow links) and how to extract structured data from its pages (i.e. scrape items). In other words, a Spider
    is where you define the custom behaviour for crawling and parsing the pages of a particular site (or, in some
    cases, a group of sites).

    How to use:
        To collect the Rails project issues (pages one to ten), run from a terminal:
        $ scrapy runspider issue_crawler.py -a filename=rails.txt -a url=https://github.com/rails/rails -a firstpage=1 -a lastpage=10

    Attributes:
        name: A string that defines the name of this spider.
        allowed_domains: An optional list of strings containing the domains this spider is allowed to crawl.
        start_urls: A list of URLs where the spider will begin to crawl from.
        custom_settings: A dictionary of settings that override the project-wide configuration when running this spider.

    Methods:
        __init__: Initial method; receives the command-line arguments and instantiates the utility class.
        parse: Default callback used by Scrapy to process downloaded responses; yields a request for each issue listed on the issues table [1].
        parse_inside_issue: Extracts all the data, issue by issue [2].

    [1] Issues table page example: https://github.com/rails/rails/issues
    [2] Issue page example: https://github.com/rails/rails/issues/1

    * Read more about Scrapy Spiders at: http://doc.scrapy.org/en/latest/topics/spiders.html
    """

    name = 'IssueSpider'
    allowed_domains = ['github.com']

    custom_settings = {
        'DOWNLOAD_DELAY': 0.5,
        'CONCURRENT_REQUESTS': 1,
        'COOKIES_ENABLED': False,
        'ROBOTSTXT_OBEY': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True
    }

    def __init__(self, filename=None, url=None, firstpage=None, lastpage=None, *args, **kwargs):
        super(IssueSpider, self).__init__(*args, **kwargs)
        self.utils = Utils(filename)
        self.utils.write_header()
        self.start_urls = self.utils.define_start_urls(url, firstpage, lastpage)

    def parse(self, response):
        # Each row of the issues table contains a link to an individual issue page.
        for sel in response.xpath('//li//div[@class="d-table width-full Box-row--drag-hide lh-condensed"]//div[@class="d-table-cell width-full p-3"]'):
            url = sel.xpath('a/@href').extract()[0]
            url = 'https://github.com' + url
            yield scrapy.Request(url, callback=self.parse_inside_issue)

    def parse_inside_issue(self, response):
        # Extract the issue metadata (number, author, comment count, dates and labels) from the issue page.
        for sel in response.xpath('//div[@class="issues-listing"]'):
            number = sel.xpath('//div[@class="gh-header-show "]//span[@class="gh-header-number"]/text()').extract()
            header = sel.xpath('//div[@class="flex-table gh-header-meta"]//div[@class="flex-table-item flex-table-item-primary"]')
            comment = header.xpath('text()').extract()
            author = header.xpath('a[@class="author"]/text()').extract()
            open_date = header.xpath('relative-time/@datetime').extract()
            close_date = response.xpath('//div[@class="discussion-item discussion-item-closed"]//div[@class="discussion-item-header"]//relative-time[@class="timestamp"]/@datetime').extract()
            tags = response.xpath('//span[@class="timeline-comment-label"]/text()').extract()
            self.utils.write_issue(number, comment, author, open_date, close_date, tags)


class Utils():

    def __init__(self, file_name):
        self.file_destination = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)

    def define_start_urls(self, url, firstpage, lastpage):
        # Build one issues-list URL per page, e.g. https://github.com/rails/rails/issues?page=1&q=is%3Aissue
        start_urls = []
        for page_number in range(int(firstpage), int(lastpage) + 1):  # lastpage is inclusive, as in the docstring example
            start_urls.append(url + '/issues?page=' + str(page_number) + '&q=is%3Aissue')
        return start_urls

    def write_header(self):
        # CSV header; the column order matches what write_issue() appends.
        header_data = 'Identifier, Author, Author Tag, Number of Comments, Opening Date, Closing Date\n'
        with open(self.file_destination, 'w') as f:
            f.write(header_data)

    def write_issue(self, number, comment, author, open_date, close_date, tags):
        # The scraped fields arrive as lists of strings; keep only the relevant entries.
        number = number[0].strip()
        author = author[0].strip()
        comment = comment[3].strip()

        # GitHub exposes timestamps in ISO 8601 (e.g. 2011-04-28T21:00:45Z); reformat them as dd/mm/yyyy.
        open_time = datetime.strptime(open_date[0], '%Y-%m-%dT%H:%M:%SZ')
        open_date_format = open_time.strftime('%d/%m/%Y')

        # An issue may have been closed more than once (closed and reopened); join every closing date.
        if not close_date:
            close_date_format = 'None'
        else:
            close_date_format = ' - '.join(
                datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ').strftime('%d/%m/%Y') for date in close_date)

        # Deduplicate the issue labels while preserving their order.
        if not tags:
            tags_format = 'None'
        else:
            tags_format = []
            for tag in tags:
                tag = tag.strip()
                if tag not in tags_format:
                    tags_format.append(tag)

        with open(self.file_destination, 'a') as f:
            f.write(
                number + ', ' + author + ', ' + str(tags_format) + ', ' +
                comment + ', ' + open_date_format + ', ' + close_date_format + '\n')
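

# Optional usage sketch: besides the `scrapy runspider` command shown in the class
# docstring, the spider can also be launched programmatically through Scrapy's
# CrawlerProcess. The output filename, repository URL and page range below are
# illustrative placeholders.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess()
    process.crawl(
        IssueSpider,
        filename='rails.txt',  # CSV written next to this script by Utils
        url='https://github.com/rails/rails',
        firstpage='1',
        lastpage='10',
    )
    process.start()  # blocks until the crawl finishes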