"""
Title: SIH 2020 Scraper
File Name: main.py
Author: Daljeet Singh Chhabra
Language: Python
Date Created: 26-12-2019
Date Modified: 26-12-2019
##########################################################################################################
# Description:
# Main script file for scraping.
##########################################################################################################
"""
from http.cookiejar import LWPCookieJar
from bs4 import BeautifulSoup
import mechanize
import csv
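# Dependencies: mechanize provides a stateful browser session (cookies,
# redirects, headers) and beautifulsoup4 parses the fetched HTML; both
# are installable via pip.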
# BASE URL
URL = 'https://www.sih.gov.in/sih2020PS?page='
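# Helper (added for readability): the site pads cell text with newlines and
# tabs, and the same three-step cleanup was repeated for every field below,
# so it is factored out here.
def clean(text):
    """Strip newlines, tabs, and surrounding spaces from scraped text."""
    return text.replace('\n', '').replace('\t', '').strip(' ')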
# Initializing the browser
br = mechanize.Browser()
# Setting Cookie Jar
cj = LWPCookieJar()
br.set_cookiejar(cj)
# Browser options
br.set_handle_equiv(True)
br.set_handle_gzip(True)
br.set_handle_redirect(True)
br.set_handle_referer(True)
br.set_handle_robots(False)
# Follow refresh 0 but don't hang on refresh > 0
br.set_handle_refresh(mechanize.HTTPRefreshProcessor(), max_time=1)
# User-Agent (this is cheating, ok?)
br.addheaders = [('User-agent',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 '
                  'Firefox/3.0.1')]
PS = []
pages = []
# Discover how many result pages exist by reading the pagination links
br.open('https://www.sih.gov.in/sih2020PS')
for link in br.links():
    if link.text.isnumeric():
        pages.append(int(link.text))
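# Defensive guard (not in the original): if the pagination markup changes and
# no numeric links are found, fall back to a single page instead of letting
# max() raise ValueError on an empty list.
if not pages:
    pages = [1]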
total_pages = max(pages)
for page in range(1, total_pages + 1):
    res = br.open(URL + str(page))
    # Name the parser explicitly so BeautifulSoup doesn't warn about guessing
    soup = BeautifulSoup(res.read(), 'html.parser')
    table = soup.find("table")
    # Extracting useful information
    trs = table.findAll('tr')
    # Each problem statement occupies a block of 7 <tr> rows, so step by 7
    for i in range(1, len(trs), 7):
        row = trs[i]
        col = row.findAll('td')
        if len(col) == 0:
            continue
        desc_modal = []
        for des in col[2].findAll('tr'):
            des_col = des.findAll('td')
            desc_modal.append(des_col[0].text)
        organization = col[1].string
        problem_title = clean(col[2].text.split(sep='\n')[0])
        problem_description = clean(desc_modal[0])
        category = desc_modal[2]
        domain_bucket = desc_modal[3]
        youtube_link = clean(desc_modal[4])
        dataset_link = clean(desc_modal[5])
        ps_number = col[10].string
        data = {
            'organization': organization,
            'problem_title': problem_title,
            'problem_description': problem_description,
            'category': category,
            'domain_bucket': domain_bucket,
            'youtube_link': youtube_link,
            'dataset_link': dataset_link,
            'ps_number': ps_number
        }
        PS.append(data)
        # print(data)
# print(PS)
csv_file = "SIH 2020 PS.csv"
try:
    # newline='' avoids blank rows on Windows; utf-8 handles non-ASCII text
    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=PS[0].keys())
        writer.writeheader()
        for data in PS:
            writer.writerow(data)
except IOError as err:
    print("I/O error:", err)