-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathWeb Scraping from BreakoutPoint.py
127 lines (110 loc) · 5.95 KB
/
Web Scraping from BreakoutPoint.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import datetime
import json
import os
import time

import requests
from lxml import html
# One HTTP session shared by login() and get_data() so cookies set during
# login are sent with the later data requests.
session = requests.Session()

# Accumulated results. The four lists are filled in lockstep by get_data():
# index i across all of them describes the same record.
Symbol_arr = []
Company_arr = []
ShortActivist_arr = []
ReleaseDate_arr = []

# Browser-like request headers sent with every request made by this script.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'https://breakoutpoint.com/',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
def login():
    """Log the shared ``session`` in to breakoutpoint.com.

    Fetches the login page to obtain the ``csrftoken`` cookie, posts the
    credentials together with that token, then extracts the
    ``csrfmiddlewaretoken`` form value from the response page and stores it
    in the module-level ``csrfmiddlewaretoken`` global for later requests.

    Credentials are taken from the ``BREAKOUTPOINT_LOGIN`` and
    ``BREAKOUTPOINT_PASSWORD`` environment variables when they are set;
    otherwise the values previously embedded in this script are used.

    Raises:
        requests.HTTPError: if either request returns an error status.
        RuntimeError: if the CSRF cookie or the CSRF form field is missing.
    """
    global csrfmiddlewaretoken

    login_url = 'https://breakoutpoint.com/accounts/login/'
    # Without a timeout, requests waits indefinitely on a stalled connection.
    timeout_seconds = 30

    response = session.get(login_url, headers=headers, timeout=timeout_seconds)
    response.raise_for_status()
    csrftoken = response.cookies.get_dict().get('csrftoken')
    if csrftoken is None:
        raise RuntimeError(
            f'No csrftoken cookie was set by {login_url}; cannot log in.')

    # SECURITY: credentials should come from the environment, not the source
    # tree. The literals are kept only as a backward-compatible fallback.
    # NOTE(review): the fallback login value looks like an obfuscated
    # placeholder rather than a real address - confirm before relying on it.
    data = {
        'csrfmiddlewaretoken': csrftoken,
        'login': os.environ.get('BREAKOUTPOINT_LOGIN', '[email protected]'),
        'password': os.environ.get('BREAKOUTPOINT_PASSWORD', 'Christophe24!'),
    }
    response = session.post(
        login_url, headers=headers, data=data, timeout=timeout_seconds)
    response.raise_for_status()

    doc = html.fromstring(response.text)
    tokens = doc.xpath('//*[@name="csrfmiddlewaretoken"]/@value')
    if not tokens:
        raise RuntimeError(
            'No csrfmiddlewaretoken field found in the page returned after '
            'the login POST.')
    csrfmiddlewaretoken = tokens[0].strip()
# Columns requested from the listing endpoint, in request order:
# (index of the value inside each returned row, column name).
# The trailing space in 'percent_change_daily ' is preserved from the
# original request payload.
_ASSC_COLUMNS = (
    ('13', 'symbol'),
    ('1', 'issuer'),
    ('0', 'activist'),
    ('3', 'release_date'),
    ('10', 'campaign_return'),
    ('16', 'percent_change_daily '),
    ('21', 'country'),
    ('24', 'region'),
    ('25', 'marketcapinitial'),
)
_ASSC_URL = 'https://breakoutpoint.com/activists-shorts/assc/'
_ASSC_PAGE_SIZE = 20
_ASSC_TIMEOUT_SECONDS = 30


def _build_assc_payload(start, token):
    """Return the form payload requesting one page of rows beginning at *start*."""
    payload = {}
    for position, (data_index, name) in enumerate(_ASSC_COLUMNS):
        prefix = f'columns[{position}]'
        payload[f'{prefix}[data]'] = data_index
        payload[f'{prefix}[name]'] = name
        payload[f'{prefix}[searchable]'] = 'true'
        payload[f'{prefix}[orderable]'] = 'true'
        payload[f'{prefix}[search][value]'] = ''
        payload[f'{prefix}[search][regex]'] = 'false'
    payload.update({
        'order[0][column]': '3',
        'order[0][dir]': 'desc',
        'start': str(start),
        'length': str(_ASSC_PAGE_SIZE),
        'search[value]': '',
        'search[regex]': 'false',
        'active_closed': 'L',
        'csrfmiddlewaretoken': token,
    })
    return payload


def _text_field(row, index):
    """Return ``row[index]`` stripped of whitespace, or '' if it is missing or not text."""
    try:
        return row[index].strip()
    except (IndexError, KeyError, TypeError, AttributeError):
        return ''


def get_data():
    """Fetch every page of the listing and prepend unseen records to the globals.

    Pages are requested 20 rows at a time until the endpoint returns no
    ``data`` rows. From each row the symbol (index 13), release date
    (index 3), company (index 1) and short activist (index 0) are taken;
    rows where all four are empty are skipped.

    A record is added only if the same (symbol, release date, company,
    activist) combination is not already present in the module-level lists,
    so repeated polling does not pile up duplicates. New records are placed
    in front of the existing ones, in the order the endpoint returned them.

    Raises:
        ValueError: if a response body is not valid JSON (for example an
            HTML page returned after the session has expired).
    """
    global Symbol_arr
    global Company_arr
    global ShortActivist_arr
    global ReleaseDate_arr

    # Seed the seen-set with what earlier calls already collected; without
    # this every poll would re-add the whole dataset.
    seen = set(zip(Symbol_arr, ReleaseDate_arr, Company_arr, ShortActivist_arr))
    new_symbols = []
    new_companies = []
    new_activists = []
    new_release_dates = []

    start = 0
    while True:
        response = session.post(
            _ASSC_URL,
            headers=headers,
            data=_build_assc_payload(start, csrfmiddlewaretoken),
            timeout=_ASSC_TIMEOUT_SECONDS,
        )
        try:
            content = json.loads(response.text)
        except ValueError as err:
            raise ValueError(
                f'Expected JSON from {_ASSC_URL} but got a non-JSON body '
                f'(HTTP {response.status_code}); the session may have expired.'
            ) from err

        rows = content.get('data') if isinstance(content, dict) else None
        if not rows:
            break

        for row in rows:
            symbol = _text_field(row, 13)
            release_date = _text_field(row, 3)
            company = _text_field(row, 1)
            activist = _text_field(row, 0)
            record = (symbol, release_date, company, activist)
            if not any(record):
                continue
            if record in seen:
                continue
            seen.add(record)
            new_symbols.append(symbol)
            new_companies.append(company)
            new_activists.append(activist)
            new_release_dates.append(release_date)

        start += _ASSC_PAGE_SIZE

    Symbol_arr = new_symbols + Symbol_arr
    Company_arr = new_companies + Company_arr
    ShortActivist_arr = new_activists + ShortActivist_arr
    ReleaseDate_arr = new_release_dates + ReleaseDate_arr
if __name__ == '__main__':

    def _run_cycle():
        """Run one fetch and print the four accumulated lists with timestamps."""
        print(f'Start: {datetime.datetime.now()}')
        get_data()
        # get_data() rebinds the module-level lists, so they are read here,
        # after the call, to show the current contents.
        print(f'Symbol_arr = \n{Symbol_arr}')
        print(f'Company_arr = \n{Company_arr}')
        print(f'ShortActivist_arr = \n{ShortActivist_arr}')
        print(f'ReleaseDate_arr = \n{ReleaseDate_arr}')
        print(f'End: {datetime.datetime.now()}')

    # Log in once, then poll forever with a 60-second pause between cycles.
    login()
    while True:
        _run_cycle()
        time.sleep(60)