-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocess.py
255 lines (221 loc) · 10.2 KB
/
process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import pandas as pd
import numpy as np
import config
import warnings
from datetime import datetime as dt
from datetime import timedelta as delta
import plotly.express as px
warnings.filterwarnings("ignore")
def clean_marketcap_price(data='market'):
"""
Parameter:
data: 'market' or 'price'
"""
#Extract the Symbol in order to set the columns name
symbol_name = [
str(name) for name in list(pd.read_csv(config.MARKETCAP)['Symbol'])
]
if data == 'market':
#Load data and drop useless columns and rows
df = pd.read_csv(config.MARKETCAP,skiprows=3).iloc[:,4:]
elif data == 'price':
#Load data and drop useless columns and rows
df = pd.read_csv(config.PRICE,skiprows=3).iloc[:,4:]
else:
return 'Dataset not found.'
#Delete residual rows
df = df.loc[df['Name'].notnull()]
#Convert data of time to datetime
df['Name'] = pd.to_datetime(df['Name'])
#Rename the columns of date
df.rename(columns={'Name':'Date'},inplace=True)
#Set Date column as index
df.set_index('Date',inplace=True)
#Shortcut the name of columns
df.columns = symbol_name
# Drop "Error" columns
df = df.select_dtypes(exclude='object')
#Delete useless columns
to_drop = [col for col in df.columns if len(df[col].unique()) == 1]
df.drop(to_drop,axis=1,inplace=True)
#Get the name of columns that have missing values
missing_col = {}
col_countnull = df.isnull().sum().reset_index()
for i in range(len(col_countnull)):
if col_countnull.loc[i,0] != 0:
missing_col[col_countnull.loc[i,'index']] = col_countnull.loc[i,0]
#Fill the missing values
for col in missing_col:
#Because the missing value is only located in the left tail, it implies that the company was not listed on those days.
df[col].fillna(value=0,inplace=True)
return df
def full_marketcap_price(data='market'):
"""
Return dataframe of companies with price/marketcap information.
Parameter:
data: 'market' or 'price'
"""
if (data == 'market') or (data == 'price'):
company = pd.read_csv(config.COMPANY)
#Choose columns to merge and rename them
to_merge = company[['Symbol','Name','NAME','COMPANY NAME','RIC','ICB INDUSTRY NAME']]
to_merge.columns = ['Symbol','Name','Status Name','Company','RIC','Industry']
#Fill in the missing values
to_merge.fillna(value='Unknown',inplace=True)
to_merge['RIC'].replace('-','Unknown',inplace=True)
#Load data cleaned
main = clean_marketcap_price(data).reset_index()
main['Date'] = main['Date'].astype('string')
main = main.transpose()
main.reset_index(inplace=True)
main.columns = main.iloc[0]
main = main.iloc[1:].reset_index(drop=True)
main.rename(columns={'Date':'Symbol'},inplace=True)
#Merge the dataset
df = to_merge.merge(main,how='right',on='Symbol')
#Correct datatype of df
for col in df.columns[6:]:
df[col] = df[col].astype('float64')
#Delete Exchange-traded fund and company was not listed in Singagore
df = df[~df['Name'].str.contains("\\(SES")]
df = df[~df['Name'].str.contains("ETF")]
df.reset_index(drop=True,inplace=True)
#Correct data of Delist or Suspended company
delist_time = [name.split('.')[-1] for name in df.loc[df['Status Name'].str.contains('DELIST')]['Status Name']]
delist_time = [date_available(dt.strftime(dt.strptime(i,'%d/%m/%y'),'%Y-%m-%d')) for i in delist_time]
delist_code = [code for code in df.loc[df['Status Name'].str.contains('DELIST')]['Symbol']]
delist = {delist_code[i]:delist_time[i] for i in range(len(delist_time))}
for symbol,date in delist.items():
df.loc[df['Symbol'] == symbol,date:] = 0
susp_time = [name.split('.')[-1] for name in df.loc[df['Status Name'].str.contains('SUSP')]['Status Name']]
susp_time = [date_available(dt.strftime(dt.strptime(i,'%d/%m/%y'),'%Y-%m-%d')) for i in susp_time]
susp_code = [code for code in df.loc[df['Status Name'].str.contains('SUSP')]['Symbol']]
susp = {susp_code[i]:susp_time[i] for i in range(len(susp_time))}
for symbol,date in susp.items():
df.loc[df['Symbol'] == symbol,date:] = 0
#Remove duplicated companies
company_counts = df[df['Company'] != 'Unknown']['Company'].value_counts().reset_index()
company_dup = []
for i in range(len(company_counts)):
if company_counts['Company'][i] > 1:
company_dup.append(company_counts['index'][i])
for i in company_dup:
company_drop = list(df[df['Company'] == i][['Symbol','Company']].index)[1:]
df.drop(company_drop,inplace=True)
df.reset_index(drop=True,inplace=True)
else:
return 'Dataset not found.'
return df
def date_available(to_convert):
if dt.strptime(to_convert,'%Y-%m-%d').weekday() == 6: #Sunday
sub = dt.strptime(to_convert,'%Y-%m-%d') - delta(days=2) #Convert to the closet Friday
to_convert = dt.strftime(sub,'%Y-%m-%d')
elif dt.strptime(to_convert,'%Y-%m-%d').weekday() == 5: #Saturday
sub = dt.strptime(to_convert,'%Y-%m-%d') - delta(days=1) #Convert to the closet Friday
to_convert = dt.strftime(sub,'%Y-%m-%d')
else:
return to_convert
return to_convert
def profit_comparision(data,before='2021-01-01',after='2021-09-24',top=10,desc=True):
"""
Return dataframe of top companies with the highest or lowest profit in a specified time range.
Parameters:
data: dataframe from full_marketcap_price function
before: 'YYYY-MM-DD' format
after: 'YYYY-MM-DD' format
top: The number of top companies is returned
desc: Return dataframe in descending order if True, return dataframe in ascending order if False
"""
before = date_available(before)
after = date_available(after)
df_price = data
df = df_price[
(df_price[before]!=0) & (df_price[after]!=0)
][['Symbol','Name','Company','Industry',before,after]]
df['Profit'] = (df[after] - df[before])/df[before] *100
if desc == True:
df = df.sort_values('Profit',ascending=False).head(top).reset_index(drop=True)
else:
df = df.sort_values('Profit',ascending=True).head(top).reset_index(drop=True)
return df
def top_industry_marketcap(data, industry='Financials', date='2021-09-24',top=5,desc=True,get_all=False):
"""
Return top companies with highest or lowest market cap in specified time group by industry.
Parameters:
data: dataframe from full_marketcap_price function
industry: Return dataframe filtered by industry. If industry = "all", return the whole industry
date: 'YYYY-MM-DD' format
top: The number of top companies is returned
desc: Return dataframe in descending order if True, return dataframe in ascending order if False
get_all: Return the whole companies with market cap in specified time group by industry
"""
date = date_available(date)
df_market = data
df_market = df_market[df_market[date] != 0]
if get_all == False:
if desc == True:
order = False
else:
order = True
industry_info = [
df_market[
df_market['Industry']==industry][['Symbol','Name','Company','Industry',date]]\
.sort_values(by=date,ascending=order).head(top)\
for industry in df_market['Industry'].unique()
]
df = pd.concat(industry_info).reset_index(drop=True)
else:
df = df_market[['Symbol','Name','Company','Industry',date]].sort_values(by='Industry').reset_index(drop=True)
df.rename(columns={date:'MarketCap'},inplace=True)
if industry == 'all':
return df
return df[df['Industry'] == industry]
def greater_price(data,week=52,date='2021-09-24'):
"""
Return dataframe of companies have price above the specified week mark
Parameters:
data: dataframe from full_marketcap_price function
week: Number of weeks is selected as the mark
date: 'YYYY-MM-DD' format, date is selected for comparision
"""
from datetime import datetime as dt
from datetime import timedelta as delta
date = date_available(date)
df_price = data
df_price = df_price[df_price[date] != 0]
compare_tail = dt.strftime((dt.strptime(date,'%Y-%m-%d') - delta(days=1)),'%Y-%m-%d')
compare_head = dt.strftime((dt.strptime(compare_tail,'%Y-%m-%d') - delta(weeks=week)),'%Y-%m-%d')
tail_available = date_available(compare_tail)
head_available = date_available(compare_head)
df = df_price.loc[:,head_available:tail_available]
df = df.join(df_price[date])
df['Max Price'] = df.loc[:,head_available:tail_available].max(axis=1)
df = df_price[['Company','Symbol']].join(df)
df = df[(df[date] > df['Max Price'])].reset_index(drop=True)
df.set_index('Company',inplace=True)
return df
def market_overview_plot(data,date='2021-09-24'):
df = top_industry_marketcap(data,industry='all',date=date,get_all=True)[['Industry','Symbol','MarketCap']]
tree_fig = px.treemap(
df,path=['Industry','Symbol'],
values = 'MarketCap',
color = 'MarketCap',
color_continuous_scale='Mint',
title = 'Market Capialization Overview'
)
return tree_fig
if __name__ == "__main__":
data_market = full_marketcap_price(data='market')
data_price = full_marketcap_price(data='price')
##Question 1:
#print(profit_comparision(data=data_price,before='2021-01-01',after='2021-09-24',top=10, desc= True))
##Question 2:
# print(profit_comparision(data=data_price,before='2021-01-01',after='2021-09-24',top=10,desc=False))
##Question 3:
# print(greater_price(data=data_price,week=52,date='2021-09-24'))
##Question 4:
# print(top_industry_marketcap(data=data_market,date='2021-09-24',top=5,desc=True,industry='all'))
##Question 5:
# print(top_industry_marketcap(data=data_market,date='2021-09-24',get_all=True,industry='all'))
# overview_plot = market_overview_plot(data_market,date='2021-09-24')
# overview_plot.show()