-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalyzerdb.py
161 lines (133 loc) · 6.72 KB
/
analyzerdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import openpyxl
import re
import os
from fuzzywuzzy import fuzz
import logging
import datetime
import sqlite3
logger = logging.getLogger()
logger.setLevel(logging.WARNING)
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
conn = sqlite3.connect('secfinance.sqlite')
curr = conn.cursor()
curr.executescript('''
DROP TABLE IF EXISTS company;
DROP TABLE IF EXISTS valuetype;
DROP TABLE IF EXISTS summary;
DROP TABLE IF EXISTS dollardenomination;
CREATE TABLE company (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE valuetype (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
name TEXT UNIQUE
);
CREATE TABLE dollardenomination (
id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
denomination INTEGER UNIQUE);
CREATE TABLE summary (
company_id INTEGER,
valuetype_id INTEGER,
date TEXT,
dollaramt INTEGER,
dollardenomination_id INTEGER)
''')
def xlparse(tickerin,filepath,classname,datein):
wb = openpyxl.load_workbook(filepath)
sheetnames = wb.get_sheet_names() #getting list of all sheetnames
#section for dealing with sheet 1 which is consolidated statement of income
sheet = wb.get_sheet_by_name(sheetnames[1])
cola = [] #create new list to store all values in column A
colb = []
for cellobj in sheet.columns[0]: #iterate through all values where column0*header column is true..
cola.append(cellobj.value) #append them to list of columna so we can extract info out of it..
for cellobj in sheet.columns[1]: #iterate through all values in columnB and store in colb
colb.append(cellobj.value)
#extracting information on millions vs thousands from columnArowA
#format 'Consolidated Condensed Statements of Income - USD ($) shares in Millions, $ in Millions'
sharedeno = re.findall('shares in ([a-zA-Z]+)',cola[0])
dollardeno = re.findall('\$ in ([a-zA-Z]+)',cola[0]) #note the use of special character '\' to match for $ vs match at end of line..
doldeno = 1000000
#if dollardeno == None:
# doldeno = 1000000
# if regular expression returns None assuming default of one million for now
#elif dollardeno[0] == 'Thousands':
# doldeno = 1000
#elif dollardeno[0] == 'Millions':
# doldeno = 1000000
#else:
# logger.error('Unknown denomination for dollar value')
#print sharedeno[0],dollardeno[0],doldeno #regular expression returns a list
#netrevenue -cost ofsales = grossmargin
#grossmargin - operating expenses = operating income
#operating expenses = randd + marketing + restructuring + amoritization
#operating income - gains(losses) on equity investments + iterests and other... = income before taxes
#income before taxes - provision for taxes = net incomde
#netincome/basic shares = basic earnings per share
#netincome/diluted shares = diluted earnings per share of commone stock
netrevenuelist = ['netrevenue','Net sales','Netsales','netsales','Net revenue'] #we will want to keep expanding this list based on how many different variants from different companies
netincomelist = ['netincome','Net income'] #we will want to keep expanding this list based on how many different variants from different companies
listoflists = [netrevenuelist,netincomelist] #make this list as long as the number of entries we want to extract and store in db
logger.debug('%s,%s',cola,colb)
for name, value in zip(cola,
colb): # looping through both cola and b at the same time .. maybe not efficient time wise and better to use index ?# ?
#extractandstore(name,value,netrevenuelist,tickerin,datein,doldeno)
for listentry in listoflists:
for entry in listentry:
if fuzz.ratio(entry, name) > 70:
# TODO Add a check for blank cells
logger.info('%s,%s', name, fuzz.ratio(entry, name))
curr.execute('''INSERT OR IGNORE INTO company (name)
VALUES(?)''', (tickerin,))
curr.execute('''SELECT id FROM company WHERE name = ?''', (tickerin,))
company_id = curr.fetchone()[0]
curr.execute(''' INSERT OR IGNORE INTO valuetype (name)
VALUES(?)''', (listentry[0],)) #first entry of list will have the name we will use for column type
curr.execute('''SELECT id FROM valuetype WHERE name = ?''', (listentry[0],))
valuetype_id = curr.fetchone()[0]
# denomination table
curr.execute('''INSERT OR IGNORE INTO dollardenomination(denomination)
VALUES(?)''', (doldeno,))
curr.execute('''SELECT id FROM dollardenomination WHERE denomination = ?''', (doldeno,))
dollardenomination_id = curr.fetchone()[
0] # fetching first column since that is the id that can be used as refernece in main table
curr.execute('''INSERT OR IGNORE INTO summary (company_id, valuetype_id, date, dollaramt, dollardenomination_id)
VALUES(?,?,?,?,?)''', (company_id, valuetype_id, datein, value, dollardenomination_id))
break
# TODO Add a check for blank cells
conn.commit()
return
# Main Function
ticker = raw_input("Enter stock ticker")
datelist = []
# parse all files and create a class for each one of those with data
currentpath = os.getcwd() #getting current directory of .py script
newpath = currentpath + '\\Tickers\\' + ticker.upper() #planning to create new directory with ticker name in upper case
#if os.path.exists(newpath): #check on if path alrerady exists
for root,dirs,files in os.walk(newpath): #walk returns root path, directories and then the file names
for name in files:
#print name
filepath = (os.path.join(root, name))
filename = name.rstrip('.xls') #removing the .xls extension
date = re.findall(('\d{4}-\d{2}-\d{2}'), filename)
datet = datetime.datetime.strptime(date[0], '%Y-%m-%d')
datelist.append(datet)
logger.debug('%s %s',filepath,filename)
#filepath = 'C:\Users\Aravind\Dropbox\Learning\Programming\Python\Python Fun\SEC_10k_q\AAPL_10-K_2015-10-28.xlsx'
print filename
xlparse(ticker, filepath, filename, date[0])
#TODO understand file error with older xls files
#expand to other rows in income sheet
#start plotting
#list = []
#print date2num(datelist)
##for (date,value) in datelist:
## x = [date2num(datelist)]
## print x
#for entry in classlist:
# list.append(entry.netincome)
#for x,y in zip(datelist,list):
# print x,y
#plt.bar(datelist,list,color="red",linestyle='-',linewidth=5)
#plt.show()