-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsec_download.py
130 lines (106 loc) · 5.68 KB
/
sec_download.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
# Download and analyze 10Q/10k filings form different companies
#from SECEdgar.crawler import SecCrawler
#seccrawler = SecCrawler()
#seccrawler.filing_10K('AAPL','0000320193','200010101','10')
import requests
import re
import datetime
import logging
import os
# Root logger at DEBUG with timestamped, levelled console output.
logger = logging.getLogger()
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
logger.setLevel(logging.DEBUG)
from bs4 import BeautifulSoup
def downloadfile(ftpfolder, ftpfiletype, localname):
    """Download the first Excel workbook found in an EDGAR FTP folder.

    ftpfolder   -- path on ftp.sec.gov to change into (e.g. '/50863/000005086315000072')
    ftpfiletype -- only 'Excel' is supported; anything else downloads nothing
    localname   -- local file path WITHOUT extension; '.xlsx' or '.xls' is appended

    Prefers .xlsx over .xls.  Errors (no matching file, FTP failure) are
    logged rather than raised so a batch of downloads can continue past a
    single bad filing.  Returns None.

    NOTE(review): SEC retired ftp.sec.gov; confirm this host still works or
    migrate to the HTTPS archive URLs.
    """
    from ftplib import FTP, all_errors

    ftp = FTP('ftp.sec.gov')
    ftp.login()
    try:
        logger.info('Changing to ftp path %s', ftpfolder)
        ftp.cwd(ftpfolder)
        ftpfilename = None
        localext = ''
        if ftpfiletype == 'Excel':
            for remote in ftp.nlst():
                # test 'xlsx' first -- the plain 'xls' test would also match it
                if re.findall('xlsx', remote):
                    localext = '.xlsx'
                    ftpfilename = remote
                    break
                elif re.findall('xls', remote):
                    localext = '.xls'
                    ftpfilename = remote
                    break
        if ftpfilename is None:
            # The original fell through with a dummy name and an unbound
            # extension (NameError); skip cleanly instead.
            logger.error('No %s file found in FTP path %s', ftpfiletype, ftpfolder)
            return
        # TODO: generate folder structure based on company form type and append
        # form name and year/qtr to the local file name.
        with open(localname + localext, 'wb') as localfile:
            try:
                ftp.retrbinary('RETR ' + ftpfilename, localfile.write, 1024)
            except all_errors:
                logger.exception('File error: file %s could not be retrieved from FTP path %s',
                                 ftpfilename, ftpfolder)
    finally:
        ftp.quit()
# TODO
# Write function to find subfolder on the fly..Done
# Get cik or company name on the fly and match to cik ..DONE
# Write a function that can retrieve all the filenames from ftp path
# Modify the download function so that it can download all Excel(xlsx or xls ) or text documents or all documents (you will have to do this one by one..)
# Extract ftp folder path/cik everything straigh from the search results and extracting from the table for results....
# https://www.sec.gov/cgi-bin/browse-edgar?CIK=intc&owner=exclude&action=getcompany&Find=Search
#https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000050863&type=10-q&dateb=20010101&owner=exclude&count=100
def folderlookup(ticker, formtype, date, count):
    """Query EDGAR company search and describe the matching filings.

    ticker   -- ticker symbol or CIK accepted by EDGAR's CIK= parameter
    formtype -- form type filter, e.g. '10-q' or '10-k'
    date     -- 'dateb' upper bound on filing date, YYYYMMDD string
    count    -- maximum number of results, as a string (spliced into the URL)

    Returns (formtypes, filedates, ftp_paths): the form type and filing date
    per result row, plus the '/<cik>/<accession>' fragments extracted from
    the document links (a row whose link does not match the pattern
    contributes no path, so ftp_paths can be shorter than the other lists).
    Network failures raise requests' exceptions -- the original swallowed
    them with a bare except and then crashed on the undefined response.
    """
    newurl = ('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=' + ticker
              + '&type=' + formtype + '&dateb=' + date
              + '&owner=exclude&count=' + count)
    logger.info('Fetching %s', newurl)
    page1 = requests.get(newurl)
    soup = BeautifulSoup(page1.text, "lxml")  # lxml parser per bs4 recommendation
    results = soup.find('table', summary='Results')
    ftp_path = []
    filedate = []
    formtypes = []
    if results is None:
        # Unknown ticker or no filings: no results table on the page.
        return formtypes, filedate, ftp_path
    for row in results.find_all('tr'):
        td = row.find_all('td')
        if not td:  # header row uses <th>, not <td> -- skip it
            continue
        formtypes.append(td[0].text)
        raw_loc = td[1].a["href"]  # bs4 attribute access on the first <a>
        # Extract '/<cik>/<accession>' from the documents link.
        extract_loc = re.findall('/edgar/data(/[0-9]+/[0-9]+)', raw_loc)
        if extract_loc:  # TODO: handle rows whose link lacks the ftp path
            ftp_path.append(extract_loc[0])
        filedate.append(td[3].text)
    return formtypes, filedate, ftp_path
def ciklookup(ticker):
    """Resolve a ticker symbol to its SEC CIK number via EDGAR company search.

    Returns the first CIK found on the results page, as a string of digits.
    Raises ValueError when no CIK appears in the response (unknown ticker)
    instead of the original's opaque IndexError.
    """
    URL = ('https://www.sec.gov/cgi-bin/browse-edgar?CIK=' + ticker
           + '&owner=exclude&action=getcompany&Find=Search')
    page = requests.get(URL)
    ciks = re.findall('CIK=([0-9]+)', page.text)
    if not ciks:
        raise ValueError('No CIK found for ticker %r' % ticker)
    return ciks[0]  # first match is the company's own CIK
def generate_ftp_folder(cik='', folder='000005086315000072'):
    """Build the '/<cik>/<accession>' path used to traverse the SEC FTP site.

    cik    -- CIK with leading zeros stripped (see stripzero)
    folder -- accession number folder; defaults to the sample value the
              original hard-coded (its TODO asked for this to be generic)

    The original read a module global ``cik`` that is never defined anywhere
    in the file, so every call raised NameError; both values are now explicit
    parameters.
    """
    return '/' + cik + '/' + folder
def stripzero(cikzero):
    """Strip the leading zeros from a zero-padded CIK string.

    '0000050863' -> '50863'.  Unlike the original regex ('[1-9][0-9]+',
    which needs at least two digits and raised IndexError on no match),
    this also handles single-digit CIKs and an all-zero input.
    """
    stripped = cikzero.lstrip('0')
    return stripped if stripped else '0'
# ---------------------------------------------------------------------------
# Interactive entry point: prompt for ticker/form/count, look the filings up
# on EDGAR, then download each filing's Excel workbook to ./Tickers/<TICKER>.
# ---------------------------------------------------------------------------
try:  # Python 2/3 compatible prompt (raw_input was removed in Python 3)
    _prompt = raw_input
except NameError:
    _prompt = input

ticker = _prompt('Enter the Stock ticker:')
form = _prompt('Enter Form Type eg. 10-q /10-k...:')
totalcount = _prompt('Count of forms to download:')
today = datetime.date.today()
# strftime zero-pads, unlike str(month)+str(day): 2023-01-05 -> '20230105',
# where the original produced the malformed dateb '202315'.
todaysdate = today.strftime('%Y%m%d')
priortodate = todaysdate
# form type, file date, ftp navigation path per filing
form, dates, path = folderlookup(ticker, form, priortodate, totalcount)
filename = 'Excel'  # file type passed through to downloadfile
print('----------------------------------')
for i in range(len(form)):
    logger.info('%s %s %s', form[i], dates[i], path[i])
    if form[i] == '10-K/A':
        form[i] = '10-K_A'  # '/' breaks Windows file naming; use underscore
    currentpath = os.getcwd()  # directory the script runs from
    # os.path.join is portable; the original hard-coded Windows '\\' separators
    newpath = os.path.join(currentpath, 'Tickers', ticker.upper())
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    # Local file name without extension; downloadfile appends .xlsx/.xls.
    localfilename = ticker + '_' + form[i] + '_' + dates[i]
    localfilename = os.path.join(newpath, localfilename)
    downloadfile(str(path[i]), filename, localfilename)