Skip to content

Commit

Permalink
Add link tools
Browse files Browse the repository at this point in the history
  • Loading branch information
defseg committed Oct 17, 2017
1 parent 346068b commit 34265d0
Show file tree
Hide file tree
Showing 5 changed files with 280 additions and 0 deletions.
50 changes: 50 additions & 0 deletions links/check_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from db import init_db
import datetime, urllib2, ssl

def store_result(id, code, msg, conn, sql):
    """Record the outcome of one link check in the checks table.

    id is the link's row id in links; code/msg are whatever check_link
    produced (code may be a negative pseudo-code). Commits immediately.
    """
    row = (datetime.datetime.now(), str(code), msg, int(id))
    sql.execute("INSERT INTO checks(datetime, code, message, link) VALUES(?,?,?,?)", row)
    conn.commit()

class NotHTTPException(Exception):
    """Raised by check_link for URLs whose scheme it does not handle (FTP)."""
    pass

def check_link(id, url):
    """Fetch *url* once and return (id, code, message).

    code is the HTTP status on success (or on an HTTPError); non-HTTP
    failures map to negative pseudo-codes:
      -1 urllib2.URLError (e.g. lookup failed)
      -2 ssl.CertificateError
      -3 socket.error
      -4 httplib.BadStatusLine
      -5 ValueError (e.g. malformed URL)
    message is str(exception) on failure, '' on success.

    Raises NotHTTPException for FTP links, which this checker does not
    support.
    """
    # Matches both "ftp://..." and bare "ftp.host/..." URLs.
    if url.startswith('ftp'):
        raise NotHTTPException("FTP links are currently unsupported")
    # Scheme-less URLs get an http:// prefix so urllib2 can open them.
    # (Dropped the stray debug print of the URL that was here.)
    if not url.startswith('http'):
        url = "http://" + url
    req = urllib2.Request(url)
    msg = ''
    try:
        res = urllib2.urlopen(req)
        code = res.getcode()
        res.close()  # the response object was leaked before
    except urllib2.HTTPError as e:
        code, msg = e.code, str(e)
    except urllib2.URLError as e:
        code, msg = -1, str(e)
    # ssl.CertificateError subclasses ValueError, so it must be listed
    # before the ValueError handler below.
    except ssl.CertificateError as e:
        code, msg = -2, str(e)
    except urllib2.socket.error as e:
        code, msg = -3, str(e)
    except urllib2.httplib.BadStatusLine as e:
        code, msg = -4, str(e)
    except ValueError as e:
        code, msg = -5, str(e)
    # Capturing str(e) inside each handler (instead of reading a shared
    # binding after the try) also survives Python 3's except-scope rules.
    return (id, code, msg)

if __name__ == "__main__":
    # Check every stored link once and record each result.
    conn, sql = init_db()
    links = conn.execute("SELECT id, url FROM links")
    for link in links:
        print(link)
        try:
            # NOTE(review): 'id' shadows the builtin; harmless here.
            id, code, msg = check_link(link[0], link[1])
        except NotHTTPException as e:
            # e.message is Python-2-only; under Python 3 this would need str(e).
            print("\t{0}".format(e.message))
            continue
        # Only failures are echoed to stdout; every result is stored.
        if code != 200:
            print("\t{0} {1}".format(code, msg))
        store_result(id, code, msg, conn, sql)
    # Redundant final commit: store_result already commits per insert.
    conn.commit()
    conn.close()
16 changes: 16 additions & 0 deletions links/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sqlite3

def init_db():
    """Open (creating if necessary) ./links.sqlite and ensure the schema.

    Returns (connection, cursor). Tables: links (URLs), papers (source
    PDFs), links_papers (many-to-many join), and checks (one row per
    link-check result).
    """
    connection = sqlite3.connect('./links.sqlite')
    cursor = connection.cursor()
    schema = (
        'CREATE TABLE IF NOT EXISTS links (id INTEGER PRIMARY KEY, url VARCHAR(255) UNIQUE NOT NULL)',
        'CREATE TABLE IF NOT EXISTS papers (id INTEGER PRIMARY KEY, filename VARCHAR(255) UNIQUE NOT NULL)',
        'CREATE TABLE IF NOT EXISTS links_papers (id INTEGER PRIMARY KEY, link_id INTEGER NOT NULL, paper_id INTEGER NOT NULL, FOREIGN KEY(link_id) REFERENCES links(id), FOREIGN KEY(paper_id) REFERENCES papers(id))',
        'CREATE TABLE IF NOT EXISTS checks (id INTEGER PRIMARY KEY, datetime DATETIME NOT NULL, code INTEGER NOT NULL, message VARCHAR(255), link INTEGER NOT NULL, FOREIGN KEY(link) REFERENCES links(id))',
    )
    for statement in schema:
        cursor.execute(statement)
    return (connection, cursor)

if __name__ == '__main__':
    # Running this module directly just creates links.sqlite and its
    # schema (idempotent: the DDL uses IF NOT EXISTS).
    conn, sql = init_db()
    conn.commit()
    conn.close()
133 changes: 133 additions & 0 deletions links/link_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from __future__ import division
from db import init_db

# Every link is either checked (HTTP(S)) or unchecked (FTP).
def num_links(sql):
    """Total number of distinct URLs in the links table."""
    row = sql.execute("SELECT COUNT(DISTINCT url) FROM links").fetchone()
    return row[0]

def num_checked_links(sql):
    """Number of distinct links that have at least one row in checks."""
    cursor = sql.execute("SELECT COUNT(DISTINCT checks.link) FROM checks")
    return cursor.fetchone()[0]

def num_unchecked_links(sql):
    """Number of links with no rows at all in checks (never checked)."""
    query = "SELECT COUNT(DISTINCT l.id) FROM links AS l WHERE NOT EXISTS(SELECT link FROM checks WHERE l.id = checks.link)"
    return sql.execute(query).fetchone()[0]

# Every checked link is either consistent (returns same HTTP status every check) or inconsistent.
def num_con_links(sql):
    # Count checked links that never produced two different codes. The
    # self-join subquery finds links with any pair of checks whose codes
    # differ; c1.code < c2.code suffices because such pairs are symmetric.
    return sql.execute("SELECT COUNT(DISTINCT checks.link) FROM checks WHERE checks.link NOT IN (\
SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code)").fetchone()[0]

def con_links(sql, code):
    # (link_id, url) rows for consistent links (one distinct status across
    # all their checks) whose status equals *code*.
    return sql.execute("SELECT DISTINCT checks.link, links.url FROM checks JOIN links ON checks.link = links.id WHERE checks.link NOT IN (\
SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code)\
AND checks.code = ?", (code,)).fetchall()

def con_nw_links(sql): # Don't rely on this for counts: the same link may return a different message even if the code is the same.
    # Might want to fix this eventually: pull distinct link IDs from checks in subquery and get URL/code/message from joins onto that?
    # (link_id, url, code, message) rows for consistent links whose single
    # status code was not 200 -- i.e. links that consistently failed.
    return sql.execute("SELECT DISTINCT checks.link, links.url, checks.code, checks.message FROM checks JOIN links ON checks.link = links.id \
WHERE checks.link NOT IN\
(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code)\
AND checks.code <> 200").fetchall()

def num_con_nw_links(sql):
    # Count of consistent links whose single status code was not 200
    # (reliable counterpart to con_nw_links, which may duplicate rows
    # when messages differ).
    return sql.execute("SELECT COUNT(DISTINCT c.link) FROM checks AS c WHERE c.link NOT IN\
(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code)\
AND c.code <> 200").fetchone()[0]

def incon_links(sql): # Don't rely on this for counts either; if one link returns 3+ distinct codes, those will show up as 2+ rows
    # (link_id, url, code_a, code_b) rows for links whose checks produced
    # at least two distinct codes; one row per unordered code pair.
    return sql.execute("SELECT DISTINCT c1.link, links.url, c1.code, c2.code FROM checks AS c1 JOIN links ON c1.link = links.id \
JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code ORDER BY c1.link").fetchall()

def num_incon_links(sql):
    """Number of links whose checks produced at least two distinct codes."""
    query = "SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code"
    result = sql.execute(query).fetchone()
    return result[0]

# Every inconsistent link either works sometimes or doesn't.
def num_incon_w(sql):
    """Number of inconsistent links that returned 200 on at least one check."""
    query = "SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code AND c1.code = 200"
    return sql.execute(query).fetchone()[0]

def num_incon_nw(sql): # unreasonably slow
    # Count of inconsistent links (2+ distinct codes) that never returned
    # a 200 on any check: the derived table 'incon' holds all inconsistent
    # links; the NOT EXISTS then drops any with a 200 somewhere.
    return sql.execute("SELECT COUNT(DISTINCT incon.link) FROM (SELECT c3.link, c3.code FROM checks AS c3 JOIN checks AS c4 ON c3.link = c4.link\
WHERE c3.code <> c4.code) incon\
WHERE NOT EXISTS(SELECT c1.link FROM checks AS c1 WHERE c1.code = 200 AND c1.link = incon.link)").fetchone()[0]

def c(a, b):
    """Express a as a percentage of b, rounded to two decimal places."""
    fraction = a / b
    return round(fraction * 100, 2)

def print_links(links):
    # Pretty-print (link_id, url, code, message) rows in aligned columns.
    # NOTE(review): assumes links is non-empty -- max() over an empty
    # sequence raises ValueError.
    max_url_len = max(len(x[1]) for x in links)
    for l in links:
        link_id, url, code, message = l
        # Trailing commas keep one row's fields on a single line (Py2 print).
        print "{0:<4}".format(link_id),
        print "{0:<3}".format(code),
        # Width spec built dynamically so every URL pads to the longest one.
        print "{{0:<{0}}}".format(max_url_len).format(url),
        print "\t{0}".format(message)

if __name__ == "__main__":
    import argparse
    # NOTE(review): the positional argument to ArgumentParser sets 'prog',
    # not 'description'.
    parser = argparse.ArgumentParser("Process data from links.sqlite.")
    parser.add_argument("-d", dest="detailed", action="store_const", const=True, default=False, help="Display detailed output.")
    parser.add_argument("-s", dest="sanity_check", action="store_const", const=True, default=False, help="Display sanity checks.")
    args = parser.parse_args()
    conn, sql = init_db()
    # Overall totals.
    total_links = num_links(sql)
    print "There are {0} links in the database.".format(total_links)
    num_checked = num_checked_links(sql)
    num_unchecked = num_unchecked_links(sql)
    print "{0} links were checked. {1} links weren't -- these are probably FTP.".format(num_checked, num_unchecked)
    # Consistent links: same status code on every check.
    num_con = num_con_links(sql)
    num_con_w = len(con_links(sql, 200))
    num_con_nw = num_con_nw_links(sql)
    print
    print "{0} checked links were consistent: they returned the same HTTP status code on every check.".format(num_con)
    print "\t{0} consistently worked.".format(num_con_w)
    # Trailing comma keeps the optional detail header on the same line.
    print "\t{0} consistently didn't.".format(num_con_nw),
    if args.detailed:
        print "Here they are:"
        print_links(con_nw_links(sql))
    else:
        print
    # Inconsistent links: 2+ distinct codes across checks.
    num_incon = num_incon_links(sql)
    # NOTE(review): these assignments rebind the module-level function
    # names num_incon_w / num_incon_nw to ints. Harmless because each is
    # called exactly once before rebinding, but fragile.
    num_incon_w = num_incon_w(sql)
    num_incon_nw = num_incon_nw(sql)
    print
    print "{0} checked links were inconsistent.".format(num_incon)
    print "\t{0} sometimes worked and sometimes didn't.".format(num_incon_w)
    print "\t{0} never worked, but varied as to why they didn't.".format(num_incon_nw)
    if args.detailed:
        print "Here are the inconsistent links:"
        print_links(incon_links(sql))
    print
    print "Here are the reasons consistent links failed:"
    # Negative codes are pseudo-codes assigned by check_links.check_link
    # for non-HTTP failures; positive codes are real HTTP statuses.
    err_msgs = {-5: "ValueError", -4: "httplib.BadStatusLine", -3: "socket.error", -2: "ssl.CertificateError", -1: "urllib2.URLError (lookup failed)", 200: "OK",\
    301: "Moved permanently (redirect)", 302: "Found", 401: "Unauthorized", 403: "Forbidden", 404: "Not found", 500: "Internal server error",\
    501: "Not implemented", 502: "Bad gateway", 503: "Service unavailable", 504: "Gateway timeout"}
    err_codes = sql.execute("SELECT DISTINCT code FROM checks ORDER BY code").fetchall()
    if args.sanity_check:
        running_total = 0
    for code_tuple in err_codes:
        code = code_tuple[0]
        if code == 200:
            continue
        links = con_links(sql, code)
        if args.detailed:
            print "\t{0}:".format(code)
            for l in links:
                print "\t\t{0:<4} {1}".format(*l)
        else:
            print "\t{0} {1}: {2}".format(code, err_msgs[code], len(links))
        if args.sanity_check:
            running_total += len(links)

    if args.sanity_check:
        # Cross-checks: the two right-aligned numbers on each line should match.
        num_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code = 200").fetchone()[0]
        num_not_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code <> 200").fetchone()[0]
        print
        print "{0:>4} {1:>4} Links should either be checked or unchecked.".format(total_links, num_checked + num_unchecked)
        print "{0:>4} {1:>4} Checked links should either be consistent or be inconsistent.".format(num_checked, num_con + num_incon)
        print "{0:>4} {1:>4} Consistent links should either always work or never work.".format(num_con, num_con_w + num_con_nw)
        print "{0:>4} {1:>4} Inconsistent links should either work sometimes or never work.".format(num_incon, num_incon_w + num_incon_nw)
        print "{0:>4} {1:>4} Links that returned a 200 OK should be either consistent or inconsistent.".format(num_working, num_con_w + num_incon_w)
        print "{0:>4} {1:>4} Links that returned something other than a 200 OK should be either consistently not working or inconsistent.".format(num_not_working, \
            num_con_nw + num_incon_w + num_incon_nw)
        # NOTE(review): the third argument (total_links) here is unused --
        # the format string has only two placeholders.
        print "{0:>4} {1:>4} All the smallest buckets should add up to the total.".format(total_links, num_con_w + num_con_nw + num_incon_w + num_incon_nw + num_unchecked, total_links)
        print "{0:>4} {1:>4} As many consistent codes should error in total as error for any specific reason".format(num_con_nw, running_total)
Binary file added links/links.sqlite
Binary file not shown.
81 changes: 81 additions & 0 deletions links/process_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from db import init_db
import sys, codecs, pyPdf

def find_pdf_links(filename):
    """Return every URI annotation found in the PDF at *filename*.

    Propagates pyPdf.utils.PdfReadError on unreadable files (callers
    catch it). The file handle is always closed, and the PDF is opened
    in binary mode (it was previously opened in text mode and never
    closed).
    """
    pdf_file = open(filename, 'rb')
    try:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pdf_links = []
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i).getObject()
            # 'in' replaces the deprecated dict.has_key.
            if '/Annots' in page:
                for annot_ref in page['/Annots']:
                    annot = annot_ref.getObject()
                    # Link annotations carry their target under /A -> /URI.
                    if '/A' in annot and '/URI' in annot['/A']:
                        pdf_links.append(annot['/A']['/URI'])
        return pdf_links
    finally:
        pdf_file.close()

def filter_links(links):
    """Drop publisher/indexing boilerplate links and email addresses.

    Removes links starting with any known journal/DOI/arXiv/ADS/mailto
    prefix, plus anything containing '@' (email addresses). Returns the
    surviving links in their original order.
    """
    # str.startswith accepts a tuple of prefixes, replacing the original
    # hand-rolled slice-comparison over (prefix, len) pairs.
    to_filter = ("http://www.aanda.org", "http://www.edpsciences.org",
                 "http://dexter.edpsciences.org", "http://dx.doi.org",
                 "http://linker.aanda.org", "http://arxiv.org",
                 "http://adsabs.harvard.edu", "http://ui.adsabs.harvard.edu",
                 "doi:", "DOI:", "mailto:", 'email:', "http://ascl.net", "ascl.net")
    return [l for l in links if not l.startswith(to_filter) and '@' not in l]

def add_link(link, filename, paper_id, conn, sql):
    """Associate *link* with the paper *paper_id*, inserting the URL if new.

    Inserts a links row the first time a URL is seen (committed
    immediately), then always inserts a links_papers join row (left for
    the caller to commit, matching the original behavior).

    NOTE: *filename* is unused; it is kept for interface compatibility
    with existing callers.
    """
    # Fetch the id directly -- the original selected url first and then
    # ran a second query for the id on the already-present path.
    sql.execute("SELECT id FROM links WHERE url = ?", (link,))
    row = sql.fetchone()
    if row:
        link_id = row[0]
    else:
        sql.execute("INSERT INTO links(url) VALUES (?)", (link,))
        conn.commit()
        link_id = sql.lastrowid
    sql.execute("INSERT INTO links_papers(link_id, paper_id) VALUES (?, ?)", (link_id, paper_id))

def unpathify(filename):
    """Strip any directory prefix and everything from the first dot on.

    E.g. 'papers/2017/abc.pdf' -> 'abc'. Splits on '/' literally (the
    same separator the callers use), not via os.path.
    """
    basename = filename.rpartition('/')[2]
    return basename.partition('.')[0]

def process_papers(filenames):
    """Extract and store links from each PDF path in *filenames*.

    Adds one papers row per new file, links rows for new URLs, and a
    links_papers row per (paper, link) pair, then prints a summary of
    processed files, added links, and unreadable files.
    """
    conn, sql = init_db()
    processed_files = 0
    added_links = 0
    errored_files = []
    for filename in filenames:
        print("Reading file {0}".format(filename))
        try:
            links = filter_links(find_pdf_links(filename))
        except pyPdf.utils.PdfReadError:
            print("Couldn't read file")
            errored_files.append(filename)
            continue
        filename_without_path = unpathify(filename)
        # Skip files already ingested. NOTE(review): this runs after the
        # expensive PDF parse above; checking before parsing would save work.
        sql.execute("SELECT filename FROM papers WHERE papers.filename = ?", (filename_without_path,))
        if sql.fetchone():
            continue
        sql.execute("INSERT INTO papers(filename) VALUES (?)", (filename_without_path,))
        conn.commit()
        paper_id = sql.lastrowid
        for link in links:
            print("\tAdding link {0}".format(link))
            add_link(link, filename, paper_id, conn, sql)
        processed_files += 1
        added_links += len(links)
    conn.commit()
    conn.close()
    print("--- Complete! ---")
    print("Processed {0} files, added {1} links".format(processed_files, added_links))
    if len(errored_files) == 0:
        print("No file read errors!")
    elif len(errored_files) == 1:
        print("1 file read error:\n\t{0}".format(errored_files[0]))
    else:
        print("{0} file read errors:".format(len(errored_files)))
        for f in errored_files:
            print("\t" + f)

if __name__ == "__main__":
    # Every command-line argument is treated as a path to a PDF to ingest.
    process_papers(sys.argv[1:])

0 comments on commit 34265d0

Please sign in to comment.