-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
5 changed files
with
280 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
from db import init_db | ||
import datetime, urllib2, ssl | ||
|
||
def store_result(id, code, msg, conn, sql):
    """Record one check outcome for link `id` in the checks table.

    The status code is stored as a string and the link id as an int;
    the insert is committed immediately.
    """
    row = (datetime.datetime.now(), str(code), msg, int(id))
    sql.execute("INSERT INTO checks(datetime, code, message, link) VALUES(?,?,?,?)", row)
    conn.commit()
|
||
class NotHTTPException(Exception):
    """Raised for links whose scheme this checker cannot probe (e.g. FTP)."""
    pass
|
||
def check_link(id, url):
    """Fetch `url` over HTTP(S) and classify the outcome.

    Returns (id, code, message): `code` is the HTTP status on success or
    HTTPError, otherwise a negative sentinel (-1 URLError, -2 certificate
    error, -3 socket error, -4 bad status line, -5 bad URL value);
    `message` is str() of the exception, or '' on a clean fetch.

    Raises NotHTTPException for FTP URLs, which urllib2 cannot check.
    """
    if url.startswith('ftp'):
        raise NotHTTPException("FTP links are currently unsupported")
    if not url.startswith('http'):
        # Bare host names get a scheme prepended so urllib2 can fetch them.
        print(url)
        url = "http://" + url
    req = urllib2.Request(url)
    msg = ''
    res = None
    try:
        res = urllib2.urlopen(req)
        code = res.getcode()
    except urllib2.HTTPError as e:
        code, msg = e.code, str(e)
    except urllib2.URLError as e:
        code, msg = -1, str(e)
    except ssl.CertificateError as e:
        # Must precede ValueError: CertificateError subclasses ValueError.
        code, msg = -2, str(e)
    except urllib2.socket.error as e:  # reaches into urllib2's internal import; kept for compatibility
        code, msg = -3, str(e)
    except urllib2.httplib.BadStatusLine as e:
        code, msg = -4, str(e)
    except ValueError as e:
        code, msg = -5, str(e)
    finally:
        # BUG FIX: the original never closed the response object.
        if res is not None:
            res.close()
    return (id, code, msg)
|
||
if __name__ == "__main__":
    # Walk every stored link, probe it over HTTP, and record each result.
    # NOTE(review): indentation below is reconstructed from a flattened diff;
    # store_result is assumed unconditional (the analysis script counts 200s
    # from `checks`, so successes must be recorded too) — confirm upstream.
    conn, sql = init_db()
    links = conn.execute("SELECT id, url FROM links")
    for link in links:
        print(link)
        try:
            id, code, msg = check_link(link[0], link[1])
        except NotHTTPException as e:
            # Non-HTTP links (FTP) are reported but not stored.
            print("\t{0}".format(e.message))
            continue
        if code != 200:
            # Echo failures to the console; successes stay quiet.
            print("\t{0} {1}".format(code, msg))
        store_result(id, code, msg, conn, sql)
    # Final flush (store_result already commits per row) and cleanup.
    conn.commit()
    conn.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
import sqlite3 | ||
|
||
def init_db(sqlite_db_path='./links.sqlite'):
    """Open (creating if needed) the link-checker database and ensure its schema.

    Args:
        sqlite_db_path: path to the SQLite file. Defaults to the previously
            hard-coded './links.sqlite'; pass ':memory:' for a throwaway DB.

    Returns:
        (connection, cursor) tuple; the caller owns commit/close.
    """
    conn = sqlite3.connect(sqlite_db_path)
    sql = conn.cursor()
    # links: one row per distinct URL harvested from the papers.
    sql.execute('CREATE TABLE IF NOT EXISTS links (id INTEGER PRIMARY KEY, url VARCHAR(255) UNIQUE NOT NULL)')
    # papers: one row per processed PDF (filename without path/extension).
    sql.execute('CREATE TABLE IF NOT EXISTS papers (id INTEGER PRIMARY KEY, filename VARCHAR(255) UNIQUE NOT NULL)')
    # links_papers: many-to-many join between links and papers.
    sql.execute('CREATE TABLE IF NOT EXISTS links_papers (id INTEGER PRIMARY KEY, link_id INTEGER NOT NULL, paper_id INTEGER NOT NULL, FOREIGN KEY(link_id) REFERENCES links(id), FOREIGN KEY(paper_id) REFERENCES papers(id))')
    # checks: one row per HTTP probe of a link (code may be a negative sentinel).
    sql.execute('CREATE TABLE IF NOT EXISTS checks (id INTEGER PRIMARY KEY, datetime DATETIME NOT NULL, code INTEGER NOT NULL, message VARCHAR(255), link INTEGER NOT NULL, FOREIGN KEY(link) REFERENCES links(id))')
    return (conn, sql)
|
||
if __name__ == '__main__':
    # Running this module directly just materializes the schema on disk.
    connection, cursor = init_db()
    connection.commit()
    connection.close()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
from __future__ import division | ||
from db import init_db | ||
|
||
# Every link is either checked (HTTP(S)) or unchecked (FTP). | ||
def num_links(sql):
    """Total number of distinct URLs stored in the links table."""
    row = sql.execute("SELECT COUNT(DISTINCT url) FROM links").fetchone()
    return row[0]
|
||
def num_checked_links(sql):
    """Number of distinct links that have at least one recorded check."""
    row = sql.execute("SELECT COUNT(DISTINCT checks.link) FROM checks").fetchone()
    return row[0]
|
||
def num_unchecked_links(sql):
    """Number of links that have no rows at all in checks (never probed)."""
    query = ("SELECT COUNT(DISTINCT l.id) FROM links AS l "
             "WHERE NOT EXISTS(SELECT link FROM checks WHERE l.id = checks.link)")
    return sql.execute(query).fetchone()[0]
|
||
# Every checked link is either consistent (returns same HTTP status every check) or inconsistent. | ||
def num_con_links(sql):
    """Number of checked links that returned the same code on every check."""
    query = (
        "SELECT COUNT(DISTINCT checks.link) FROM checks WHERE checks.link NOT IN ("
        "SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 "
        "ON c1.link = c2.link WHERE c1.code < c2.code)")
    (count,) = sql.execute(query).fetchone()
    return count
|
||
def con_links(sql, code):
    """(link id, url) pairs for consistent links whose every check returned `code`."""
    query = (
        "SELECT DISTINCT checks.link, links.url FROM checks "
        "JOIN links ON checks.link = links.id WHERE checks.link NOT IN ("
        "SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 "
        "ON c1.link = c2.link WHERE c1.code < c2.code) "
        "AND checks.code = ?")
    return sql.execute(query, (code,)).fetchall()
|
||
def con_nw_links(sql):
    """(link, url, code, message) rows for consistently-failing links.

    Don't rely on this for counts: the same link may return a different
    message even if the code is the same.
    """
    # Might want to fix this eventually: pull distinct link IDs from checks
    # in a subquery and get URL/code/message from joins onto that?
    query = (
        "SELECT DISTINCT checks.link, links.url, checks.code, checks.message "
        "FROM checks JOIN links ON checks.link = links.id "
        "WHERE checks.link NOT IN "
        "(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 "
        "ON c1.link = c2.link WHERE c1.code <> c2.code) "
        "AND checks.code <> 200")
    return sql.execute(query).fetchall()
|
||
def num_con_nw_links(sql):
    """Number of consistent links whose (single) status code was not 200."""
    query = (
        "SELECT COUNT(DISTINCT c.link) FROM checks AS c WHERE c.link NOT IN "
        "(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 "
        "ON c1.link = c2.link WHERE c1.code <> c2.code) "
        "AND c.code <> 200")
    return sql.execute(query).fetchone()[0]
|
||
def incon_links(sql):
    """(link, url, lower code, higher code) rows for inconsistent links.

    Don't rely on this for counts either; if one link returns 3+ distinct
    codes, those will show up as 2+ rows.
    """
    query = (
        "SELECT DISTINCT c1.link, links.url, c1.code, c2.code FROM checks AS c1 "
        "JOIN links ON c1.link = links.id "
        "JOIN checks AS c2 ON c1.link = c2.link "
        "WHERE c1.code < c2.code ORDER BY c1.link")
    return sql.execute(query).fetchall()
|
||
def num_incon_links(sql):
    """Number of links that returned at least two different status codes."""
    query = ("SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 "
             "JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code")
    (count,) = sql.execute(query).fetchone()
    return count
|
||
# Every inconsistent link either works sometimes or doesn't. | ||
def num_incon_w(sql):
    """Number of inconsistent links that returned a 200 on at least one check."""
    query = ("SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 "
             "JOIN checks AS c2 ON c1.link = c2.link "
             "WHERE c1.code <> c2.code AND c1.code = 200")
    return sql.execute(query).fetchone()[0]
|
||
def num_incon_nw(sql):
    """Number of inconsistent links that never returned a 200 on any check."""
    # NOTE: original author flagged this query as unreasonably slow.
    query = (
        "SELECT COUNT(DISTINCT incon.link) FROM "
        "(SELECT c3.link, c3.code FROM checks AS c3 JOIN checks AS c4 "
        "ON c3.link = c4.link WHERE c3.code <> c4.code) incon "
        "WHERE NOT EXISTS(SELECT c1.link FROM checks AS c1 "
        "WHERE c1.code = 200 AND c1.link = incon.link)")
    return sql.execute(query).fetchone()[0]
|
||
# percentize | ||
def c(a, b):
    """Express a as a percentage of b, rounded to two decimal places."""
    # float(b) keeps true division even without the module's __future__ import.
    return round(a / float(b) * 100, 2)
|
||
def print_links(links):
    # Pretty-print rows of (id, url, code, message) in aligned columns.
    # `links` is a sequence of 4-tuples as produced by con_nw_links();
    # NOTE(review): incon_links() rows put a second status code in the
    # "message" slot — confirm that's intended when called with those.
    # Raises ValueError on an empty sequence (max() of nothing).
    max_url_len = max(len(x[1]) for x in links)  # widest URL fixes the column width
    for l in links:
        link_id, url, code, message = l
        print "{0:<4}".format(link_id),   # trailing comma: keep printing on this line
        print "{0:<3}".format(code),
        print "{{0:<{0}}}".format(max_url_len).format(url),  # nested format builds the pad-width spec
        print "\t{0}".format(message)
|
||
if __name__ == "__main__": | ||
import argparse | ||
parser = argparse.ArgumentParser("Process data from links.sqlite.") | ||
parser.add_argument("-d", dest="detailed", action="store_const", const=True, default=False, help="Display detailed output.") | ||
parser.add_argument("-s", dest="sanity_check", action="store_const", const=True, default=False, help="Display sanity checks.") | ||
args = parser.parse_args() | ||
conn, sql = init_db() | ||
total_links = num_links(sql) | ||
print "There are {0} links in the database.".format(total_links) | ||
num_checked = num_checked_links(sql) | ||
num_unchecked = num_unchecked_links(sql) | ||
print "{0} links were checked. {1} links weren't -- these are probably FTP.".format(num_checked, num_unchecked) | ||
num_con = num_con_links(sql) | ||
num_con_w = len(con_links(sql, 200)) | ||
num_con_nw = num_con_nw_links(sql) | ||
print "{0} checked links were consistent: they returned the same HTTP status code on every check.".format(num_con) | ||
print "\t{0} consistently worked.".format(num_con_w) | ||
print "\t{0} consistently didn't.".format(num_con_nw), | ||
if args.detailed: | ||
print "Here they are:" | ||
print_links(con_nw_links(sql)) | ||
else: | ||
num_incon = num_incon_links(sql) | ||
num_incon_w = num_incon_w(sql) | ||
num_incon_nw = num_incon_nw(sql) | ||
print "{0} checked links were inconsistent.".format(num_incon) | ||
print "\t{0} sometimes worked and sometimes didn't.".format(num_incon_w) | ||
print "\t{0} never worked, but varied as to why they didn't.".format(num_incon_nw) | ||
if args.detailed: | ||
print "Here are the inconsistent links:" | ||
print_links(incon_links(sql)) | ||
print "Here are the reasons consistent links failed:" | ||
err_msgs = {-5: "ValueError", -4: "httplib.BadStatusLine", -3: "socket.error", -2: "ssl.CertificateError", -1: "urllib2.URLError (lookup failed)", 200: "OK",\ | ||
301: "Moved permanently (redirect)", 302: "Found", 401: "Unauthorized", 403: "Forbidden", 404: "Not found", 500: "Internal server error",\ | ||
501: "Not implemented", 502: "Bad gateway", 503: "Service unavailable", 504: "Gateway timeout"} | ||
err_codes = sql.execute("SELECT DISTINCT code FROM checks ORDER BY code").fetchall() | ||
if args.sanity_check: | ||
running_total = 0 | ||
for code_tuple in err_codes: | ||
code = code_tuple[0] | ||
if code == 200: | ||
continue | ||
links = con_links(sql, code) | ||
if args.detailed: | ||
print "\t{0}:".format(code) | ||
for l in links: | ||
print "\t\t{0:<4} {1}".format(*l) | ||
else: | ||
print "\t{0} {1}: {2}".format(code, err_msgs[code], len(links)) | ||
if args.sanity_check: | ||
running_total += len(links) | ||
|
||
if args.sanity_check: | ||
num_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code = 200").fetchone()[0] | ||
num_not_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code <> 200").fetchone()[0] | ||
print "{0:>4} {1:>4} Links should either be checked or unchecked.".format(total_links, num_checked + num_unchecked) | ||
print "{0:>4} {1:>4} Checked links should either be consistent or be inconsistent.".format(num_checked, num_con + num_incon) | ||
print "{0:>4} {1:>4} Consistent links should either always work or never work.".format(num_con, num_con_w + num_con_nw) | ||
print "{0:>4} {1:>4} Inconsistent links should either work sometimes or never work.".format(num_incon, num_incon_w + num_incon_nw) | ||
print "{0:>4} {1:>4} Links that returned a 200 OK should be either consistent or inconsistent.".format(num_working, num_con_w + num_incon_w) | ||
print "{0:>4} {1:>4} Links that returned something other than a 200 OK should be either consistently not working or inconsistent.".format(num_not_working, \ | ||
num_con_nw + num_incon_w + num_incon_nw) | ||
print "{0:>4} {1:>4} All the smallest buckets should add up to the total.".format(total_links, num_con_w + num_con_nw + num_incon_w + num_incon_nw + num_unchecked, total_links) | ||
print "{0:>4} {1:>4} As many consistent codes should error in total as error for any specific reason".format(num_con_nw, running_total) |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from db import init_db | ||
import sys, codecs, pyPdf | ||
|
||
def find_pdf_links(filename):
    """Extract every URI annotation from the PDF at `filename`.

    Returns a list of URI strings in page order. Propagates
    pyPdf.utils.PdfReadError if the file is not a readable PDF.
    """
    # BUG FIX: open in binary mode (PDFs are binary; text mode can corrupt
    # reads on some platforms) and always close the handle.
    pdf_file = open(filename, 'rb')
    try:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pdf_links = []
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i).getObject()
            # Link annotations live under /Annots; the target URI is /A -> /URI.
            if '/Annots' in page:
                for k in page['/Annots']:
                    annot = k.getObject()
                    if '/A' in annot and '/URI' in annot['/A']:
                        pdf_links.append(annot['/A']['/URI'])
        return pdf_links
    finally:
        pdf_file.close()
|
||
def filter_links(links):
    """Drop publisher/indexing boilerplate URLs and e-mail-like strings.

    Keeps only links that do not start with a known non-content prefix
    (journal sites, DOI resolvers, arXiv/ADS, mailto:, ...) and that do
    not contain an '@'.
    """
    # str.startswith accepts a tuple of prefixes, replacing the original
    # hand-rolled slice comparison against zipped (prefix, length) pairs.
    to_filter = ("http://www.aanda.org", "http://www.edpsciences.org",
                 "http://dexter.edpsciences.org", "http://dx.doi.org",
                 "http://linker.aanda.org", "http://arxiv.org",
                 "http://adsabs.harvard.edu", "http://ui.adsabs.harvard.edu",
                 "doi:", "DOI:", "mailto:", 'email:', "http://ascl.net", "ascl.net")
    return [l for l in links if not l.startswith(to_filter) and '@' not in l]
|
||
def add_link(link, filename, paper_id, conn, sql):
    """Associate `link` with the paper `paper_id`, inserting the URL if new.

    Note: `filename` is currently unused but kept for interface
    compatibility with existing callers.
    """
    # Single id lookup replaces the original's SELECT url + second SELECT id.
    sql.execute("SELECT id FROM links WHERE url = ?", (link,))
    row = sql.fetchone()
    if row:
        link_id = row[0]
    else:
        sql.execute("INSERT INTO links(url) VALUES (?)", (link,))
        conn.commit()
        link_id = sql.lastrowid
    sql.execute("INSERT INTO links_papers(link_id, paper_id) VALUES (?, ?)", (link_id, paper_id))
|
||
def unpathify(filename):
    """Strip any directory prefix and everything from the first dot on."""
    return filename.rsplit('/', 1)[-1].partition('.')[0]
|
||
def process_papers(filenames):
    """Harvest links from each PDF in `filenames` into the database.

    Prints progress per file and a summary at the end. Papers already in
    the `papers` table are skipped; unreadable PDFs are reported.
    """
    conn, sql = init_db()
    processed_files = 0
    added_links = 0
    errored_files = []
    for filename in filenames:
        print("Reading file {0}".format(filename))
        filename_without_path = unpathify(filename)
        # PERF FIX: skip already-recorded papers BEFORE parsing. The original
        # parsed the whole PDF first and only then discovered the duplicate.
        sql.execute("SELECT filename FROM papers WHERE papers.filename = ?", (filename_without_path,))
        if sql.fetchone():
            continue
        try:
            links = filter_links(find_pdf_links(filename))
        except pyPdf.utils.PdfReadError:
            print("Couldn't read file")
            errored_files.append(filename)
            continue
        sql.execute("INSERT INTO papers(filename) VALUES (?)", (filename_without_path,))
        conn.commit()
        paper_id = sql.lastrowid
        for link in links:
            print("\tAdding link {0}".format(link))
            add_link(link, filename, paper_id, conn, sql)
        processed_files += 1
        added_links += len(links)
    conn.commit()
    conn.close()
    # Summary.
    print("--- Complete! ---")
    print("Processed {0} files, added {1} links".format(processed_files, added_links))
    if len(errored_files) == 0:
        print("No file read errors!")
    elif len(errored_files) == 1:
        print("1 file read error:\n\t{0}".format(errored_files[0]))
    else:
        print("{0} file read errors:".format(len(errored_files)))
        for f in errored_files:
            print("\t" + f)
|
||
if __name__ == "__main__":
    # CLI entry point: each command-line argument is a path to a PDF to harvest.
    process_papers(sys.argv[1:])