Skip to content

Commit

Permalink
Add link tools
Browse files Browse the repository at this point in the history
  • Loading branch information
defseg committed Oct 17, 2017
1 parent 346068b commit 34265d0
Show file tree
Hide file tree
Showing 5 changed files with 280 additions and 0 deletions.
50 changes: 50 additions & 0 deletions links/check_links.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from db import init_db
import datetime, urllib2, ssl

def store_result(id, code, msg, conn, sql):
    """Record the outcome of one link check in the checks table.

    id is the link's row id in links; code/msg are whatever check_link
    produced (code may be a negative pseudo-code). Commits immediately.
    """
    row = (datetime.datetime.now(), str(code), msg, int(id))
    sql.execute("INSERT INTO checks(datetime, code, message, link) VALUES(?,?,?,?)", row)
    conn.commit()

class NotHTTPException(Exception):
    """Raised by check_link for URLs whose scheme it does not handle (FTP)."""
    pass

def check_link(id, url):
    """Fetch *url* once and return (id, code, message).

    code is the HTTP status on success (or on an HTTPError); non-HTTP
    failures map to negative pseudo-codes:
      -1 urllib2.URLError (e.g. lookup failed)
      -2 ssl.CertificateError
      -3 socket.error
      -4 httplib.BadStatusLine
      -5 ValueError (e.g. malformed URL)
    message is str(exception) on failure, '' on success.

    Raises NotHTTPException for FTP links, which this checker does not
    support.
    """
    # Matches both "ftp://..." and bare "ftp.host/..." URLs.
    if url.startswith('ftp'):
        raise NotHTTPException("FTP links are currently unsupported")
    # Scheme-less URLs get an http:// prefix so urllib2 can open them.
    # (Dropped the stray debug print of the URL that was here.)
    if not url.startswith('http'):
        url = "http://" + url
    req = urllib2.Request(url)
    msg = ''
    try:
        res = urllib2.urlopen(req)
        code = res.getcode()
        res.close()  # the response object was leaked before
    except urllib2.HTTPError as e:
        code, msg = e.code, str(e)
    except urllib2.URLError as e:
        code, msg = -1, str(e)
    # ssl.CertificateError subclasses ValueError, so it must be listed
    # before the ValueError handler below.
    except ssl.CertificateError as e:
        code, msg = -2, str(e)
    except urllib2.socket.error as e:
        code, msg = -3, str(e)
    except urllib2.httplib.BadStatusLine as e:
        code, msg = -4, str(e)
    except ValueError as e:
        code, msg = -5, str(e)
    # Capturing str(e) inside each handler (instead of reading a shared
    # binding after the try) also survives Python 3's except-scope rules.
    return (id, code, msg)

if __name__ == "__main__":
    # Check every stored link once and record each result.
    conn, sql = init_db()
    links = conn.execute("SELECT id, url FROM links")
    for link in links:
        print(link)
        try:
            # NOTE(review): 'id' shadows the builtin; harmless here.
            id, code, msg = check_link(link[0], link[1])
        except NotHTTPException as e:
            # e.message is Python-2-only; under Python 3 this would need str(e).
            print("\t{0}".format(e.message))
            continue
        # Only failures are echoed to stdout; every result is stored.
        if code != 200:
            print("\t{0} {1}".format(code, msg))
        store_result(id, code, msg, conn, sql)
    # Redundant final commit: store_result already commits per insert.
    conn.commit()
    conn.close()
16 changes: 16 additions & 0 deletions links/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import sqlite3

def init_db():
    """Open (creating if necessary) ./links.sqlite and ensure the schema.

    Returns (connection, cursor). Tables: links (URLs), papers (source
    PDFs), links_papers (many-to-many join), and checks (one row per
    link-check result).
    """
    connection = sqlite3.connect('./links.sqlite')
    cursor = connection.cursor()
    schema = (
        'CREATE TABLE IF NOT EXISTS links (id INTEGER PRIMARY KEY, url VARCHAR(255) UNIQUE NOT NULL)',
        'CREATE TABLE IF NOT EXISTS papers (id INTEGER PRIMARY KEY, filename VARCHAR(255) UNIQUE NOT NULL)',
        'CREATE TABLE IF NOT EXISTS links_papers (id INTEGER PRIMARY KEY, link_id INTEGER NOT NULL, paper_id INTEGER NOT NULL, FOREIGN KEY(link_id) REFERENCES links(id), FOREIGN KEY(paper_id) REFERENCES papers(id))',
        'CREATE TABLE IF NOT EXISTS checks (id INTEGER PRIMARY KEY, datetime DATETIME NOT NULL, code INTEGER NOT NULL, message VARCHAR(255), link INTEGER NOT NULL, FOREIGN KEY(link) REFERENCES links(id))',
    )
    for statement in schema:
        cursor.execute(statement)
    return (connection, cursor)

if __name__ == '__main__':
    # Running this module directly just creates links.sqlite and its
    # schema (idempotent: the DDL uses IF NOT EXISTS).
    conn, sql = init_db()
    conn.commit()
    conn.close()
133 changes: 133 additions & 0 deletions links/link_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
from __future__ import division
from db import init_db

# Every link is either checked (HTTP(S)) or unchecked (FTP).
def num_links(sql):
    """Total number of distinct URLs in the links table."""
    row = sql.execute("SELECT COUNT(DISTINCT url) FROM links").fetchone()
    return row[0]

def num_checked_links(sql):
    """Number of distinct links that have at least one row in checks."""
    cursor = sql.execute("SELECT COUNT(DISTINCT checks.link) FROM checks")
    return cursor.fetchone()[0]

def num_unchecked_links(sql):
    """Number of links with no rows at all in checks (never checked)."""
    query = "SELECT COUNT(DISTINCT l.id) FROM links AS l WHERE NOT EXISTS(SELECT link FROM checks WHERE l.id = checks.link)"
    return sql.execute(query).fetchone()[0]

# Every checked link is either consistent (returns same HTTP status every check) or inconsistent.
def num_con_links(sql):
    # Count checked links that never produced two different codes. The
    # self-join subquery finds links with any pair of checks whose codes
    # differ; c1.code < c2.code suffices because such pairs are symmetric.
    return sql.execute("SELECT COUNT(DISTINCT checks.link) FROM checks WHERE checks.link NOT IN (\
SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code)").fetchone()[0]

def con_links(sql, code):
    # (link_id, url) rows for consistent links (one distinct status across
    # all their checks) whose status equals *code*.
    return sql.execute("SELECT DISTINCT checks.link, links.url FROM checks JOIN links ON checks.link = links.id WHERE checks.link NOT IN (\
SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code)\
AND checks.code = ?", (code,)).fetchall()

def con_nw_links(sql): # Don't rely on this for counts: the same link may return a different message even if the code is the same.
    # Might want to fix this eventually: pull distinct link IDs from checks in subquery and get URL/code/message from joins onto that?
    # (link_id, url, code, message) rows for consistent links whose single
    # status code was not 200 -- i.e. links that consistently failed.
    return sql.execute("SELECT DISTINCT checks.link, links.url, checks.code, checks.message FROM checks JOIN links ON checks.link = links.id \
WHERE checks.link NOT IN\
(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code)\
AND checks.code <> 200").fetchall()

def num_con_nw_links(sql):
    # Count of consistent links whose single status code was not 200
    # (reliable counterpart to con_nw_links, which may duplicate rows
    # when messages differ).
    return sql.execute("SELECT COUNT(DISTINCT c.link) FROM checks AS c WHERE c.link NOT IN\
(SELECT DISTINCT c1.link FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code)\
AND c.code <> 200").fetchone()[0]

def incon_links(sql): # Don't rely on this for counts either; if one link returns 3+ distinct codes, those will show up as 2+ rows
    # (link_id, url, code_a, code_b) rows for links whose checks produced
    # at least two distinct codes; one row per unordered code pair.
    return sql.execute("SELECT DISTINCT c1.link, links.url, c1.code, c2.code FROM checks AS c1 JOIN links ON c1.link = links.id \
JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code ORDER BY c1.link").fetchall()

def num_incon_links(sql):
    """Number of links whose checks produced at least two distinct codes."""
    query = "SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code < c2.code"
    result = sql.execute(query).fetchone()
    return result[0]

# Every inconsistent link either works sometimes or doesn't.
def num_incon_w(sql):
    """Number of inconsistent links that returned 200 on at least one check."""
    query = "SELECT COUNT(DISTINCT c1.link) FROM checks AS c1 JOIN checks AS c2 ON c1.link = c2.link WHERE c1.code <> c2.code AND c1.code = 200"
    return sql.execute(query).fetchone()[0]

def num_incon_nw(sql): # unreasonably slow
    # Count of inconsistent links (2+ distinct codes) that never returned
    # a 200 on any check: the derived table 'incon' holds all inconsistent
    # links; the NOT EXISTS then drops any with a 200 somewhere.
    return sql.execute("SELECT COUNT(DISTINCT incon.link) FROM (SELECT c3.link, c3.code FROM checks AS c3 JOIN checks AS c4 ON c3.link = c4.link\
WHERE c3.code <> c4.code) incon\
WHERE NOT EXISTS(SELECT c1.link FROM checks AS c1 WHERE c1.code = 200 AND c1.link = incon.link)").fetchone()[0]

def c(a, b):
    """Express a as a percentage of b, rounded to two decimal places."""
    fraction = a / b
    return round(fraction * 100, 2)

def print_links(links):
    # Pretty-print (link_id, url, code, message) rows in aligned columns.
    # NOTE(review): assumes links is non-empty -- max() over an empty
    # sequence raises ValueError.
    max_url_len = max(len(x[1]) for x in links)
    for l in links:
        link_id, url, code, message = l
        # Trailing commas keep one row's fields on a single line (Py2 print).
        print "{0:<4}".format(link_id),
        print "{0:<3}".format(code),
        # Width spec built dynamically so every URL pads to the longest one.
        print "{{0:<{0}}}".format(max_url_len).format(url),
        print "\t{0}".format(message)

if __name__ == "__main__":
    import argparse
    # NOTE(review): the positional argument to ArgumentParser sets 'prog',
    # not 'description'.
    parser = argparse.ArgumentParser("Process data from links.sqlite.")
    parser.add_argument("-d", dest="detailed", action="store_const", const=True, default=False, help="Display detailed output.")
    parser.add_argument("-s", dest="sanity_check", action="store_const", const=True, default=False, help="Display sanity checks.")
    args = parser.parse_args()
    conn, sql = init_db()
    # Overall totals.
    total_links = num_links(sql)
    print "There are {0} links in the database.".format(total_links)
    num_checked = num_checked_links(sql)
    num_unchecked = num_unchecked_links(sql)
    print "{0} links were checked. {1} links weren't -- these are probably FTP.".format(num_checked, num_unchecked)
    # Consistent links: same status code on every check.
    num_con = num_con_links(sql)
    num_con_w = len(con_links(sql, 200))
    num_con_nw = num_con_nw_links(sql)
    print
    print "{0} checked links were consistent: they returned the same HTTP status code on every check.".format(num_con)
    print "\t{0} consistently worked.".format(num_con_w)
    # Trailing comma keeps the optional detail header on the same line.
    print "\t{0} consistently didn't.".format(num_con_nw),
    if args.detailed:
        print "Here they are:"
        print_links(con_nw_links(sql))
    else:
        print
    # Inconsistent links: 2+ distinct codes across checks.
    num_incon = num_incon_links(sql)
    # NOTE(review): these assignments rebind the module-level function
    # names num_incon_w / num_incon_nw to ints. Harmless because each is
    # called exactly once before rebinding, but fragile.
    num_incon_w = num_incon_w(sql)
    num_incon_nw = num_incon_nw(sql)
    print
    print "{0} checked links were inconsistent.".format(num_incon)
    print "\t{0} sometimes worked and sometimes didn't.".format(num_incon_w)
    print "\t{0} never worked, but varied as to why they didn't.".format(num_incon_nw)
    if args.detailed:
        print "Here are the inconsistent links:"
        print_links(incon_links(sql))
    print
    print "Here are the reasons consistent links failed:"
    # Negative codes are pseudo-codes assigned by check_links.check_link
    # for non-HTTP failures; positive codes are real HTTP statuses.
    err_msgs = {-5: "ValueError", -4: "httplib.BadStatusLine", -3: "socket.error", -2: "ssl.CertificateError", -1: "urllib2.URLError (lookup failed)", 200: "OK",\
    301: "Moved permanently (redirect)", 302: "Found", 401: "Unauthorized", 403: "Forbidden", 404: "Not found", 500: "Internal server error",\
    501: "Not implemented", 502: "Bad gateway", 503: "Service unavailable", 504: "Gateway timeout"}
    err_codes = sql.execute("SELECT DISTINCT code FROM checks ORDER BY code").fetchall()
    if args.sanity_check:
        running_total = 0
    for code_tuple in err_codes:
        code = code_tuple[0]
        if code == 200:
            continue
        links = con_links(sql, code)
        if args.detailed:
            print "\t{0}:".format(code)
            for l in links:
                print "\t\t{0:<4} {1}".format(*l)
        else:
            print "\t{0} {1}: {2}".format(code, err_msgs[code], len(links))
        if args.sanity_check:
            running_total += len(links)

    if args.sanity_check:
        # Cross-checks: the two right-aligned numbers on each line should match.
        num_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code = 200").fetchone()[0]
        num_not_working = sql.execute("SELECT COUNT(DISTINCT link) FROM checks WHERE code <> 200").fetchone()[0]
        print
        print "{0:>4} {1:>4} Links should either be checked or unchecked.".format(total_links, num_checked + num_unchecked)
        print "{0:>4} {1:>4} Checked links should either be consistent or be inconsistent.".format(num_checked, num_con + num_incon)
        print "{0:>4} {1:>4} Consistent links should either always work or never work.".format(num_con, num_con_w + num_con_nw)
        print "{0:>4} {1:>4} Inconsistent links should either work sometimes or never work.".format(num_incon, num_incon_w + num_incon_nw)
        print "{0:>4} {1:>4} Links that returned a 200 OK should be either consistent or inconsistent.".format(num_working, num_con_w + num_incon_w)
        print "{0:>4} {1:>4} Links that returned something other than a 200 OK should be either consistently not working or inconsistent.".format(num_not_working, \
            num_con_nw + num_incon_w + num_incon_nw)
        # NOTE(review): the third argument (total_links) here is unused --
        # the format string has only two placeholders.
        print "{0:>4} {1:>4} All the smallest buckets should add up to the total.".format(total_links, num_con_w + num_con_nw + num_incon_w + num_incon_nw + num_unchecked, total_links)
        print "{0:>4} {1:>4} As many consistent codes should error in total as error for any specific reason".format(num_con_nw, running_total)
Binary file added links/links.sqlite
Binary file not shown.
81 changes: 81 additions & 0 deletions links/process_pdfs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from db import init_db
import sys, codecs, pyPdf

def find_pdf_links(filename):
    """Return every URI annotation found in the PDF at *filename*.

    Propagates pyPdf.utils.PdfReadError on unreadable files (callers
    catch it). The file handle is always closed, and the PDF is opened
    in binary mode (it was previously opened in text mode and never
    closed).
    """
    pdf_file = open(filename, 'rb')
    try:
        pdf = pyPdf.PdfFileReader(pdf_file)
        pdf_links = []
        for i in range(pdf.getNumPages()):
            page = pdf.getPage(i).getObject()
            # 'in' replaces the deprecated dict.has_key.
            if '/Annots' in page:
                for annot_ref in page['/Annots']:
                    annot = annot_ref.getObject()
                    # Link annotations carry their target under /A -> /URI.
                    if '/A' in annot and '/URI' in annot['/A']:
                        pdf_links.append(annot['/A']['/URI'])
        return pdf_links
    finally:
        pdf_file.close()

def filter_links(links):
    """Drop publisher/indexing boilerplate links and email addresses.

    Removes links starting with any known journal/DOI/arXiv/ADS/mailto
    prefix, plus anything containing '@' (email addresses). Returns the
    surviving links in their original order.
    """
    # str.startswith accepts a tuple of prefixes, replacing the original
    # hand-rolled slice-comparison over (prefix, len) pairs.
    to_filter = ("http://www.aanda.org", "http://www.edpsciences.org",
                 "http://dexter.edpsciences.org", "http://dx.doi.org",
                 "http://linker.aanda.org", "http://arxiv.org",
                 "http://adsabs.harvard.edu", "http://ui.adsabs.harvard.edu",
                 "doi:", "DOI:", "mailto:", 'email:', "http://ascl.net", "ascl.net")
    return [l for l in links if not l.startswith(to_filter) and '@' not in l]

def add_link(link, filename, paper_id, conn, sql):
    """Associate *link* with the paper *paper_id*, inserting the URL if new.

    Inserts a links row the first time a URL is seen (committed
    immediately), then always inserts a links_papers join row (left for
    the caller to commit, matching the original behavior).

    NOTE: *filename* is unused; it is kept for interface compatibility
    with existing callers.
    """
    # Fetch the id directly -- the original selected url first and then
    # ran a second query for the id on the already-present path.
    sql.execute("SELECT id FROM links WHERE url = ?", (link,))
    row = sql.fetchone()
    if row:
        link_id = row[0]
    else:
        sql.execute("INSERT INTO links(url) VALUES (?)", (link,))
        conn.commit()
        link_id = sql.lastrowid
    sql.execute("INSERT INTO links_papers(link_id, paper_id) VALUES (?, ?)", (link_id, paper_id))

def unpathify(filename):
    """Strip any directory prefix and everything from the first dot on.

    E.g. 'papers/2017/abc.pdf' -> 'abc'. Splits on '/' literally (the
    same separator the callers use), not via os.path.
    """
    basename = filename.rpartition('/')[2]
    return basename.partition('.')[0]

def process_papers(filenames):
    """Extract and store links from each PDF path in *filenames*.

    Adds one papers row per new file, links rows for new URLs, and a
    links_papers row per (paper, link) pair, then prints a summary of
    processed files, added links, and unreadable files.
    """
    conn, sql = init_db()
    processed_files = 0
    added_links = 0
    errored_files = []
    for filename in filenames:
        print("Reading file {0}".format(filename))
        try:
            links = filter_links(find_pdf_links(filename))
        except pyPdf.utils.PdfReadError:
            print("Couldn't read file")
            errored_files.append(filename)
            continue
        filename_without_path = unpathify(filename)
        # Skip files already ingested. NOTE(review): this runs after the
        # expensive PDF parse above; checking before parsing would save work.
        sql.execute("SELECT filename FROM papers WHERE papers.filename = ?", (filename_without_path,))
        if sql.fetchone():
            continue
        sql.execute("INSERT INTO papers(filename) VALUES (?)", (filename_without_path,))
        conn.commit()
        paper_id = sql.lastrowid
        for link in links:
            print("\tAdding link {0}".format(link))
            add_link(link, filename, paper_id, conn, sql)
        processed_files += 1
        added_links += len(links)
    conn.commit()
    conn.close()
    print("--- Complete! ---")
    print("Processed {0} files, added {1} links".format(processed_files, added_links))
    if len(errored_files) == 0:
        print("No file read errors!")
    elif len(errored_files) == 1:
        print("1 file read error:\n\t{0}".format(errored_files[0]))
    else:
        print("{0} file read errors:".format(len(errored_files)))
        for f in errored_files:
            print("\t" + f)

if __name__ == "__main__":
    # Every command-line argument is treated as a path to a PDF to ingest.
    process_papers(sys.argv[1:])

0 comments on commit 34265d0

Please sign in to comment.