Adds arxiv extractors with tests and refactors utility to be called 'mwcites'

halfak · halfak · commit 3d2cb9b7034f · 2015-04-06T11:57:05.000-05:00
diff --git a/README.rst b/README.rst
@@ -31,16 +31,17 @@ Documentation is provided ``$ mwcitations extract -h``.
      * PubMed
      * DOI
      * ISBN
-     
+     * arXiv
+
     Outputs a TSV file with the following fields:
 
      * page_id: The identifier of the Wikipedia article (int), e.g. 1325125
      * page_title: The title of the Wikipedia article (utf-8), e.g. Club cell
      * rev_id: The Wikipedia revision where the citation was first added (int),
                e.g. 282470030
-     * timestamp: The timestamp of the revision where the citation was first added.
-                  (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
-     * type: The type of identifier, e.g. pmid
+     * timestamp: The timestamp of the revision where the citation was first
+                  added. (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
+     * type: The type of identifier, e.g. pmid, pmcid, doi, isbn or arxiv
      * id: The id of the cited scholarly article (utf-8),
            e.g 10.1183/09031936.00213411
 
diff --git a/datasets/mw_dump_stub.xml b/datasets/mw_dump_stub.xml
@@ -111,6 +111,8 @@
       American Behavioral Scientist,
       0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;.  Hats pants and banana
       {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
+      [http://arxiv.org/abs/0706.0004v1]
+      [https://arxiv.org/abs/0706.0005v1]
       http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
       10.2387/234310.2347/39423</text>
       <sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -137,6 +139,7 @@
       [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
       [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
       {{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
+      [https://arxiv.org/abs/0706.0005v1]
       http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
       10.2387/234310.2347/39423</text>
       <sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>
diff --git a/mwcitations b/mwcitations
diff --git a/mwcites/extractors/arxiv.py b/mwcites/extractors/arxiv.py
@@ -0,0 +1,31 @@
+import re
+
+from ..identifier import Identifier
+
+# From http://arxiv.org/help/arxiv_identifier
+# Old scheme: an optional "archive[.subject_class]/" prefix followed by the
+# numeric part.  Archive names may be hyphenated (e.g. hep-th) and the subject
+# class is optional (e.g. math/0309001), so neither may be required.
+old_id = r"-?(?P<old_id>([a-z][a-z\-]*(\.[a-z]+)?/)?[0-9]{4}[0-9]+)"
+# New scheme: YYMM.NNNN with a literal dot.  The optional version suffix sits
+# outside the named group so that it is dropped from the extracted id.
+new_id = r"(?P<new_id>[0-9]{4}\.[0-9]+)(v[0-9]+)?"
+
+# Raw strings keep "\s" and "\." as regex escapes, not string escapes.
+prefixes = [r"arxiv\s*=\s*", r"//arxiv\.org/(abs/)?", r"arxiv:\s?"]
+
+ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) +
+                      r"({0}|{1})".format(old_id, new_id), re.I | re.U)
+
+
+def extract(text):
+    """
+    Generates an Identifier for every arXiv reference found in `text`.
+
+    An id is recognized after "arxiv=" (template parameter), "arxiv:" or an
+    "//arxiv.org/" URL.  Ids are lower-cased and carry no version suffix.
+    """
+    for match in ARXIV_RE.finditer(text):
+        # Only the alternative that matched sets its named group.
+        arxiv_id = match.group('new_id') or match.group("old_id")
+        yield Identifier("arxiv", arxiv_id.lower())
diff --git a/mwcites/extractors/doi.py b/mwcites/extractors/doi.py
@@ -5,7 +5,6 @@
 
 from ..identifier import Identifier
 
-
 DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')
 
 HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
diff --git a/mwcites/extractors/tests/test_arxiv.py b/mwcites/extractors/tests/test_arxiv.py
@@ -0,0 +1,43 @@
+import pprint
+
+from nose.tools import eq_
+
+from .. import arxiv
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+This is a doi randomly placed in the text 10.0000/m1
+Here's a typo that might be construed as a doi 10.60 people were there.
+{{cite|...|arxiv=0706.0001v1|pmid=10559875}}
+<ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012).
+The rise and decline of an open collaboration system: How Wikipedia’s
+reaction to popularity is causing its decline.
+American Behavioral Scientist,
+0002764212469365 arxiv:0706.0002v1</ref>.  Hats pants and banana
+[http://arxiv.org/0706.0003]
+[http://arxiv.org/abs/0706.0004v1]
+[https://arxiv.org/abs/0706.0005v1]
+[https://arxiv.org/abs/math.GT/0309001]
+[https://arxiv.org/abs/-math.gs/0309002]
+{{cite|...|arxiv=foobar.hats/0101003|issue=1656}}
+http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
+10.2387/234310.2347/39423
+<!--
+    10.2387/234310.2347/39423-->
+"""
+EXPECTED = [
+    Identifier('arxiv', "0706.0001"),
+    Identifier('arxiv', "0706.0002"),
+    Identifier('arxiv', "0706.0003"),
+    Identifier('arxiv', "0706.0004"),
+    Identifier('arxiv', "0706.0005"),
+    Identifier('arxiv', "math.gt/0309001"),
+    Identifier('arxiv', "math.gs/0309002"),
+    Identifier('arxiv', "foobar.hats/0101003")
+]
+
+def test_extract():
+    ids = list(arxiv.extract(INPUT_TEXT))  # extract() yields; materialize it
+    pprint.pprint(ids)  # both lists go to stdout to ease diffing on failure
+    pprint.pprint(EXPECTED)
+    eq_(ids, EXPECTED)  # list equality, so extraction order matters too
diff --git a/mwcites/mwcites.py b/mwcites/mwcites.py
@@ -7,9 +7,9 @@
 * extract -- Extracts citations from an XML database dump
 
 Usage:
-    mwcitations (-h | --help)
-    mwcitations <utility> [-h | --help]
-    
+    mwcites (-h | --help)
+    mwcites <utility> [-h | --help]
+
 Options:
     -h | --help  Shows this documentation
     <utility>    The name of the utility to run
@@ -22,12 +22,12 @@
 
 
 USAGE = """Usage:
-    mwcitations (-h | --help)
-    mwcitations <utility> [-h | --help]\n"""
+    mwcites (-h | --help)
+    mwcites <utility> [-h | --help]\n"""
 
 
 def main():
-    
+
     if len(sys.argv) < 2:
         sys.stderr.write(USAGE)
         sys.exit(1)
@@ -37,13 +37,13 @@ def main():
     elif sys.argv[1][:1] == "-":
         sys.stderr.write(USAGE)
         sys.exit(1)
-    
+
     module_name = sys.argv[1]
     try:
         module = import_module(".utilities." + module_name, package="mwcites")
     except ImportError:
         sys.stderr.write(traceback.format_exc())
         sys.stderr.write("Could not find utility {0}.\n".format(module_name))
         sys.exit(1)
-    
+
     module.main(sys.argv[2:])
diff --git a/mwcites/utilities/extract.py b/mwcites/utilities/extract.py
@@ -8,6 +8,7 @@
  * PubMed
  * DOI
  * ISBN
+ * arXiv
 
 Outputs a TSV file with the following fields:
 
@@ -17,7 +18,7 @@
            e.g. 282470030
  * timestamp: The timestamp of the revision where the citation was first added.
               (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
- * type: The type of identifier, e.g. pmid
+ * type: The type of identifier, e.g. pmid, pmcid, doi, isbn or arxiv
  * id: The id of the cited scholarly article (utf-8),
        e.g 10.1183/09031936.00213411
 
@@ -38,9 +39,9 @@
 import docopt
 from mw import xml_dump
 
-from ..extractors import doi, pubmed, isbn
+from ..extractors import arxiv, doi, isbn, pubmed
 
-ALL_EXTRACTORS = [doi, pubmed, isbn]
+ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
 
 HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
 
@@ -51,7 +52,9 @@ def main(argv=None):
     if args['--extractor'] == ['<all>']:
         extractors = ALL_EXTRACTORS
     else:
-        extractors = [import_from_path(path) for path in args['--extractor']]
+        # lower() must be called: import_from_path takes the path string.
+        extractors = [import_from_path(path.lower())
+                      for path in args['--extractor']]
 
     run(dump_files, extractors)
 
diff --git a/setup.py b/setup.py
@@ -1,8 +1,10 @@
 import os
 
 from setuptools import find_packages, setup
+
 import mwcites
 
+
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
@@ -23,7 +25,8 @@ def requirements(fname):
     packages=find_packages(),
     entry_points = {
         'console_scripts': [
-            'mwcitations=mwcites.mwcitations:main'
+            'mwcitations=mwcites.mwcites:main',
+            'mwcites=mwcites.mwcites:main'
         ],
     },
     long_description = read('README.rst'),
diff --git a/utility b/utility
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+from mwcites import mwcites  # the package's command-line dispatcher module
+
+mwcites.main()  # reads the utility name from sys.argv and runs it