
Commit 4b5e6f6

Merge branch 'ahagenbruch-master'

2 parents d771fb0 + 191a98b

File tree: 11 files changed, +133 -59 lines changed

.gitignore (+3)

@@ -55,3 +55,6 @@ coverage.xml
 
 # Sphinx documentation
 docs/_build/
+
+# Pycharm directories
+.idea

README.rst (+6 -2)

@@ -1,7 +1,10 @@
-Extract academic citaitons from Wikipedia
+Extract academic citations from Wikipedia
 =========================================
 This project contains a utility for extracting academic citation identifiers.
 
+**NOTE:** As one of its dependencies (`Mediawiki-Utilities <https://github.com/halfak/Mediawiki-Utilities>`_) requires
+Python 3 so does mwcites.
+
 ``pip install mwcites``
 
 Usage
@@ -23,10 +26,11 @@ Documentation is provided ``$ mwcitations extract -h``.
 articles by processing a pages-meta-history XML dump and matching regular
 expressions to revision content.
 
-Currently supported identifies include:
+Currently supported identifiers include:
 
 * PubMed
 * DOI
+* ISBN
 
 Outputs a TSV file with the following fields:
 

datasets/mw_dump_stub.xml (+2 -2)

@@ -110,7 +110,7 @@
 reaction to popularity is causing its decline.
 American Behavioral Scientist,
 0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
-{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
+{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
 http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
 10.2387/234310.2347/39423</text>
 <sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -136,7 +136,7 @@
 0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
 [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
 [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
-{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
+{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
 http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
 10.2387/234310.2347/39423</text>
 <sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>

mwcites/__init__.py (+1)

@@ -0,0 +1 @@
+from .identifier import Identifier

mwcites/extractors/isbn.py (+8)

@@ -0,0 +1,8 @@
+import re
+from ..identifier import Identifier
+
+ISBN_RE = re.compile('isbn\s?=?\s?([0-9\-Xx]+)', re.I)
+
+def extract(text):
+    for match in ISBN_RE.finditer(text):
+        yield Identifier('isbn', match.group(1).replace('-', ''))
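
For orientation, a minimal sketch of the new extractor in use. It assumes mwcites is installed and importable, and that Identifier exposes the type and id attributes relied on by extract.py below; the sample text is illustrative.

    # A minimal sketch, assuming mwcites is importable.
    from mwcites.extractors import isbn

    sample = "| accessdate=2008-02-05 | isbn=0-618-34342-3"

    for identifier in isbn.extract(sample):
        # Hyphens are stripped before the Identifier is built,
        # so this prints: isbn 0618343423
        print(identifier.type, identifier.id)

Because both the "=" and the surrounding whitespace in ISBN_RE are optional, free-text mentions such as "ISBN 3-88680-752-5" are captured as well as "isbn=" template parameters. Writing the pattern as a raw string, r'isbn\s?=?\s?([0-9\-Xx]+)', would also be the more idiomatic way to spell the \s escapes.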

mwcites/extractors/tests/test_doi.py (+1 -1)

@@ -58,7 +58,7 @@ def test_extract_island():
     pprint.pprint(ids)
     pprint.pprint(EXPECTED)
     eq_(ids, EXPECTED)
-
+
 def test_extract_search():
     ids = list(doi.extract_search(INPUT_TEXT))
     pprint.pprint(ids)

mwcites/extractors/tests/test_isbn.py (+44)

@@ -0,0 +1,44 @@
+import pprint
+from nose.tools import eq_
+
+from .. import isbn
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+| publisher=Academic Press | isbn=0124366031
+| isbn=3540206310
+| accessdate=2008-02-05 | isbn=0-618-34342-3
+| isbn=978-0-140-27666-4
+| isbn = 0-13-054091-9
+| isbn=0195305736 }}&lt;/ref&gt; schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset
+| ISBN=978-3-7046-5112-9
+* Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5.
+* Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&amp;set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&amp;sid=f55727] Frankf. Rundschau 26. April 2006)
+* Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2.
+&lt;ref name=&quot;flos1&quot;&gt;{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8. | Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}&lt;/ref&gt;
+Bei einer [[Sprungtemperatur]] von 1,2&amp;nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].&lt;ref&gt;{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}&lt;/ref&gt;
+* {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}}
+"""
+
+
+EXPECTED = [
+    Identifier('isbn', '0124366031'),
+    Identifier('isbn', '3540206310'),
+    Identifier('isbn', '0618343423'),
+    Identifier('isbn', '9780140276664'),
+    Identifier('isbn', '0130540919'),
+    Identifier('isbn', '0195305736'),
+    Identifier('isbn', '9783704651129'),
+    Identifier('isbn', '3886807525'),
+    Identifier('isbn', '3720527735'),
+    Identifier('isbn', '9783894726652'),
+    Identifier('isbn', '3519264242'),
+    Identifier('isbn', '9783642017346'),
+    Identifier('isbn', '0130540919'),
+]
+
+def test_extract():
+    ids = list(isbn.extract(INPUT_TEXT))
+    pprint.pprint(ids)
+    pprint.pprint(EXPECTED)
+    eq_(ids, EXPECTED)

mwcites/extractors/tests/test_pubmed.py (+8 -8)

@@ -1,10 +1,10 @@
 from nose.tools import eq_
 
 from .. import pubmed
-
+from ...identifier import Identifier
 
 def test_extract():
-
+
     text = """
     This is some text with a template cite. {{cite|...|...|pmid=1}}.
     This is some text with a template cite. {{cite|...|...|pmid = 2|...}}.
@@ -15,12 +15,12 @@ def test_extract():
     """
     ids = list(pubmed.extract(text))
     expected = [
-        ('pmid', "1"),
-        ('pmid', "2"),
-        ('pmc', "3"),
-        ('pmc', "4"),
-        ('pmid', "5"),
-        ('pmc', "6")
+        Identifier('pmid', "1"),
+        Identifier('pmid', "2"),
+        Identifier('pmc', "3"),
+        Identifier('pmc', "4"),
+        Identifier('pmid', "5"),
+        Identifier('pmc', "6")
     ]
     print(ids)
     print(expected)

mwcites/utilities/extract.py (+25 -24)

@@ -7,7 +7,8 @@
 
 * PubMed
 * DOI
-
+* ISBN
+
 Outputs a TSV file with the following fields:
 
 * page_id: The identifier of the Wikipedia article (int), e.g. 1325125
@@ -37,30 +38,30 @@
 import docopt
 from mw import xml_dump
 
-from ..extractors import doi, pubmed
+from ..extractors import doi, pubmed, isbn
 
-ALL_EXTRACTORS = [doi, pubmed]
+ALL_EXTRACTORS = [doi, pubmed, isbn]
 
 HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
 
 def main(argv=None):
     args = docopt.docopt(__doc__, argv=argv)
     dump_files = args['<dump_file>']
-
+
     if args['--extractor'] == ['<all>']:
         extractors = ALL_EXTRACTORS
     else:
         extractors = [import_from_path(path) for path in args['--extractor']]
-
+
     run(dump_files, extractors)
 
 def run(dump_files, extractors):
-
+
     print("\t".join(HEADERS))
-
+
     cites = extract(dump_files, extractors=extractors)
     for page_id, title, rev_id, timestamp, type, id in cites:
-
+
         print("\t".join(tsv_encode(v) for v in (page_id,
                                                 title,
                                                 rev_id,
@@ -71,17 +72,17 @@ def run(dump_files, extractors):
 def extract(dump_files, extractors=ALL_EXTRACTORS):
     """
     Extracts cites from a set of `dump_files`.
-
+
     :Parameters:
         dump_files : str | `file`
             A set of files MediaWiki XML dump files
             (expects: pages-meta-history)
         extractors : `list`(`extractor`)
            A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted cites
-
+
     """
     # Dump processor function
     def process_dump(dump, path):
@@ -90,48 +91,48 @@ def process_dump(dump, path):
         else:
             for cite in extract_cite_history(page, extractors):
                 yield cite
-
+
     # Map call
     return xml_dump.map(dump_files, process_dump)
 
 def extract_cite_history(page, extractors):
     """
     Extracts cites from the history of a `page` (`mw.xml_dump.Page`).
-
+
     :Parameters:
         page : `iterable`(`mw.xml_dump.Revision`)
             The page to extract cites from
         extractors : `list`(`extractor`)
            A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted cites
-
+
     """
     appearances = {} # For tracking the first appearance of an ID
    ids = set() # For holding onto the ids in the last revision.
    for revision in page:
        ids = set(extract_ids(revision.text, extractors))
-
+
        # For each ID, check to see if we have seen it before
        for id in ids:
            if id not in appearances:
                appearances[id] = (revision.id, revision.timestamp)
-
+
    for id in ids: #For the ids in the last version of the page
        rev_id, timestamp = appearances[id]
        yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
 
 def extract_ids(text, extractors):
     """
     Uses `extractors` to extract citation identifiers from a text.
-
+
     :Parameters:
         text : str
             The text to process
         extractors : `list`(`extractor`)
             A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted identifiers
     """
@@ -142,12 +143,12 @@ def extract_ids(text, extractors):
 def import_from_path(path):
     """
     Imports a specific attribute from a module based on a class path.
-
+
     :Parameters:
         path : str
             A dot delimited string representing the import path of the desired
             object.
-
+
     :Returns:
         object -- An imported object
     """
@@ -166,13 +167,13 @@ def tsv_encode(val, none_string="NULL"):
     """
     Encodes a value for inclusion in a TSV. Basically, it converts the value
     to a string and escapes TABs and linebreaks.
-
+
     :Parameters:
         val : `mixed`
             The value to encode
         none_string : str
             The string to use when `None` is encountered
-
+
     :Returns:
         str -- a string representing the encoded value
     """
@@ -181,5 +182,5 @@ def tsv_encode(val, none_string="NULL"):
     else:
         if isinstance(val, bytes):
             val = str(val, 'utf-8')
-
+
         return str(val).replace("\t", "\\t").replace("\n", "\\n")
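
The interesting logic in this file is extract_cite_history: it walks every revision of a page, records the first revision in which each identifier appeared, and finally reports only the identifiers still present in the last revision. Below is a standalone sketch of that bookkeeping, with plain (rev_id, ids) pairs standing in for the mw.xml_dump types; the function name first_appearances is hypothetical.

    # A standalone sketch of extract_cite_history's bookkeeping,
    # using (rev_id, ids) pairs instead of mw.xml_dump revisions.
    def first_appearances(revisions):
        appearances = {}  # id -> rev_id where the id first appeared
        ids = set()
        for rev_id, ids in revisions:
            ids = set(ids)
            for id in ids:
                if id not in appearances:
                    appearances[id] = rev_id
        # Only ids present in the last revision are reported.
        return {id: appearances[id] for id in ids}

    history = [(1, ["id1", "id2"]),
               (2, ["id1", "id3"]),         # id2 removed here...
               (3, ["id1", "id2", "id3"]),  # ...and re-added: rev 1 still wins
               (4, ["id1", "id2", "id4"])]

    # id3 is absent from the last revision, so it is dropped entirely.
    assert first_appearances(history) == {"id1": 1, "id2": 1, "id4": 4}

This mirrors the expectations in test_extract.py below: an identifier that disappears and later returns keeps its original first-appearance revision, and identifiers missing from the final revision are not reported at all.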

mwcites/utilities/tests/test_extract.py (+32 -22)

@@ -3,29 +3,39 @@
 from mw import Timestamp
 from nose.tools import eq_
 
-from ..extract_cites import process_page
+from ..extract import extract_cite_history
+from ...identifier import Identifier
 
 
-def process_page():
+def test_extract_cite_history():
     FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
-
+
     FakeExtractor = namedtuple("Extractor", ['extract'])
-
-    fake_page = [
-        FakeRevision(1, Timestamp(1), "id1 id2"),
-        FakeRevision(2, Timestamp(2), "id1 id3"),
-        FakeRevision(3, Timestamp(3), "id1 id2 id3"),
-        FakeRevision(4, Timestamp(4), "id1 id2 id4"),
-        FakeRevision(5, Timestamp(5), "id1 id2 id4"),
-    ]
-    fake_page.id = 1
-    fake_page.title = "Title"
-
-    extractor = FakeExtractor(lambda t: ('fake', id) for id in t.split(" "))
-
-    cites = list(process_page(fake_page, [extractor]))
-
-    eq_(cites,
-        [(1, "Title", 1, Timestamp(1), "fake", "id1"),
-         (1, "Title", 1, Timestamp(1), "fake", "id2"),
-         (1, "Title", 4, Timestamp(4), "fake", "id4")])
+
+    class FakePage:
+        def __init__(self, id, title):
+            self.id = id
+            self.title = title
+        def __iter__(self):
+            return iter([
+                FakeRevision(1, Timestamp(1), "id1 id2"),
+                FakeRevision(2, Timestamp(2), "id1 id3"),
+                FakeRevision(3, Timestamp(3), "id1 id2 id3"),
+                FakeRevision(4, Timestamp(4), "id1 id2 id4"),
+                FakeRevision(5, Timestamp(5), "id1 id2 id4"),
+            ])
+
+    fake_page = FakePage(1, "Title")
+
+    def extract(text):
+        return (Identifier('fake', id) for id in text.split(" "))
+    extractor = FakeExtractor(extract)
+
+    expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
+                (1, "Title", 1, Timestamp(1), "fake", "id2"),
+                (1, "Title", 4, Timestamp(4), "fake", "id4")]
+
+    citations = list(extract_cite_history(fake_page, [extractor]))
+    eq_(len(citations), len(expected))
+    for cite in extract_cite_history(fake_page, [extractor]):
+        assert cite in expected

requirements.txt (+3)

@@ -1 +1,4 @@
 docopt
+more-itertools
+mwparserfromhell
+mediawiki-utilities
