Skip to content

Commit 191a98b

Browse files
committed
Merged in changes for ISBN, fixed tests, and added 'isbn' to the list of all identifiers to extract.
1 parent 50196dd commit 191a98b

File tree

8 files changed

+91
-78
lines changed

8 files changed

+91
-78
lines changed

datasets/mw_dump_stub.xml

+2-2
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@
110110
reaction to popularity is causing its decline.
111111
American Behavioral Scientist,
112112
0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana
113-
{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
113+
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
114114
http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
115115
10.2387/234310.2347/39423</text>
116116
<sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -136,7 +136,7 @@
136136
0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
137137
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
138138
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
139-
{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
139+
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
140140
http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
141141
10.2387/234310.2347/39423</text>
142142
<sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>

mwcites/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
from .identifier import Identifier

mwcites/extractors/isbn.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,4 @@
55

66
def extract(text):
77
for match in ISBN_RE.finditer(text):
8-
yield Identifier('isbn', match.group(1).replace('-', ''))
8+
yield Identifier('isbn', match.group(1).replace('-', ''))

mwcites/extractors/tests/test_doi.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,8 @@
22

33
from nose.tools import eq_
44

5-
from mwcites.extractors import doi
6-
from mwcites.identifier import Identifier
5+
from .. import doi
6+
from ...identifier import Identifier
77

88
INPUT_TEXT = """
99
This is a doi randomly placed in the text 10.0000/m1
@@ -58,7 +58,7 @@ def test_extract_island():
5858
pprint.pprint(ids)
5959
pprint.pprint(EXPECTED)
6060
eq_(ids, EXPECTED)
61-
61+
6262
def test_extract_search():
6363
ids = list(doi.extract_search(INPUT_TEXT))
6464
pprint.pprint(ids)

mwcites/extractors/tests/test_isbn.py

+20-19
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,8 @@
11
import pprint
22
from nose.tools import eq_
33

4-
from mwcites.extractors import isbn
5-
from mwcites.identifier import Identifier
6-
7-
EXPECTED = [
8-
Identifier('isbn', '0124366031'),
9-
Identifier('isbn', '3540206310'),
10-
Identifier('isbn', '0618343423'),
11-
Identifier('isbn', '9780140276664'),
12-
Identifier('isbn', '0130540919'),
13-
Identifier('isbn', '0195305736'),
14-
Identifier('isbn', '9783704651129'),
15-
Identifier('isbn', '3886807525'),
16-
Identifier('isbn', '3720527735'),
17-
Identifier('isbn', '9783894726652'),
18-
Identifier('isbn', '3519264242'),
19-
Identifier('isbn', '9783642017346'),
20-
Identifier('isbn', '0130540919'),
21-
]
4+
from .. import isbn
5+
from ...identifier import Identifier
226

237
INPUT_TEXT = """
248
| publisher=Academic Press | isbn=0124366031
@@ -36,8 +20,25 @@
3620
* {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}}
3721
"""
3822

23+
24+
EXPECTED = [
25+
Identifier('isbn', '0124366031'),
26+
Identifier('isbn', '3540206310'),
27+
Identifier('isbn', '0618343423'),
28+
Identifier('isbn', '9780140276664'),
29+
Identifier('isbn', '0130540919'),
30+
Identifier('isbn', '0195305736'),
31+
Identifier('isbn', '9783704651129'),
32+
Identifier('isbn', '3886807525'),
33+
Identifier('isbn', '3720527735'),
34+
Identifier('isbn', '9783894726652'),
35+
Identifier('isbn', '3519264242'),
36+
Identifier('isbn', '9783642017346'),
37+
Identifier('isbn', '0130540919'),
38+
]
39+
3940
def test_extract():
4041
ids = list(isbn.extract(INPUT_TEXT))
4142
pprint.pprint(ids)
4243
pprint.pprint(EXPECTED)
43-
eq_(ids, EXPECTED)
44+
eq_(ids, EXPECTED)

mwcites/extractors/tests/test_pubmed.py

+9-9
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
from nose.tools import eq_
22

3-
from mwcites.extractors import pubmed
4-
3+
from .. import pubmed
4+
from ...identifier import Identifier
55

66
def test_extract():
7-
7+
88
text = """
99
This is some text with a template cite. {{cite|...|...|pmid=1}}.
1010
This is some text with a template cite. {{cite|...|...|pmid = 2|...}}.
@@ -15,12 +15,12 @@ def test_extract():
1515
"""
1616
ids = list(pubmed.extract(text))
1717
expected = [
18-
('pmid', "1"),
19-
('pmid', "2"),
20-
('pmc', "3"),
21-
('pmc', "4"),
22-
('pmid', "5"),
23-
('pmc', "6")
18+
Identifier('pmid', "1"),
19+
Identifier('pmid', "2"),
20+
Identifier('pmc', "3"),
21+
Identifier('pmc', "4"),
22+
Identifier('pmid', "5"),
23+
Identifier('pmc', "6")
2424
]
2525
print(ids)
2626
print(expected)

mwcites/utilities/extract.py

+23-22
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
88
* PubMed
99
* DOI
10-
10+
* ISBN
11+
1112
Outputs a TSV file with the following fields:
1213
1314
* page_id: The identifier of the Wikipedia article (int), e.g. 1325125
@@ -46,21 +47,21 @@
4647
def main(argv=None):
4748
args = docopt.docopt(__doc__, argv=argv)
4849
dump_files = args['<dump_file>']
49-
50+
5051
if args['--extractor'] == ['<all>']:
5152
extractors = ALL_EXTRACTORS
5253
else:
5354
extractors = [import_from_path(path) for path in args['--extractor']]
54-
55+
5556
run(dump_files, extractors)
5657

5758
def run(dump_files, extractors):
58-
59+
5960
print("\t".join(HEADERS))
60-
61+
6162
cites = extract(dump_files, extractors=extractors)
6263
for page_id, title, rev_id, timestamp, type, id in cites:
63-
64+
6465
print("\t".join(tsv_encode(v) for v in (page_id,
6566
title,
6667
rev_id,
@@ -71,17 +72,17 @@ def run(dump_files, extractors):
7172
def extract(dump_files, extractors=ALL_EXTRACTORS):
7273
"""
7374
Extracts cites from a set of `dump_files`.
74-
75+
7576
:Parameters:
7677
dump_files : str | `file`
7778
A set of files MediaWiki XML dump files
7879
(expects: pages-meta-history)
7980
extractors : `list`(`extractor`)
8081
A list of extractors to apply to the text
81-
82+
8283
:Returns:
8384
`iterable` -- a generator of extracted cites
84-
85+
8586
"""
8687
# Dump processor function
8788
def process_dump(dump, path):
@@ -90,48 +91,48 @@ def process_dump(dump, path):
9091
else:
9192
for cite in extract_cite_history(page, extractors):
9293
yield cite
93-
94+
9495
# Map call
9596
return xml_dump.map(dump_files, process_dump)
9697

9798
def extract_cite_history(page, extractors):
9899
"""
99100
Extracts cites from the history of a `page` (`mw.xml_dump.Page`).
100-
101+
101102
:Parameters:
102103
page : `iterable`(`mw.xml_dump.Revision`)
103104
The page to extract cites from
104105
extractors : `list`(`extractor`)
105106
A list of extractors to apply to the text
106-
107+
107108
:Returns:
108109
`iterable` -- a generator of extracted cites
109-
110+
110111
"""
111112
appearances = {} # For tracking the first appearance of an ID
112113
ids = set() # For holding onto the ids in the last revision.
113114
for revision in page:
114115
ids = set(extract_ids(revision.text, extractors))
115-
116+
116117
# For each ID, check to see if we have seen it before
117118
for id in ids:
118119
if id not in appearances:
119120
appearances[id] = (revision.id, revision.timestamp)
120-
121+
121122
for id in ids: #For the ids in the last version of the page
122123
rev_id, timestamp = appearances[id]
123124
yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
124125

125126
def extract_ids(text, extractors):
126127
"""
127128
Uses `extractors` to extract citation identifiers from a text.
128-
129+
129130
:Parameters:
130131
text : str
131132
The text to process
132133
extractors : `list`(`extractor`)
133134
A list of extractors to apply to the text
134-
135+
135136
:Returns:
136137
`iterable` -- a generator of extracted identifiers
137138
"""
@@ -142,12 +143,12 @@ def extract_ids(text, extractors):
142143
def import_from_path(path):
143144
"""
144145
Imports a specific attribute from a module based on a class path.
145-
146+
146147
:Parameters:
147148
path : str
148149
A dot delimited string representing the import path of the desired
149150
object.
150-
151+
151152
:Returns:
152153
object -- An imported object
153154
"""
@@ -166,13 +167,13 @@ def tsv_encode(val, none_string="NULL"):
166167
"""
167168
Encodes a value for inclusion in a TSV. Basically, it converts the value
168169
to a string and escapes TABs and linebreaks.
169-
170+
170171
:Parameters:
171172
val : `mixed`
172173
The value to encode
173174
none_string : str
174175
The string to use when `None` is encountered
175-
176+
176177
:Returns:
177178
str -- a string representing the encoded value
178179
"""
@@ -181,5 +182,5 @@ def tsv_encode(val, none_string="NULL"):
181182
else:
182183
if isinstance(val, bytes):
183184
val = str(val, 'utf-8')
184-
185+
185186
return str(val).replace("\t", "\\t").replace("\n", "\\n")

mwcites/utilities/tests/test_extract.py

+32-22
Original file line number | Diff line number | Diff line change
@@ -3,29 +3,39 @@
33
from mw import Timestamp
44
from nose.tools import eq_
55

6-
from ..extract_cites import process_page
6+
from ..extract import extract_cite_history
7+
from ...identifier import Identifier
78

89

9-
def process_page():
10+
def test_extract_cite_history():
1011
FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
11-
12+
1213
FakeExtractor = namedtuple("Extractor", ['extract'])
13-
14-
fake_page = [
15-
FakeRevision(1, Timestamp(1), "id1 id2"),
16-
FakeRevision(2, Timestamp(2), "id1 id3"),
17-
FakeRevision(3, Timestamp(3), "id1 id2 id3"),
18-
FakeRevision(4, Timestamp(4), "id1 id2 id4"),
19-
FakeRevision(5, Timestamp(5), "id1 id2 id4"),
20-
]
21-
fake_page.id = 1
22-
fake_page.title = "Title"
23-
24-
extractor = FakeExtractor(lambda t: ('fake', id) for id in t.split(" "))
25-
26-
cites = list(process_page(fake_page, [extractor]))
27-
28-
eq_(cites,
29-
[(1, "Title", 1, Timestamp(1), "fake", "id1"),
30-
(1, "Title", 1, Timestamp(1), "fake", "id2"),
31-
(1, "Title", 4, Timestamp(4), "fake", "id4")])
14+
15+
class FakePage:
16+
def __init__(self, id, title):
17+
self.id = id
18+
self.title = title
19+
def __iter__(self):
20+
return iter([
21+
FakeRevision(1, Timestamp(1), "id1 id2"),
22+
FakeRevision(2, Timestamp(2), "id1 id3"),
23+
FakeRevision(3, Timestamp(3), "id1 id2 id3"),
24+
FakeRevision(4, Timestamp(4), "id1 id2 id4"),
25+
FakeRevision(5, Timestamp(5), "id1 id2 id4"),
26+
])
27+
28+
fake_page = FakePage(1, "Title")
29+
30+
def extract(text):
31+
return (Identifier('fake', id) for id in text.split(" "))
32+
extractor = FakeExtractor(extract)
33+
34+
expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
35+
(1, "Title", 1, Timestamp(1), "fake", "id2"),
36+
(1, "Title", 4, Timestamp(4), "fake", "id4")]
37+
38+
citations = list(extract_cite_history(fake_page, [extractor]))
39+
eq_(len(citations), len(expected))
40+
for cite in extract_cite_history(fake_page, [extractor]):
41+
assert cite in expected

0 commit comments

Comments (0)