Skip to content

Commit 191a98b

Browse files
committed
Merged in changes for ISBN, fixed tests, and added 'isbn' to the list of all identifiers to extract.
1 parent 50196dd commit 191a98b

File tree

8 files changed

+91
-78
lines changed

8 files changed

+91
-78
lines changed

datasets/mw_dump_stub.xml

+2-2
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@
110110
reaction to popularity is causing its decline.
111111
American Behavioral Scientist,
112112
0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana
113-
{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
113+
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
114114
http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
115115
10.2387/234310.2347/39423</text>
116116
<sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -136,7 +136,7 @@
136136
0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
137137
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
138138
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
139-
{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
139+
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
140140
http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
141141
10.2387/234310.2347/39423</text>
142142
<sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>

mwcites/__init__.py

+1
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
from .identifier import Identifier

mwcites/extractors/isbn.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,4 @@
55

66
def extract(text):
77
for match in ISBN_RE.finditer(text):
8-
yield Identifier('isbn', match.group(1).replace('-', ''))
8+
yield Identifier('isbn', match.group(1).replace('-', ''))

mwcites/extractors/tests/test_doi.py

+3-3
Original file line number | Diff line number | Diff line change
@@ -2,8 +2,8 @@
22

33
from nose.tools import eq_
44

5-
from mwcites.extractors import doi
6-
from mwcites.identifier import Identifier
5+
from .. import doi
6+
from ...identifier import Identifier
77

88
INPUT_TEXT = """
99
This is a doi randomly placed in the text 10.0000/m1
@@ -58,7 +58,7 @@ def test_extract_island():
5858
pprint.pprint(ids)
5959
pprint.pprint(EXPECTED)
6060
eq_(ids, EXPECTED)
61-
61+
6262
def test_extract_search():
6363
ids = list(doi.extract_search(INPUT_TEXT))
6464
pprint.pprint(ids)

mwcites/extractors/tests/test_isbn.py

+20-19
Original file line number | Diff line number | Diff line change
@@ -1,24 +1,8 @@
11
import pprint
22
from nose.tools import eq_
33

4-
from mwcites.extractors import isbn
5-
from mwcites.identifier import Identifier
6-
7-
EXPECTED = [
8-
Identifier('isbn', '0124366031'),
9-
Identifier('isbn', '3540206310'),
10-
Identifier('isbn', '0618343423'),
11-
Identifier('isbn', '9780140276664'),
12-
Identifier('isbn', '0130540919'),
13-
Identifier('isbn', '0195305736'),
14-
Identifier('isbn', '9783704651129'),
15-
Identifier('isbn', '3886807525'),
16-
Identifier('isbn', '3720527735'),
17-
Identifier('isbn', '9783894726652'),
18-
Identifier('isbn', '3519264242'),
19-
Identifier('isbn', '9783642017346'),
20-
Identifier('isbn', '0130540919'),
21-
]
4+
from .. import isbn
5+
from ...identifier import Identifier
226

237
INPUT_TEXT = """
248
| publisher=Academic Press | isbn=0124366031
@@ -36,8 +20,25 @@
3620
* {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}}
3721
"""
3822

23+
24+
EXPECTED = [
25+
Identifier('isbn', '0124366031'),
26+
Identifier('isbn', '3540206310'),
27+
Identifier('isbn', '0618343423'),
28+
Identifier('isbn', '9780140276664'),
29+
Identifier('isbn', '0130540919'),
30+
Identifier('isbn', '0195305736'),
31+
Identifier('isbn', '9783704651129'),
32+
Identifier('isbn', '3886807525'),
33+
Identifier('isbn', '3720527735'),
34+
Identifier('isbn', '9783894726652'),
35+
Identifier('isbn', '3519264242'),
36+
Identifier('isbn', '9783642017346'),
37+
Identifier('isbn', '0130540919'),
38+
]
39+
3940
def test_extract():
4041
ids = list(isbn.extract(INPUT_TEXT))
4142
pprint.pprint(ids)
4243
pprint.pprint(EXPECTED)
43-
eq_(ids, EXPECTED)
44+
eq_(ids, EXPECTED)

mwcites/extractors/tests/test_pubmed.py

+9-9
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,10 @@
11
from nose.tools import eq_
22

3-
from mwcites.extractors import pubmed
4-
3+
from .. import pubmed
4+
from ...identifier import Identifier
55

66
def test_extract():
7-
7+
88
text = """
99
This is some text with a template cite. {{cite|...|...|pmid=1}}.
1010
This is some text with a template cite. {{cite|...|...|pmid = 2|...}}.
@@ -15,12 +15,12 @@ def test_extract():
1515
"""
1616
ids = list(pubmed.extract(text))
1717
expected = [
18-
('pmid', "1"),
19-
('pmid', "2"),
20-
('pmc', "3"),
21-
('pmc', "4"),
22-
('pmid', "5"),
23-
('pmc', "6")
18+
Identifier('pmid', "1"),
19+
Identifier('pmid', "2"),
20+
Identifier('pmc', "3"),
21+
Identifier('pmc', "4"),
22+
Identifier('pmid', "5"),
23+
Identifier('pmc', "6")
2424
]
2525
print(ids)
2626
print(expected)

mwcites/utilities/extract.py

+23-22
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,8 @@
77
88
* PubMed
99
* DOI
10-
10+
* ISBN
11+
1112
Outputs a TSV file with the following fields:
1213
1314
* page_id: The identifier of the Wikipedia article (int), e.g. 1325125
@@ -46,21 +47,21 @@
4647
def main(argv=None):
4748
args = docopt.docopt(__doc__, argv=argv)
4849
dump_files = args['<dump_file>']
49-
50+
5051
if args['--extractor'] == ['<all>']:
5152
extractors = ALL_EXTRACTORS
5253
else:
5354
extractors = [import_from_path(path) for path in args['--extractor']]
54-
55+
5556
run(dump_files, extractors)
5657

5758
def run(dump_files, extractors):
58-
59+
5960
print("\t".join(HEADERS))
60-
61+
6162
cites = extract(dump_files, extractors=extractors)
6263
for page_id, title, rev_id, timestamp, type, id in cites:
63-
64+
6465
print("\t".join(tsv_encode(v) for v in (page_id,
6566
title,
6667
rev_id,
@@ -71,17 +72,17 @@ def run(dump_files, extractors):
7172
def extract(dump_files, extractors=ALL_EXTRACTORS):
7273
"""
7374
Extracts cites from a set of `dump_files`.
74-
75+
7576
:Parameters:
7677
dump_files : str | `file`
7778
A set of files MediaWiki XML dump files
7879
(expects: pages-meta-history)
7980
extractors : `list`(`extractor`)
8081
A list of extractors to apply to the text
81-
82+
8283
:Returns:
8384
`iterable` -- a generator of extracted cites
84-
85+
8586
"""
8687
# Dump processor function
8788
def process_dump(dump, path):
@@ -90,48 +91,48 @@ def process_dump(dump, path):
9091
else:
9192
for cite in extract_cite_history(page, extractors):
9293
yield cite
93-
94+
9495
# Map call
9596
return xml_dump.map(dump_files, process_dump)
9697

9798
def extract_cite_history(page, extractors):
9899
"""
99100
Extracts cites from the history of a `page` (`mw.xml_dump.Page`).
100-
101+
101102
:Parameters:
102103
page : `iterable`(`mw.xml_dump.Revision`)
103104
The page to extract cites from
104105
extractors : `list`(`extractor`)
105106
A list of extractors to apply to the text
106-
107+
107108
:Returns:
108109
`iterable` -- a generator of extracted cites
109-
110+
110111
"""
111112
appearances = {} # For tracking the first appearance of an ID
112113
ids = set() # For holding onto the ids in the last revision.
113114
for revision in page:
114115
ids = set(extract_ids(revision.text, extractors))
115-
116+
116117
# For each ID, check to see if we have seen it before
117118
for id in ids:
118119
if id not in appearances:
119120
appearances[id] = (revision.id, revision.timestamp)
120-
121+
121122
for id in ids: #For the ids in the last version of the page
122123
rev_id, timestamp = appearances[id]
123124
yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
124125

125126
def extract_ids(text, extractors):
126127
"""
127128
Uses `extractors` to extract citation identifiers from a text.
128-
129+
129130
:Parameters:
130131
text : str
131132
The text to process
132133
extractors : `list`(`extractor`)
133134
A list of extractors to apply to the text
134-
135+
135136
:Returns:
136137
`iterable` -- a generator of extracted identifiers
137138
"""
@@ -142,12 +143,12 @@ def extract_ids(text, extractors):
142143
def import_from_path(path):
143144
"""
144145
Imports a specific attribute from a module based on a class path.
145-
146+
146147
:Parameters:
147148
path : str
148149
A dot delimited string representing the import path of the desired
149150
object.
150-
151+
151152
:Returns:
152153
object -- An imported object
153154
"""
@@ -166,13 +167,13 @@ def tsv_encode(val, none_string="NULL"):
166167
"""
167168
Encodes a value for inclusion in a TSV. Basically, it converts the value
168169
to a string and escapes TABs and linebreaks.
169-
170+
170171
:Parameters:
171172
val : `mixed`
172173
The value to encode
173174
none_string : str
174175
The string to use when `None` is encountered
175-
176+
176177
:Returns:
177178
str -- a string representing the encoded value
178179
"""
@@ -181,5 +182,5 @@ def tsv_encode(val, none_string="NULL"):
181182
else:
182183
if isinstance(val, bytes):
183184
val = str(val, 'utf-8')
184-
185+
185186
return str(val).replace("\t", "\\t").replace("\n", "\\n")

mwcites/utilities/tests/test_extract.py

+32-22
Original file line number | Diff line number | Diff line change
@@ -3,29 +3,39 @@
33
from mw import Timestamp
44
from nose.tools import eq_
55

6-
from ..extract_cites import process_page
6+
from ..extract import extract_cite_history
7+
from ...identifier import Identifier
78

89

9-
def process_page():
10+
def test_extract_cite_history():
1011
FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
11-
12+
1213
FakeExtractor = namedtuple("Extractor", ['extract'])
13-
14-
fake_page = [
15-
FakeRevision(1, Timestamp(1), "id1 id2"),
16-
FakeRevision(2, Timestamp(2), "id1 id3"),
17-
FakeRevision(3, Timestamp(3), "id1 id2 id3"),
18-
FakeRevision(4, Timestamp(4), "id1 id2 id4"),
19-
FakeRevision(5, Timestamp(5), "id1 id2 id4"),
20-
]
21-
fake_page.id = 1
22-
fake_page.title = "Title"
23-
24-
extractor = FakeExtractor(lambda t: ('fake', id) for id in t.split(" "))
25-
26-
cites = list(process_page(fake_page, [extractor]))
27-
28-
eq_(cites,
29-
[(1, "Title", 1, Timestamp(1), "fake", "id1"),
30-
(1, "Title", 1, Timestamp(1), "fake", "id2"),
31-
(1, "Title", 4, Timestamp(4), "fake", "id4")])
14+
15+
class FakePage:
16+
def __init__(self, id, title):
17+
self.id = id
18+
self.title = title
19+
def __iter__(self):
20+
return iter([
21+
FakeRevision(1, Timestamp(1), "id1 id2"),
22+
FakeRevision(2, Timestamp(2), "id1 id3"),
23+
FakeRevision(3, Timestamp(3), "id1 id2 id3"),
24+
FakeRevision(4, Timestamp(4), "id1 id2 id4"),
25+
FakeRevision(5, Timestamp(5), "id1 id2 id4"),
26+
])
27+
28+
fake_page = FakePage(1, "Title")
29+
30+
def extract(text):
31+
return (Identifier('fake', id) for id in text.split(" "))
32+
extractor = FakeExtractor(extract)
33+
34+
expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
35+
(1, "Title", 1, Timestamp(1), "fake", "id2"),
36+
(1, "Title", 4, Timestamp(4), "fake", "id4")]
37+
38+
citations = list(extract_cite_history(fake_page, [extractor]))
39+
eq_(len(citations), len(expected))
40+
for cite in extract_cite_history(fake_page, [extractor]):
41+
assert cite in expected

0 commit comments

Comments (0)