
Commit 4b5e6f6

Merge branch 'ahagenbruch-master'

2 parents d771fb0 + 191a98b

File tree: 11 files changed, +133 -59 lines changed

.gitignore (+3)

@@ -55,3 +55,6 @@ coverage.xml
 
 # Sphinx documentation
 docs/_build/
+
+# Pycharm directories
+.idea

README.rst (+6 -2)

@@ -1,7 +1,10 @@
-Extract academic citaitons from Wikipedia
+Extract academic citations from Wikipedia
 =========================================
 This project contains a utility for extracting academic citation identifiers.
 
+**NOTE:** As one of its dependencies (`Mediawiki-Utilities <https://github.com/halfak/Mediawiki-Utilities>`_) requires
+Python 3 so does mwcites.
+
 ``pip install mwcites``
 
 Usage
@@ -23,10 +26,11 @@ Documentation is provided ``$ mwcitations extract -h``.
 articles by processing a pages-meta-history XML dump and matching regular
 expressions to revision content.
 
-Currently supported identifies include:
+Currently supported identifiers include:
 
 * PubMed
 * DOI
+* ISBN
 
 Outputs a TSV file with the following fields:
 

datasets/mw_dump_stub.xml (+2 -2)

@@ -110,7 +110,7 @@
 reaction to popularity is causing its decline.
 American Behavioral Scientist,
 0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
-{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
+{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
 http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
 10.2387/234310.2347/39423</text>
 <sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -136,7 +136,7 @@
 0002764212469365 doi: 10.1177/0002764212469365&lt;/ref&gt;. Hats pants and banana
 [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
 [http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
-{{cite|...|doi=10.1098/rspb.2008.1131|issue=1656}}
+{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
 http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
 10.2387/234310.2347/39423</text>
 <sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>

mwcites/__init__.py (+1)

@@ -0,0 +1 @@
+from .identifier import Identifier

mwcites/extractors/isbn.py (+8)

@@ -0,0 +1,8 @@
+import re
+from ..identifier import Identifier
+
+ISBN_RE = re.compile('isbn\s?=?\s?([0-9\-Xx]+)', re.I)
+
+def extract(text):
+    for match in ISBN_RE.finditer(text):
+        yield Identifier('isbn', match.group(1).replace('-', ''))
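
For orientation, a minimal sketch of the new extractor in use. It assumes mwcites is installed and importable, and that Identifier exposes the type and id attributes relied on by extract.py below; the sample text is illustrative.

    # A minimal sketch, assuming mwcites is importable.
    from mwcites.extractors import isbn

    sample = "| accessdate=2008-02-05 | isbn=0-618-34342-3"

    for identifier in isbn.extract(sample):
        # Hyphens are stripped before the Identifier is built,
        # so this prints: isbn 0618343423
        print(identifier.type, identifier.id)

Because both the "=" and the surrounding whitespace in ISBN_RE are optional, free-text mentions such as "ISBN 3-88680-752-5" are captured as well as "isbn=" template parameters. Writing the pattern as a raw string, r'isbn\s?=?\s?([0-9\-Xx]+)', would also be the more idiomatic way to spell the \s escapes.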

mwcites/extractors/tests/test_doi.py (+1 -1)

@@ -58,7 +58,7 @@ def test_extract_island():
     pprint.pprint(ids)
     pprint.pprint(EXPECTED)
     eq_(ids, EXPECTED)
-
+
 def test_extract_search():
     ids = list(doi.extract_search(INPUT_TEXT))
     pprint.pprint(ids)

mwcites/extractors/tests/test_isbn.py (+44)

@@ -0,0 +1,44 @@
+import pprint
+from nose.tools import eq_
+
+from .. import isbn
+from ...identifier import Identifier
+
+INPUT_TEXT = """
+| publisher=Academic Press | isbn=0124366031
+| isbn=3540206310
+| accessdate=2008-02-05 | isbn=0-618-34342-3
+| isbn=978-0-140-27666-4
+| isbn = 0-13-054091-9
+| isbn=0195305736 }}&lt;/ref&gt; schlug [[Irving Langmuir]] 1919 vor, dass das Elektronen in einem Atom verbunden oder verklumpt seien. Elektronengruppen beset
+| ISBN=978-3-7046-5112-9
+* Peter L. Bergen: ''Heiliger Krieg, Inc.: Osama bin Ladens Terrornetz''. Siedler, Berlin 2001, ISBN 3-88680-752-5.
+* Marwan Abou-Taam, Ruth Bigalke (Hgg) ''Die Reden des Osama bin Laden''. Diederichs, München 2006, ISBN 3-72052-773-5. (Reden und Ansprachen des b.L. im Original - ''Rezensionen: '' [http://www.sicherheit-heute.de/index.php?cccpage=readpolitik&amp;set_z_artikel=221 ]und [http://www.fr-online.de/in_und_ausland/kultur_und_medien/buecher/?em_cnt=868715&amp;sid=f55727] Frankf. Rundschau 26. April 2006)
+* Michael Pekler, Andreas Ungerböck: ''Ang Lee und seine Filme''. Schüren, Marburg 2009, ISBN 978-3-89472-665-2.
+&lt;ref name=&quot;flos1&quot;&gt;{{Literatur | Autor = René Flosdorff, Günther Hilgarth | Titel = Elektrische Energieverteilung | Verlag = Teubner | Auflage = 8. | Jahr = 2003 | Kapitel = Kapitel 1.2.2.4 | ISBN = 3-519-26424-2 }}&lt;/ref&gt;
+Bei einer [[Sprungtemperatur]] von 1,2&amp;nbsp;K wird reines Aluminium [[Supraleiter|supraleitend]].&lt;ref&gt;{{Literatur | Autor = Ilschner | first = Bernhard | Titel = Werkstoffwissenschaften und Fertigungstechnik Eigenschaften, Vorgänge, Technologien | Verlag = Springer | Ort = Berlin | Jahr = 2010 | ISBN = 978-3-642-01734-6 | Seiten = 277}}&lt;/ref&gt;
+* {{Literatur | Autor=Michael J. Padilla, Ioannis Miaoulis, Martha Cyr | Jahr = 2002 | Titel = Prentice Hall Science Explorer: Chemical Building Blocks | Verlag = Prentice-Hall, Inc. | Ort = Upper Saddle River, New Jersey USA | ISBN = 0-13-054091-9 | |Originalsprache=en}}
+"""
+
+
+EXPECTED = [
+    Identifier('isbn', '0124366031'),
+    Identifier('isbn', '3540206310'),
+    Identifier('isbn', '0618343423'),
+    Identifier('isbn', '9780140276664'),
+    Identifier('isbn', '0130540919'),
+    Identifier('isbn', '0195305736'),
+    Identifier('isbn', '9783704651129'),
+    Identifier('isbn', '3886807525'),
+    Identifier('isbn', '3720527735'),
+    Identifier('isbn', '9783894726652'),
+    Identifier('isbn', '3519264242'),
+    Identifier('isbn', '9783642017346'),
+    Identifier('isbn', '0130540919'),
+]
+
+def test_extract():
+    ids = list(isbn.extract(INPUT_TEXT))
+    pprint.pprint(ids)
+    pprint.pprint(EXPECTED)
+    eq_(ids, EXPECTED)

mwcites/extractors/tests/test_pubmed.py (+8 -8)

@@ -1,10 +1,10 @@
 from nose.tools import eq_
 
 from .. import pubmed
-
+from ...identifier import Identifier
 
 def test_extract():
-
+
     text = """
     This is some text with a template cite. {{cite|...|...|pmid=1}}.
     This is some text with a template cite. {{cite|...|...|pmid = 2|...}}.
@@ -15,12 +15,12 @@ def test_extract():
     """
     ids = list(pubmed.extract(text))
     expected = [
-        ('pmid', "1"),
-        ('pmid', "2"),
-        ('pmc', "3"),
-        ('pmc', "4"),
-        ('pmid', "5"),
-        ('pmc', "6")
+        Identifier('pmid', "1"),
+        Identifier('pmid', "2"),
+        Identifier('pmc', "3"),
+        Identifier('pmc', "4"),
+        Identifier('pmid', "5"),
+        Identifier('pmc', "6")
     ]
     print(ids)
     print(expected)

mwcites/utilities/extract.py (+25 -24)

@@ -7,7 +7,8 @@
 
 * PubMed
 * DOI
-
+* ISBN
+
 Outputs a TSV file with the following fields:
 
 * page_id: The identifier of the Wikipedia article (int), e.g. 1325125
@@ -37,30 +38,30 @@
 import docopt
 from mw import xml_dump
 
-from ..extractors import doi, pubmed
+from ..extractors import doi, pubmed, isbn
 
-ALL_EXTRACTORS = [doi, pubmed]
+ALL_EXTRACTORS = [doi, pubmed, isbn]
 
 HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
 
 def main(argv=None):
     args = docopt.docopt(__doc__, argv=argv)
     dump_files = args['<dump_file>']
-
+
     if args['--extractor'] == ['<all>']:
         extractors = ALL_EXTRACTORS
     else:
         extractors = [import_from_path(path) for path in args['--extractor']]
-
+
     run(dump_files, extractors)
 
 def run(dump_files, extractors):
-
+
     print("\t".join(HEADERS))
-
+
     cites = extract(dump_files, extractors=extractors)
     for page_id, title, rev_id, timestamp, type, id in cites:
-
+
         print("\t".join(tsv_encode(v) for v in (page_id,
                                                 title,
                                                 rev_id,
@@ -71,17 +72,17 @@ def run(dump_files, extractors):
 def extract(dump_files, extractors=ALL_EXTRACTORS):
     """
     Extracts cites from a set of `dump_files`.
-
+
     :Parameters:
         dump_files : str | `file`
             A set of files MediaWiki XML dump files
             (expects: pages-meta-history)
         extractors : `list`(`extractor`)
            A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted cites
-
+
     """
     # Dump processor function
     def process_dump(dump, path):
@@ -90,48 +91,48 @@ def process_dump(dump, path):
         else:
             for cite in extract_cite_history(page, extractors):
                 yield cite
-
+
     # Map call
     return xml_dump.map(dump_files, process_dump)
 
 def extract_cite_history(page, extractors):
     """
     Extracts cites from the history of a `page` (`mw.xml_dump.Page`).
-
+
     :Parameters:
         page : `iterable`(`mw.xml_dump.Revision`)
             The page to extract cites from
         extractors : `list`(`extractor`)
            A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted cites
-
+
     """
     appearances = {} # For tracking the first appearance of an ID
    ids = set() # For holding onto the ids in the last revision.
    for revision in page:
        ids = set(extract_ids(revision.text, extractors))
-
+
        # For each ID, check to see if we have seen it before
        for id in ids:
            if id not in appearances:
                appearances[id] = (revision.id, revision.timestamp)
-
+
    for id in ids: #For the ids in the last version of the page
        rev_id, timestamp = appearances[id]
        yield (page.id, page.title, rev_id, timestamp, id.type, id.id)
 
 def extract_ids(text, extractors):
     """
     Uses `extractors` to extract citation identifiers from a text.
-
+
     :Parameters:
         text : str
             The text to process
         extractors : `list`(`extractor`)
             A list of extractors to apply to the text
-
+
     :Returns:
         `iterable` -- a generator of extracted identifiers
     """
@@ -142,12 +143,12 @@ def extract_ids(text, extractors):
 def import_from_path(path):
     """
     Imports a specific attribute from a module based on a class path.
-
+
     :Parameters:
         path : str
             A dot delimited string representing the import path of the desired
             object.
-
+
     :Returns:
         object -- An imported object
     """
@@ -166,13 +167,13 @@ def tsv_encode(val, none_string="NULL"):
     """
     Encodes a value for inclusion in a TSV. Basically, it converts the value
     to a string and escapes TABs and linebreaks.
-
+
     :Parameters:
         val : `mixed`
             The value to encode
         none_string : str
             The string to use when `None` is encountered
-
+
     :Returns:
         str -- a string representing the encoded value
     """
@@ -181,5 +182,5 @@ def tsv_encode(val, none_string="NULL"):
     else:
         if isinstance(val, bytes):
             val = str(val, 'utf-8')
-
+
         return str(val).replace("\t", "\\t").replace("\n", "\\n")
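
The interesting logic in this file is extract_cite_history: it walks every revision of a page, records the first revision in which each identifier appeared, and finally reports only the identifiers still present in the last revision. Below is a standalone sketch of that bookkeeping, with plain (rev_id, ids) pairs standing in for the mw.xml_dump types; the function name first_appearances is hypothetical.

    # A standalone sketch of extract_cite_history's bookkeeping,
    # using (rev_id, ids) pairs instead of mw.xml_dump revisions.
    def first_appearances(revisions):
        appearances = {}  # id -> rev_id where the id first appeared
        ids = set()
        for rev_id, ids in revisions:
            ids = set(ids)
            for id in ids:
                if id not in appearances:
                    appearances[id] = rev_id
        # Only ids present in the last revision are reported.
        return {id: appearances[id] for id in ids}

    history = [(1, ["id1", "id2"]),
               (2, ["id1", "id3"]),         # id2 removed here...
               (3, ["id1", "id2", "id3"]),  # ...and re-added: rev 1 still wins
               (4, ["id1", "id2", "id4"])]

    # id3 is absent from the last revision, so it is dropped entirely.
    assert first_appearances(history) == {"id1": 1, "id2": 1, "id4": 4}

This mirrors the expectations in test_extract.py below: an identifier that disappears and later returns keeps its original first-appearance revision, and identifiers missing from the final revision are not reported at all.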

mwcites/utilities/tests/test_extract.py (+32 -22)

@@ -3,29 +3,39 @@
 from mw import Timestamp
 from nose.tools import eq_
 
-from ..extract_cites import process_page
+from ..extract import extract_cite_history
+from ...identifier import Identifier
 
 
-def process_page():
+def test_extract_cite_history():
     FakeRevision = namedtuple("Revision", ['id', 'timestamp', 'text'])
-
+
     FakeExtractor = namedtuple("Extractor", ['extract'])
-
-    fake_page = [
-        FakeRevision(1, Timestamp(1), "id1 id2"),
-        FakeRevision(2, Timestamp(2), "id1 id3"),
-        FakeRevision(3, Timestamp(3), "id1 id2 id3"),
-        FakeRevision(4, Timestamp(4), "id1 id2 id4"),
-        FakeRevision(5, Timestamp(5), "id1 id2 id4"),
-    ]
-    fake_page.id = 1
-    fake_page.title = "Title"
-
-    extractor = FakeExtractor(lambda t: ('fake', id) for id in t.split(" "))
-
-    cites = list(process_page(fake_page, [extractor]))
-
-    eq_(cites,
-        [(1, "Title", 1, Timestamp(1), "fake", "id1"),
-         (1, "Title", 1, Timestamp(1), "fake", "id2"),
-         (1, "Title", 4, Timestamp(4), "fake", "id4")])
+
+    class FakePage:
+        def __init__(self, id, title):
+            self.id = id
+            self.title = title
+        def __iter__(self):
+            return iter([
+                FakeRevision(1, Timestamp(1), "id1 id2"),
+                FakeRevision(2, Timestamp(2), "id1 id3"),
+                FakeRevision(3, Timestamp(3), "id1 id2 id3"),
+                FakeRevision(4, Timestamp(4), "id1 id2 id4"),
+                FakeRevision(5, Timestamp(5), "id1 id2 id4"),
+            ])
+
+    fake_page = FakePage(1, "Title")
+
+    def extract(text):
+        return (Identifier('fake', id) for id in text.split(" "))
+    extractor = FakeExtractor(extract)
+
+    expected = [(1, "Title", 1, Timestamp(1), "fake", "id1"),
+                (1, "Title", 1, Timestamp(1), "fake", "id2"),
+                (1, "Title", 4, Timestamp(4), "fake", "id4")]
+
+    citations = list(extract_cite_history(fake_page, [extractor]))
+    eq_(len(citations), len(expected))
+    for cite in extract_cite_history(fake_page, [extractor]):
+        assert cite in expected

requirements.txt (+3)

@@ -1 +1,4 @@
 docopt
+more-itertools
+mwparserfromhell
+mediawiki-utilities
