Skip to content

Commit 3d2cb9b

Browse files
committed
Adds arxiv extractors with tests and refactors utility to be called 'mwcites'
1 parent 601e72b commit 3d2cb9b

File tree

10 files changed

+90
-22
lines changed

10 files changed

+90
-22
lines changed

README.rst

+5-4
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,17 @@ Documentation is provided ``$ mwcitations extract -h``.
3131
* PubMed
3232
* DOI
3333
* ISBN
34-
34+
* arXiv
35+
3536
Outputs a TSV file with the following fields:
3637

3738
* page_id: The identifier of the Wikipedia article (int), e.g. 1325125
3839
* page_title: The title of the Wikipedia article (utf-8), e.g. Club cell
3940
* rev_id: The Wikipedia revision where the citation was first added (int),
4041
e.g. 282470030
41-
* timestamp: The timestamp of the revision where the citation was first added.
42-
(ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
43-
* type: The type of identifier, e.g. pmid
42+
* timestamp: The timestamp of the revision where the citation was first
43+
added (ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
44+
* type: The type of identifier, e.g. pmid, pmcid, doi, isbn or arxiv
4445
* id: The id of the cited scholarly article (utf-8),
4546
e.g. 10.1183/09031936.00213411
4647

datasets/mw_dump_stub.xml

+3
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,8 @@
111111
American Behavioral Scientist,
112112
0002764212469365 doi: 10.1177/0002764212469365</ref>. Hats pants and banana
113113
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
114+
[http://arxiv.org/abs/0706.0004v1]
115+
[https://arxiv.org/abs/0706.0005v1]
114116
http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
115117
10.2387/234310.2347/39423</text>
116118
<sha1>pfjkfb1u54sksl4exkxge4f5v1mn7cl</sha1>
@@ -137,6 +139,7 @@
137139
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp]
138140
[http://dx.doi.org/10.1170/foo&lt;bar&gt;(herp)derp[waffles]]
139141
{{cite|...|doi=10.1098/rspb.2008.1131|isbn = 28-1298-2020|issue=1656}}
142+
[https://arxiv.org/abs/0706.0005v1]
140143
http://www.google.com/sky/#latitude=3.362&amp;longitude=160.1238441&amp;zoom=
141144
10.2387/234310.2347/39423</text>
142145
<sha1>pfjkfb1u54tnksksxkxgehhgv1mn7cl</sha1>

mwcitations

-4
This file was deleted.

mwcites/extractors/arxiv.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
import re
2+
3+
from ..identifier import Identifier
4+
5+
# Identifier formats are described at http://arxiv.org/help/arxiv_identifier
#
# Old scheme: an optional "archive/" or "archive.subject/" category followed
# by at least five digits, e.g. "math.GT/0309001", "hep-th/9901001" or
# "cs/0112017".  A stray leading "-" is tolerated but kept out of the captured
# id.  The "." is escaped so it only matches the literal archive/subject
# separator; hyphenated archives are spelled out in the character class
# instead of being matched by accident through a wildcard.
old_id = r"-?(?P<old_id>([a-z\-]+(\.[a-z\-]+)?/)?[0-9]{4}[0-9]+)"

# New scheme: four digits, a literal ".", then the sequence number.  The
# optional version suffix ("v1") is consumed but left out of the captured id.
new_id = r"(?P<new_id>[0-9]{4}\.[0-9]+)(v[0-9]+)?"

# Contexts that introduce an identifier: a template parameter ("arxiv = ..."),
# an arxiv.org URL (with or without "abs/") and the "arxiv:" label.
# Raw strings keep the regex escapes (\s, \.) out of Python's own string
# escape processing.
prefixes = [r"arxiv\s*=\s*", r"//arxiv\.org/(abs/)?", r"arxiv:\s?"]

ARXIV_RE = re.compile(r"({0})".format("|".join(prefixes)) +
                      r"({0}|{1})".format(old_id, new_id), re.I | re.U)


def extract(text):
    """
    Yield an arXiv ``Identifier`` for every arXiv reference found in `text`.

    Matches are produced in the order they occur in `text`.  The identifier
    value is lower-cased; for new-style ids the version suffix is not part of
    the value.

    :Parameters:
        text : str
            The text (e.g. wiki markup) to scan

    :Returns:
        A generator of ``Identifier("arxiv", <id>)``
    """
    for match in ARXIV_RE.finditer(text):
        # Exactly one of the two named groups participates in a match.
        arxiv_id = match.group("new_id") or match.group("old_id")
        yield Identifier("arxiv", arxiv_id.lower())

mwcites/extractors/doi.py

-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55

66
from ..identifier import Identifier
77

8-
98
DOI_START_RE = re.compile(r'10\.[0-9]{4,}/')
109

1110
HTML_TAGS = ['ref', 'span', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
import pprint
2+
3+
from nose.tools import eq_
4+
5+
from .. import arxiv
6+
from ...identifier import Identifier
7+
8+
INPUT_TEXT = """
9+
This is a doi randomly placed in the text 10.0000/m1
10+
Here's a typo that might be construed as a doi 10.60 people were there.
11+
{{cite|...|arxiv=0706.0001v1|pmid=10559875}}
12+
<ref>Halfaker, A., Geiger, R. S., Morgan, J. T., & Riedl, J. (2012).
13+
The rise and decline of an open collaboration system: How Wikipedia’s
14+
reaction to popularity is causing its decline.
15+
American Behavioral Scientist,
16+
0002764212469365 arxiv:0706.0002v1</ref>. Hats pants and banana
17+
[http://arxiv.org/0706.0003]
18+
[http://arxiv.org/abs/0706.0004v1]
19+
[https://arxiv.org/abs/0706.0005v1]
20+
[https://arxiv.org/abs/math.GT/0309001]
21+
[https://arxiv.org/abs/-math.gs/0309002]
22+
{{cite|...|arxiv=foobar.hats/0101003|issue=1656}}
23+
http://www.google.com/sky/#latitude=3.362&longitude=160.1238441&zoom=
24+
10.2387/234310.2347/39423
25+
<!--
26+
10.2387/234310.2347/39423-->
27+
"""
28+
EXPECTED = [
29+
Identifier('arxiv', "0706.0001"),
30+
Identifier('arxiv', "0706.0002"),
31+
Identifier('arxiv', "0706.0003"),
32+
Identifier('arxiv', "0706.0004"),
33+
Identifier('arxiv', "0706.0005"),
34+
Identifier('arxiv', "math.gt/0309001"),
35+
Identifier('arxiv', "math.gs/0309002"),
36+
Identifier('arxiv', "foobar.hats/0101003")
37+
]
38+
39+
def test_extract():
    """Check that arxiv.extract() yields exactly EXPECTED, in order, for INPUT_TEXT."""
    extracted = list(arxiv.extract(INPUT_TEXT))

    # Print both lists so a failing run shows what was found next to what
    # was wanted.
    for identifiers in (extracted, EXPECTED):
        pprint.pprint(identifiers)

    eq_(extracted, EXPECTED)

mwcites/mwcitations.py renamed to mwcites/mwcites.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
* extract -- Extracts citations from an XML database dump
88
99
Usage:
10-
mwcitations (-h | --help)
11-
mwcitations <utility> [-h | --help]
12-
10+
mwcites (-h | --help)
11+
mwcites <utility> [-h | --help]
12+
1313
Options:
1414
-h | --help Shows this documentation
1515
<utility> The name of the utility to run
@@ -22,12 +22,12 @@
2222

2323

2424
USAGE = """Usage:
25-
mwcitations (-h | --help)
26-
mwcitations <utility> [-h | --help]\n"""
25+
mwcites (-h | --help)
26+
mwcites <utility> [-h | --help]\n"""
2727

2828

2929
def main():
30-
30+
3131
if len(sys.argv) < 2:
3232
sys.stderr.write(USAGE)
3333
sys.exit(1)
@@ -37,13 +37,13 @@ def main():
3737
elif sys.argv[1][:1] == "-":
3838
sys.stderr.write(USAGE)
3939
sys.exit(1)
40-
40+
4141
module_name = sys.argv[1]
4242
try:
4343
module = import_module(".utilities." + module_name, package="mwcites")
4444
except ImportError:
4545
sys.stderr.write(traceback.format_exc())
4646
sys.stderr.write("Could not find utility {0}.\n".format(module_name))
4747
sys.exit(1)
48-
48+
4949
module.main(sys.argv[2:])

mwcites/utilities/extract.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
* PubMed
99
* DOI
1010
* ISBN
11+
* arXiv
1112
1213
Outputs a TSV file with the following fields:
1314
@@ -17,7 +18,7 @@
1718
e.g. 282470030
1819
* timestamp: The timestamp of the revision where the citation was first added.
1920
(ISO 8601 datetime), e.g. 2009-04-08T01:52:20Z
20-
* type: The type of identifier, e.g. pmid
21+
* type: The type of identifier, e.g. pmid, pmcid, doi, arxiv or isbn
2122
* id: The id of the cited scholarly article (utf-8),
2223
e.g. 10.1183/09031936.00213411
2324
@@ -38,9 +39,9 @@
3839
import docopt
3940
from mw import xml_dump
4041

41-
from ..extractors import doi, pubmed, isbn
42+
from ..extractors import arxiv, doi, isbn, pubmed
4243

43-
ALL_EXTRACTORS = [doi, pubmed, isbn]
44+
ALL_EXTRACTORS = [doi, pubmed, isbn, arxiv]
4445

4546
HEADERS = ("page_id", "page_title", "rev_id", "timestamp", "type", "id")
4647

@@ -51,7 +52,8 @@ def main(argv=None):
5152
if args['--extractor'] == ['<all>']:
5253
extractors = ALL_EXTRACTORS
5354
else:
54-
extractors = [import_from_path(path) for path in args['--extractor']]
55+
extractors = [import_from_path(path.lower())
56+
for path in args['--extractor']]
5557

5658
run(dump_files, extractors)
5759

setup.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import os
22

33
from setuptools import find_packages, setup
4+
45
import mwcites
56

7+
68
def read(fname):
79
return open(os.path.join(os.path.dirname(__file__), fname)).read()
810

@@ -23,7 +25,8 @@ def requirements(fname):
2325
packages=find_packages(),
2426
entry_points = {
2527
'console_scripts': [
26-
'mwcitations=mwcites.mwcitations:main'
28+
'mwcitations=mwcites.mwcites:main',
29+
'mwcites=mwcites.mwcites:main'
2730
],
2831
},
2932
long_description = read('README.rst'),

utility

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
#!/usr/bin/env python
# Launcher script: hands control straight to mwcites.mwcites.main().
from mwcites.mwcites import main

main()

0 commit comments

Comments
 (0)