7
7
8
8
* PubMed
9
9
* DOI
10
-
10
+ * ISBN
11
+
11
12
Outputs a TSV file with the following fields:
12
13
13
14
* page_id: The identifier of the Wikipedia article (int), e.g. 1325125
46
47
def main(argv=None):
    """
    Command-line entry point: parses the arguments, resolves the requested
    extractors and hands everything to `run`.

    :Parameters:
        argv : `list`(str)
            The arguments to parse.  Forwarded unchanged to
            `docopt.docopt` (including the default of `None`).
    """
    options = docopt.docopt(__doc__, argv=argv)
    dump_files = options['<dump_file>']
    requested = options['--extractor']

    # The literal ['<all>'] selects every known extractor; any other value is
    # treated as a list of import paths to resolve one by one.
    if requested != ['<all>']:
        extractors = [import_from_path(path) for path in requested]
    else:
        extractors = ALL_EXTRACTORS

    run(dump_files, extractors)
56
57
57
58
def run (dump_files , extractors ):
58
-
59
+
59
60
print ("\t " .join (HEADERS ))
60
-
61
+
61
62
cites = extract (dump_files , extractors = extractors )
62
63
for page_id , title , rev_id , timestamp , type , id in cites :
63
-
64
+
64
65
print ("\t " .join (tsv_encode (v ) for v in (page_id ,
65
66
title ,
66
67
rev_id ,
@@ -71,17 +72,17 @@ def run(dump_files, extractors):
71
72
def extract (dump_files , extractors = ALL_EXTRACTORS ):
72
73
"""
73
74
Extracts cites from a set of `dump_files`.
74
-
75
+
75
76
:Parameters:
76
77
dump_files : str | `file`
77
78
A set of MediaWiki XML dump files
78
79
(expects: pages-meta-history)
79
80
extractors : `list`(`extractor`)
80
81
A list of extractors to apply to the text
81
-
82
+
82
83
:Returns:
83
84
`iterable` -- a generator of extracted cites
84
-
85
+
85
86
"""
86
87
# Dump processor function
87
88
def process_dump (dump , path ):
@@ -90,48 +91,48 @@ def process_dump(dump, path):
90
91
else :
91
92
for cite in extract_cite_history (page , extractors ):
92
93
yield cite
93
-
94
+
94
95
# Map call
95
96
return xml_dump .map (dump_files , process_dump )
96
97
97
98
def extract_cite_history(page, extractors):
    """
    Walks the revisions of a `page` and reports, for every identifier found
    in the last revision iterated, the revision in which that identifier was
    first seen.

    :Parameters:
        page : `iterable`(`mw.xml_dump.Revision`)
            The page whose revisions will be scanned.  Its `id` and `title`
            attributes are copied into every generated row.
        extractors : `list`(`extractor`)
            A list of extractors to apply to the text of each revision

    :Returns:
        `iterable` -- a generator of
        (page id, page title, revision id, timestamp, type, id) tuples
    """
    first_seen = {}  # identifier -> (revision id, timestamp) of first sighting
    current_ids = set()  # Identifiers found in the most recent revision read

    for revision in page:
        current_ids = set(extract_ids(revision.text, extractors))

        # Only the earliest sighting of an identifier is recorded; later
        # revisions never overwrite it.
        for identifier in current_ids:
            if identifier in first_seen:
                continue
            first_seen[identifier] = (revision.id, revision.timestamp)

    # After the loop, `current_ids` holds the identifiers of the last
    # revision (or nothing at all when the page had no revisions).
    for identifier in current_ids:
        first_rev_id, first_timestamp = first_seen[identifier]
        yield (page.id, page.title, first_rev_id, first_timestamp,
               identifier.type, identifier.id)
124
125
125
126
def extract_ids (text , extractors ):
126
127
"""
127
128
Uses `extractors` to extract citation identifiers from a text.
128
-
129
+
129
130
:Parameters:
130
131
text : str
131
132
The text to process
132
133
extractors : `list`(`extractor`)
133
134
A list of extractors to apply to the text
134
-
135
+
135
136
:Returns:
136
137
`iterable` -- a generator of extracted identifiers
137
138
"""
@@ -142,12 +143,12 @@ def extract_ids(text, extractors):
142
143
def import_from_path (path ):
143
144
"""
144
145
Imports a specific attribute from a module based on a class path.
145
-
146
+
146
147
:Parameters:
147
148
path : str
148
149
A dot delimited string representing the import path of the desired
149
150
object.
150
-
151
+
151
152
:Returns:
152
153
object -- An imported object
153
154
"""
@@ -166,13 +167,13 @@ def tsv_encode(val, none_string="NULL"):
166
167
"""
167
168
Encodes a value for inclusion in a TSV. Basically, it converts the value
168
169
to a string and escapes TABs and linebreaks.
169
-
170
+
170
171
:Parameters:
171
172
val : `mixed`
172
173
The value to encode
173
174
none_string : str
174
175
The string to use when `None` is encountered
175
-
176
+
176
177
:Returns:
177
178
str -- a string representing the encoded value
178
179
"""
@@ -181,5 +182,5 @@ def tsv_encode(val, none_string="NULL"):
181
182
else :
182
183
if isinstance (val , bytes ):
183
184
val = str (val , 'utf-8' )
184
-
185
+
185
186
return str (val ).replace ("\t " , "\\ t" ).replace ("\n " , "\\ n" )
0 commit comments