scanData.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Copyright (C) 2010 Cagatay Calli <[email protected]>
Scans XML output (gum.xml) from Wikiprep, creates 3 tables:
TABLE: article COLUMNS: id INT, title VARBINARY(255)
TABLE: text COLUMNS: old_id INT, old_text MEDIUMBLOB
TABLE: pagelinks COLUMNS: source_id INT, target_id INT
USAGE: scanData.py <hgw.xml/gum.xml file from Wikiprep> --format=<Wikiprep dump format> [--stopcats=<stop category file>]
IMPORTANT: If you use XML output from a recent version of Wikiprep
(e.g. Zemanta fork), then set FORMAT to 'Zemanta-legacy' or 'Zemanta-modern'.
ABOUT STOP CATEGORY FILTERING:
Stop category filtering is not active in the default configuration. You can
provide an updated list of stop categories, derived from your Wikipedia dump,
with the --stopcats option.
e.g. scanData.py sample.gum.xml --stopcats=sampleCategoryList.txt
Cleaning up irrelevant articles is important in ESA, so providing such a file
is recommended.
'''
import sys
import re
import MySQLdb
import signal
from optparse import OptionParser
from subprocess import Popen, PIPE
import lxml.html as html
from lxml.etree import ParserError
import Stemmer
import xmlwikiprep
# Wikiprep dump format enum
# formats: 1) Gabrilovich 2) Zemanta-legacy 3) Zemanta-modern
F_GABRI = 0 # gabrilovich
F_ZLEGACY = 1 # zemanta legacy
F_ZMODERN = 2 # zemanta modern
usage = """
USAGE: scanData.py links_count.txt gum1.xml.gz gum2.xml.gz ... --format=<Wikiprep dump format> [--stopcats=<stop category file>]
Wikiprep dump formats:
1. Gabrilovich [gl, gabrilovich]
2. Zemanta legacy [zl, legacy, zemanta-legacy]
3. Zemanta modern [zm, modern, zemanta-modern]
'2005_wiki_stop_categories.txt' can be used for 2005 dump of Gabrilovich et al.
"""
parser = OptionParser(usage=usage)
parser.add_option("-s", "--stopcats", dest="stopcats", help="Path to stop categories file", metavar="STOPCATS")
parser.add_option("-f", "--format", dest="_format", help="Wikiprep dump format (g for Gabrilovich, zl for Zemanta-legacy,zm for Zemanta-modern)", metavar="FORMAT")
(options, args) = parser.parse_args()
if not args:
print usage
sys.exit()
if not options.stopcats:
print 'Stop category list is not provided. (You can provide this with --stopcats argument.)'
print 'Continuing without stop category filter...'
if not options._format:
print """
Wikiprep dump format not specified! Please select one from below with --format option:
Wikiprep dump formats:
1. Gabrilovich [gl, gabrilovich]
2. Zemanta legacy [zl, legacy, zemanta-legacy]
3. Zemanta modern [zm, modern, zemanta-modern]
"""
sys.exit()
if options._format in ['zm', 'zemanta-modern', 'Zemanta-modern', 'Zemanta-Modern', 'modern']:
FORMAT = F_ZMODERN
print '--> Using zemanta-modern format!'
elif options._format in ['gl', 'gabrilovich', 'Gabrilovich']:
FORMAT = F_GABRI
print '--> Using gabrilovich format!'
elif options._format in ['zl', 'zemanta-legacy', 'Zemanta-legacy', 'Zemanta-Legacy', 'legacy']:
FORMAT = F_ZLEGACY
print '--> Using zemanta-legacy format!'
else:
    print 'Unrecognized Wikiprep dump format:', options._format
    sys.exit(1)
# positional args: hgw.xml / gum.xml dump file(s) from Wikiprep
hgwpath = args[0]  # first dump file; used below to derive the .disambig path
TITLE_WEIGHT = 4
STOP_CATEGORY_FILTER = bool(options.stopcats)
# reToken = re.compile('[a-zA-Z\-]+')
reToken = re.compile("[^ \t\n\r`~!@#$%^&*()_=+|\[;\]\{\},./?<>:’'\\\\\"]+")
reAlpha = re.compile("^[a-zA-Z\-_]+$")
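# reToken splits text on whitespace and most punctuation; reAlpha then keeps only
# purely alphabetic tokens (letters, hyphens, underscores) as candidate features.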
NONSTOP_THRES = 100
STEMMER = Stemmer.Stemmer('porter')
# read stop word list from 'lewis_smart_sorted_uniq.txt'
wordList = []
try:
f = open('lewis_smart_sorted_uniq.txt', 'r')
for word in f.readlines():
wordList.append(word.strip())
f.close()
except:
print 'Stop words cannot be read! Please put "lewis_smart_sorted_uniq.txt" file containing stop words in this folder.'
sys.exit(1)
STOP_WORDS = frozenset(wordList)
if STOP_CATEGORY_FILTER:
# read list of stop categories from 'extended_stop_categories.txt'
catList = []
try:
f = open(options.stopcats, 'r')
for line in f.readlines():
strId = line.split('\t')[0]
if strId:
catList.append(int(strId))
f.close()
except:
print 'Stop categories cannot be read!'
sys.exit(1)
STOP_CATS = frozenset(catList)
# read disambiguation page IDs from the .disambig file (Gabrilovich and Zemanta-legacy formats)
disambigList = []
if FORMAT != F_ZMODERN:
disambigPath = hgwpath.replace('hgw.xml', 'disambig')
print disambigPath
try:
f = open(disambigPath, 'r')
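# skip what appears to be a three-line header, then read one page ID per line,
# ignoring consecutive duplicate entries for the same ID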
for i in range(3):
f.readline()
prevId = ''
for line in f.readlines():
if prevId and line.startswith(prevId):
continue
id = line.split('\t', 1)[0].strip()
disambigList.append(int(id))
prevId = id
f.close()
except:
print 'Disambig file cannot be read! Please check if a file with .disambig suffix exists in Wikiprep dump location'
sys.exit(1)
DISAMBIG_IDS = frozenset(disambigList)
try:
conn = MySQLdb.connect(host='localhost', user='root', passwd='123456', db='wiki', charset="utf8", use_unicode=True)
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
sys.exit(1)
try:
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS article")
cursor.execute("""
CREATE TABLE article
(
id INT(10),
title VARBINARY(255) NOT NULL,
PRIMARY KEY (id),
KEY title (title(32))
) DEFAULT CHARSET=binary
""")
cursor.execute("DROP TABLE IF EXISTS text")
cursor.execute("""
CREATE TABLE text
(
old_id INT(10) unsigned NOT NULL,
old_text MEDIUMBLOB NOT NULL,
PRIMARY KEY (old_id)
) DEFAULT CHARSET=binary MAX_ROWS=10000000 AVG_ROW_LENGTH=10240;
""")
except MySQLdb.Error, e:
print "Error %d: %s" % (e.args[0], e.args[1])
sys.exit(1)
## handler for SIGTERM ###
def signalHandler(signum, frame):
global conn, cursor
cursor.close()
conn.close()
sys.exit(1)
signal.signal(signal.SIGTERM, signalHandler)
#####
reOtherNamespace = re.compile("^(User|Wikipedia|File|MediaWiki|Template|Help|Category|Portal|Book|Talk|Special|Media|WP|User talk|Wikipedia talk|File talk|MediaWiki talk|Template talk|Help talk|Category talk|Portal talk):.+", re.DOTALL)
# category, disambig, stub pages are removed by flags
# regex as article filter (dates, lists, etc.)
'''
# slightly improved title filter, filtering all dates,lists etc.
re_strings = ['^(January|February|March|April|May|June|July|August|September|October|November|December) \d+$',
'^\d+((s|(st|nd|th) (century|millenium)))?( (AD|BC|AH|BH|AP|BP))?( in [^$]+)?$',
'.+\(disambiguation\)']
'''
# title filter of Gabrilovich et al. contains: * year_in ... * month_year * digit formats
re_strings = ['^(January|February|March|April|May|June|July|August|September|October|November|December) \d{4}$',
'^\d{4} in [^$]+?$',
'^\d+$']
piped_re = re.compile("|".join(re_strings), re.DOTALL | re.IGNORECASE)
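# titles matched (and therefore filtered) by this pattern include e.g.
# "April 2004" (month_year), "2004 in film" (year_in) and "1234" (all digits)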
# list filter
reList = re.compile('^List of .+', re.DOTALL | re.IGNORECASE)
###
articleBuffer = []  # buffered (id, title) rows, flushed every 200 articles (previously 100)
textBuffer = [] # same as articleBuffer, stores text
###
inlinkDict = {}
outlinkDict = {}
cursor.execute("SELECT i.target_id, i.inlink FROM inlinks i")
rows = cursor.fetchall()
for row in rows:
inlinkDict[row[0]] = row[1]
print '--> inlinkDict size:', len(inlinkDict)
cursor.execute("SELECT o.source_id, o.outlink FROM outlinks o")
rows = cursor.fetchall()
for row in rows:
outlinkDict[row[0]] = row[1]
print '--> outlinkDict size:', len(outlinkDict)
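# inlinkDict / outlinkDict map article ID -> number of incoming / outgoing links.
# The inlinks and outlinks tables are assumed to be populated by an earlier step of
# the pipeline; recordArticle() below drops articles with fewer than 5 of either.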
# log file for filtered/skipped concepts; entries look like:
# Filtered concept id=12 (Example Article) [minIncomingLinks]
log = open('log.txt', 'w')
# pageDoc: dict of attributes parsed from one <page>..content..</page> element
# (keys used below: _id, title, text, length, categories, disambig, category, image)
def recordArticle(pageDoc):
global articleBuffer, textBuffer, STEMMER
if FORMAT == F_ZMODERN and (pageDoc['disambig'] or pageDoc['category'] or pageDoc['image']):
return
# a simple check for content
if pageDoc['length'] < 10:
return
title = pageDoc['title']
_id = pageDoc['_id']
# only keep articles of Main namespace
if reOtherNamespace.match(title):
return
# skip disambig
if FORMAT != F_ZMODERN and _id in DISAMBIG_IDS:
return
# ** stop category filter **
if STOP_CATEGORY_FILTER:
cats = frozenset(pageDoc['categories'])
# filter article with no category or belonging to stop categories
if not cats or STOP_CATS.intersection(cats):
log.write('Filtered concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [stop category]\n')
return
# ******
# ** title filter **
if piped_re.match(title):
log.write('Filtered concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [regex]\n')
return
'''if reList.match(title):
log.write('Filtered concept id='+str(id)+' ('+ title +') [list]\n')
return'''
# ******
# ** inlink-outlink filter **
if _id not in inlinkDict or inlinkDict[_id] < 5:
log.write('Filtered concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [minIncomingLinks]\n')
return
if _id not in outlinkDict or outlinkDict[_id] < 5:
log.write('Filtered concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [minOutgoingLinks]\n')
return
# ******
text = pageDoc['text']
# convert HTML to plain text
t = html.fromstring(title)
ctitle = t.text_content()
ctext = ''
try:
t = html.fromstring(text)
ctext = t.text_content()
except ParserError:
log.write('Skipped concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [parse error]\n')
return
# filter articles with fewer than 100 -UNIQUE- non-stop words
cmerged = ctitle + ' \n ' + ctext
tokens = set()
wordCount = 0
for m in reToken.finditer(cmerged):
w = m.group()
if not w or len(w) <= 2 or not reAlpha.match(w):
continue
lword = w.lower()
if not lword in STOP_WORDS:
sword = STEMMER.stemWord(STEMMER.stemWord(STEMMER.stemWord(lword))) # 3xPorter
if not sword in tokens:
wordCount += 1
tokens.add(sword)
if wordCount == NONSTOP_THRES:
break
if wordCount < NONSTOP_THRES:
log.write('Filtered concept id=' + str(_id) + ' (' + title.encode('utf8', 'ignore') + ') [minNumFeaturesPerArticle]\n')
return
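# weight title terms by prepending the title TITLE_WEIGHT (4) times to the article body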
cadd = ''
for i in range(TITLE_WEIGHT):
cadd += ctitle + ' \n '
cadd += ctext
# write article info (id,title,text)
articleBuffer.append((_id, ctitle.encode('utf8')))
textBuffer.append((_id, cadd.encode('utf8')))
if len(articleBuffer) >= 200:
cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""", articleBuffer)
cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""", textBuffer)
articleBuffer = []
textBuffer = []
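# main loop: decompress each dump file on the fly (pigz must be installed; plain zcat
# also works, as in the commented-out line below) and stream the XML through
# xmlwikiprep.read(), recording one article per <page> document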
for fname in args:
print >>sys.stderr, " -> Processing file", fname
#f = Popen(['zcat', fname], stdout=PIPE) # much faster than python gzip
f = Popen(['pigz', '-d', '-c', fname], stdout=PIPE) # even faster
for doc in xmlwikiprep.read(f.stdout):
recordArticle(doc)
# f = open(hgwpath, 'r')
# for doc in xmlwikiprep.read(f):
# recordArticle(doc)
# f.close()
if articleBuffer:
cursor.executemany("""
INSERT INTO article (id,title)
VALUES (%s,%s)
""", articleBuffer)
cursor.executemany("""
INSERT INTO text (old_id,old_text)
VALUES (%s,%s)
""", textBuffer)
articleBuffer = []
textBuffer = []
#cursor.execute("DROP TABLE outlinks")
# remove links to articles that are filtered out
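# instead of deleting rows in place, links whose source and target both survive the
# article filter are copied into a temporary table, which then replaces pagelinks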
cursor.execute("DROP TABLE IF EXISTS tmppagelinks")
cursor.execute("CREATE TABLE tmppagelinks LIKE pagelinks")
cursor.execute("INSERT tmppagelinks SELECT * FROM pagelinks WHERE EXISTS (SELECT id FROM article WHERE id = target_id) AND EXISTS (SELECT id FROM article WHERE id = source_id)")
cursor.execute("DROP TABLE pagelinks")
cursor.execute("RENAME TABLE tmppagelinks TO pagelinks")
cursor.execute("SELECT COUNT(id) FROM article")
r = cursor.fetchone()
print "Articles: ", r[0]
# release DB resources
cursor.close()
conn.close()
log.close()
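
# Example invocation (file names are illustrative; assumes the 'wiki' MySQL database with
# inlinks, outlinks and pagelinks tables already populated, and pigz installed):
#   python scanData.py wiki.gum.xml.gz --format=zm --stopcats=extended_stop_categories.txt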