# MRGenerateContexts.py
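# Generates word-context co-occurrence data for a fixed set of head words from a
# Wikipedia (or Gigaword) dump, distributed over the ufo map/reduce framework.
# Phase 1 counts context occurrences; Phase 2 emits per-head-word context features
# in an LDA-style 'word<TAB>context:count<TAB>...' format.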
# import psyco
import sys
import logging
import codecs
from heapq import heappop  # used by the reducers below (may also come in via ufo's wildcard import)
from bz2 import BZ2File
from collections import defaultdict
from ufo import *
from sim_utils import parse_lda_entry

ClientRegistry.Port = 62580  # port to connect to the mothership
Mapper.Port = 62581  # port for the RPC server on the child

# Output level
logger.setLevel(logging.INFO)
MinDocLength = 100
MinContextOccurrenceThreshold = 0  # used by Phase 1 (output filename and reducer threshold)
# MinPerWordUniqueContextThreshold = 0
# HeadWordsFile = 'wikipedia-1000w.headwords50.trimmed'
# HeadWordsFile = 'wikipedia-headwords-17k.txt'
# HeadWordsFile = 'wordsim-353.heads.bz2'
HeadWordsFile = 'contextVectorWords.cut50.txt.bz2'
# HeadWordsFile = 'evocation.heads.bz2'
# HeadWordsFile = 'turk.heads'
# HeadWordsFile = 'usim.heads'
# SourceDataType = 'gigaword'
# TermFreqFile = 'gigaword-term-doc-freq-5v.txt.bz2'
SourceDataType = 'wikipedia-strict'
TermFreqFile = 'wikipedia-term-doc-freq-5v.txt.bz2'
MinDocFreq = 5
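# TermFreqFile format, as parsed in MyMapper.initialize() below: one bz2-compressed,
# tab-separated record per line, i.e. word<TAB>term_frequency<TAB>document_frequency.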
# Choosing 'combined' saves space during the mapper step and produces one set of
# features per head word, rather than one set per occurrence context.
OutputType = 'combined'
sys.stderr.write('combining outputs\n')
# OutputType = 'occurrence'
assert OutputType in ['combined', 'occurrence']
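# Illustrative output shapes (values are made up; see the map/reduce code below):
#   combined:   one line per head word, e.g.         bank<TAB>river:12<TAB>money:34
#   occurrence: one line per head-word occurrence, e.g.  bank<TAB>river:1<TAB>loan:1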
# ContextCountsFile = 'context-count-wikipedia-1000w-min10-1lr-wikipedia-1000w.headwords50.trimmed.txt'
ContextCountsFile = None
# StopWordsFile = 'empty.txt.bz2'
StopWordsFile = 'stopwords2.txt.bz2'
# StopWordsFile = 'stopwords.txt.bz2'
# ContextSize = 25
ContextSize = 5
ContextType = 'uni'  # 'uni': bag of words within ContextSize on either side of the head word
# ContextType = 'raw'  # 'raw': the literal window of words around the head word
# ContextSize = 1
# ContextType = 'lr'  # 'lr': a single left_<>_right string context
assert ContextType in ['uni', 'lr', 'raw']
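# Illustrative contexts for the head word 'brown' in "the quick brown fox jumps"
# with ContextSize = 2 (matching the map() logic below; 'uni' counts are also
# subject to the MinDocFreq filter):
#   'lr'  -> the_quick_<>_fox_jumps
#   'uni' -> {'the': 1, 'quick': 1, 'fox': 1, 'jumps': 1}
#   'raw' -> the quick ###brown### fox jumps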
Phase = 2  # Phase 1 counts context occurrences; Phase 2 emits per-head-word context features
BannedArticleTypes = ['Image:', 'Wikipedia:', 'Template:', 'Category:']
if Phase == 1:
    BZ2ShardedMothership.OutputFile = 'context-count-wikipedia-%dw-min%s-%d%s-%s.txt.bz2' % \
        (MinDocLength, MinContextOccurrenceThreshold, ContextSize, ContextType, HeadWordsFile)
else:
    BZ2ShardedMothership.OutputFile = '%s-%dw-min%df-%s-%d%s-%s-contexts.txt.bz2' % \
        (SourceDataType, MinDocLength, MinDocFreq, OutputType, ContextSize, ContextType, HeadWordsFile)
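# Under the current settings this expands to:
# wikipedia-strict-100w-min5f-combined-5uni-contextVectorWords.cut50.txt.bz2-contexts.txt.bz2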
class MyMapper(Mapper):
    def initialize(self, args):
        """
        Read in the set of head words.
        """
        self.head_words = set()
        logger.info('Reading in head words...')
        reader = codecs.getreader('utf8')(BZ2File(HeadWordsFile))
        for line in reader:
            word = line.strip().split('\t')[0]
            self.head_words.add(word)
        reader.close()

        self.stop_words = set()
        logger.info('Reading in stop words...')
        reader = codecs.getreader('utf8')(BZ2File(StopWordsFile))
        for line in reader:
            word = line.replace('\n', '')
            if word not in self.head_words:
                self.stop_words.add(word)
        reader.close()
        if Phase == 2:
            self.good_contexts = set()
            if ContextCountsFile:
                logger.info('Reading in good contexts...')
                reader = codecs.getreader('utf8')(open(ContextCountsFile))
                for line in reader:
                    context = line.split('\t')[0]
                    self.good_contexts.add(context)
                reader.close()

            # Read in the term frequency information
            logger.info('Reading in term freq...')
            self.unigram_term_freq = defaultdict(int)
            self.unigram_doc_freq = defaultdict(int)
            reader = codecs.getreader('utf8')(BZ2File(TermFreqFile))
            for line in reader:
                (word, tf, df) = line.strip().split('\t')
                self.unigram_term_freq[word] = int(tf)
                self.unigram_doc_freq[word] = int(df)
            reader.close()
    def map(self, token):
        # Import here to hide the dependency from ungoliant
        from utils import get_document_iterator
        logger.info('Mapping token [%r]' % token)
        combined_counts = defaultdict(lambda: defaultdict(int))
        for (doc_count, (current_title, document, _)) in get_document_iterator(SourceDataType, token):
            words = document.replace('<CR>', ' ').split()
            if len(words) > MinDocLength:
                words = filter(lambda x: x not in self.stop_words, words)
                for (i, w) in enumerate(words):
                    if w in self.head_words:
                        try:
                            if ContextType == 'lr':
                                # one string context: left words, a '<>' separator, right words
                                context = '_'.join(words[max(0, i - ContextSize):i] + ['<>'] +
                                                   words[i + 1:min(i + 1 + ContextSize, len(words))])
                            elif ContextType == 'uni':
                                # bag of words within ContextSize on either side, filtered by doc freq
                                buffer = defaultdict(int)
                                for ww in [x for x in words[max(0, i - ContextSize):i]
                                           if self.unigram_doc_freq[x] >= MinDocFreq]:
                                    buffer[ww] += 1
                                for ww in [x for x in words[i + 1:min(i + ContextSize + 1, len(words))]
                                           if self.unigram_doc_freq[x] >= MinDocFreq]:
                                    buffer[ww] += 1
                            elif ContextType == 'raw':
                                # the literal window, with the head word marked
                                buffer = words[max(0, i - ContextSize):i]
                                buffer.append('###%s###' % words[i])
                                buffer.extend(words[i + 1:min(i + 1 + ContextSize, len(words))])
                            if Phase == 1:
                                # Phase 1 expects a single string context, i.e. ContextType == 'lr'
                                self.output('%s\t%s' % (context, w))
                            elif Phase == 2:
                                # filtering against good_contexts is not implemented in this path
                                assert not self.good_contexts
                                if OutputType == 'combined':
                                    assert ContextType != 'raw'
                                    for k, v in buffer.iteritems():
                                        combined_counts[w][intern(k.encode('ascii'))] += v
                                elif OutputType == 'occurrence':
                                    if ContextType == 'raw':
                                        context = u' '.join([k.decode('utf8') for k in buffer])
                                        self.output(u'%s\t%s' % (w, context))
                                    else:
                                        context = u'\t'.join([u'%s:%d' % (k.decode('utf8'), v)
                                                              for k, v in buffer.iteritems()])
                                        self.output(u'%s\t%s' % (w, context))
                        except UnicodeEncodeError:
                            sys.stderr.write('FAILED\n')
            if doc_count % 100 == 0:
                logger.info('Processed %d documents' % doc_count)
        # Do intermediate combining for space efficiency
        if OutputType == 'combined':
            for w, contexts in combined_counts.iteritems():
                context = u'\t'.join([u'%s:%d' % (k, v) for k, v in contexts.iteritems()])
                self.output(u'%s\t%s' % (w, context))
        # Return success
        return True
    def reduce(self, data_heap):
        """
        Sum-reducer: for all occurrences of a word/context pair, accumulate a count.
        This is where thresholding can be applied.
        """
        if Phase == 1:
            self.phase_one_reduce(data_heap)
        elif Phase == 2:
            self.phase_two_reduce(data_heap)
    def phase_one_reduce(self, data_heap):
        current = None
        occurrences = set()
        while data_heap:
            (context, _, word) = heappop(data_heap).partition('\t')
            if current and context != current:
                if len(occurrences) >= MinContextOccurrenceThreshold:
                    self.output('%s\t%s' % (current, len(occurrences)))
                occurrences = set()
            current = context
            occurrences.add(word)
        # Flush the final context group
        if current and len(occurrences) >= MinContextOccurrenceThreshold:
            self.output('%s\t%s' % (current, len(occurrences)))
    def phase_two_reduce(self, data_heap):
        """
        Collects word/context pairs and outputs them in the LDA format.
        """
        current = None
        occurrences = defaultdict(int)
        while data_heap:
            if OutputType == 'occurrence':
                # occurrence records are already one per line; pass them through
                self.output(heappop(data_heap))
            else:
                tokens = heappop(data_heap).strip().split('\t')
                word, contexts = tokens[0], map(parse_lda_entry, tokens[1:])
                if current and word != current:
                    self.output('%s\t%s' % (current, '\t'.join(['%s:%d' % (k, v)
                                                                for (k, v) in occurrences.iteritems()])))
                    occurrences = defaultdict(int)
                current = word
                for w, c in contexts:
                    occurrences[w] += c
        # Flush the final word group (a no-op in 'occurrence' mode, where current stays None)
        if current:
            self.output('%s\t%s' % (current, '\t'.join(['%s:%d' % (k, v) for (k, v) in occurrences.iteritems()])))
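    # Note: parse_lda_entry (imported from sim_utils) is assumed to parse a single
    # 'context:count' token into a (context, count) pair, based on its use above.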
UFOMapper = MyMapper
UFOMothership = BZ2ShardedMothership
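# The ufo framework appears to look for these UFOMapper / UFOMothership aliases;
# they are also passed to start_ufo explicitly below.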
if __name__ == '__main__':
    # psyco.full()
    start_ufo(UFOMapper, UFOMothership)