# -*- coding: utf-8 -*-
__author__ = 'klb3713'

import logging
import random
import copy

import numpy
import theano
from theano import tensor as T

import config
import vocabulary
from parameters_gpu import Parameters

logger = logging.getLogger(__name__)


class Model(object):
    """
    A window-based neural language model: embedded word windows are scored
    by one tanh hidden layer and a linear output, and trained with a
    pairwise ranking loss against corrupted (noise) windows.

    @type parameters: L{Parameters}
    """

    def __init__(self):
        vocabulary.load_vocabulary()
        self.parameters = Parameters()
        self.train_loss = 0
        self.train_err = 0
        self.train_lossnonzero = 0
        self.train_cnt = 0
        # 'c|py' linker with the 'fast_run' optimizer: prefer C
        # implementations of ops, falling back to Python where unavailable.
        self.COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run')
        self.train_function = self._get_train_function()

    def reset(self):
        self.train_loss = 0
        self.train_err = 0
        self.train_lossnonzero = 0
        self.train_cnt = 0

    def __getstate__(self):
        return (self.parameters,
                self.train_loss,
                self.train_err,
                self.train_lossnonzero,
                self.train_cnt)

    def __setstate__(self, state):
        (self.parameters,
         self.train_loss,
         self.train_err,
         self.train_lossnonzero,
         self.train_cnt) = state
        # The compiled Theano function is not part of the pickled state;
        # rebuild it so an unpickled Model is immediately usable.
        self.COMPILE_MODE = theano.compile.Mode('c|py', 'fast_run')
        self.train_function = self._get_train_function()

    def _get_train_function(self):
        floatX = theano.config.floatX
        correct_inputs = T.matrix(dtype=floatX)
        noise_inputs = T.matrix(dtype=floatX)
        learning_rate = T.scalar(dtype=floatX)

        # Score the correct windows: one tanh hidden layer, then a linear
        # output layer.
        correct_prehidden = T.dot(correct_inputs, self.parameters.hidden_weights) + self.parameters.hidden_biases
        hidden = T.tanh(correct_prehidden)
        correct_score = T.dot(hidden, self.parameters.output_weights) + self.parameters.output_biases

        # Score the corrupted (noise) windows with the same parameters.
        noise_prehidden = T.dot(noise_inputs, self.parameters.hidden_weights) + self.parameters.hidden_biases
        hidden = T.tanh(noise_prehidden)
        noise_score = T.dot(hidden, self.parameters.output_weights) + self.parameters.output_biases

        # Pairwise ranking hinge loss: max(0, 1 - correct_score + noise_score).
        losses = T.clip(1 - correct_score + noise_score, 0, 1e999)
        total_loss = T.sum(losses)

        (dhidden_weights,
         dhidden_biases,
         doutput_weights,
         doutput_biases) = T.grad(total_loss,
                                  [self.parameters.hidden_weights,
                                   self.parameters.hidden_biases,
                                   self.parameters.output_weights,
                                   self.parameters.output_biases])
        # Gradients w.r.t. the inputs are returned to the caller, which uses
        # them to update the (non-symbolic) embedding table in train().
        dcorrect_inputs = T.grad(total_loss, correct_inputs)
        dnoise_inputs = T.grad(total_loss, noise_inputs)

        para_gpara = zip((self.parameters.hidden_weights,
                          self.parameters.hidden_biases,
                          self.parameters.output_weights,
                          self.parameters.output_biases),
                         (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases))
        updates = [(p, p - learning_rate * gp) for p, gp in para_gpara]

        logger.info("About to compile train function...")
        train_function = theano.function([correct_inputs, noise_inputs, learning_rate],
                                         [dcorrect_inputs, dnoise_inputs, total_loss, losses, correct_score, noise_score],
                                         mode=self.COMPILE_MODE,
                                         updates=updates)
        logger.info("Done compiling train function")
        return train_function
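
    # A rough sketch of the compiled function's contract (shapes assume the
    # minibatch layout produced by embeds() below):
    #   inputs:  correct and noise minibatches of shape
    #            (MINIBATCH, EMBSIZE*SEQLEN), plus a scalar learning rate
    #   outputs: gradients w.r.t. both inputs, the summed loss, per-example
    #            losses, and both score columns
    # The hidden/output parameters are updated in place via `updates`; only
    # the input gradients are handed back, for the embedding updates in
    # train().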

    def embeds(self, sequences):
        """
        Embed sequences of vocabulary IDs.

        Given a list of MINIBATCH lists of SEQLEN vocabulary IDs, return a
        matrix of shape (MINIBATCH, EMBSIZE*SEQLEN).
        """
        embs = []
        for sequence in sequences:
            seq = [self.parameters.embeddings[s] for s in sequence]
            embs.append(numpy.concatenate(seq))
        return numpy.vstack(embs)
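
    # For example (hypothetical sizes): with EMBEDDING_SIZE = 50 and
    # WINDOW_SIZE = 5, embeds([[3, 1, 4, 1, 5], [9, 2, 6, 5, 3]]) returns a
    # (2, 250) matrix, each row the concatenation of five 50-d embeddings.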

    def corrupt_examples(self, correct_sequences):
        """
        Produce one noise window per correct window by replacing the center
        word with a uniformly sampled vocabulary ID.
        """
        noise_sequences = []
        half_window = config.WINDOW_SIZE // 2
        for e in correct_sequences:
            noise_sequence = copy.copy(e)
            noise_sequence[half_window] = random.randint(0, self.parameters.vocab_size - 1)
            noise_sequences.append(noise_sequence)
        return noise_sequences
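
    # E.g. with WINDOW_SIZE = 5 the center index is 2, so a window like
    # [12, 7, 3, 9, 41] might become [12, 7, 1830, 9, 41] (the replacement
    # ID 1830 is made up; it is drawn uniformly from the vocabulary).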

    def train(self, correct_sequences):
        learning_rate = config.LEARNING_RATE
        embedding_learning_rate = config.EMBEDDING_LEARNING_RATE
        noise_sequences = self.corrupt_examples(correct_sequences)
        half_window = config.WINDOW_SIZE // 2
        # Each minibatch is trained for 100 iterations, resampling the noise
        # word at the end of every iteration.
        for _ in xrange(100):
            r = self.train_function(self.embeds(correct_sequences), self.embeds(noise_sequences), learning_rate)
            (dcorrect_inputss, dnoise_inputss, total_loss, losses, correct_scores, noise_scores) = r
            self.train_loss += total_loss
            self.train_err += (correct_scores <= noise_scores).sum()
            self.train_lossnonzero += (losses > 0).sum()

            # Apply the embedding gradients by hand: the embedding table is
            # a plain numpy array, not a shared Theano variable, so it is
            # not covered by the compiled function's updates.
            for index in range(len(correct_sequences)):
                correct_sequence = correct_sequences[index]
                noise_sequence = noise_sequences[index]
                dcorrect_inputs = dcorrect_inputss[index].reshape((config.WINDOW_SIZE, config.EMBEDDING_SIZE))
                dnoise_inputs = dnoise_inputss[index].reshape((config.WINDOW_SIZE, config.EMBEDDING_SIZE))
                for (i, di) in zip(correct_sequence, dcorrect_inputs):
                    self.parameters.embeddings[i] -= embedding_learning_rate * di
                for (i, di) in zip(noise_sequence, dnoise_inputs):
                    self.parameters.embeddings[i] -= embedding_learning_rate * di

            # Resample the noise word for the next iteration.
            for noise_sequence in noise_sequences:
                noise_sequence[half_window] = random.randint(0, self.parameters.vocab_size - 1)
        self.train_cnt += len(correct_sequences)

    def save_word2vec_format(self, fname, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the
        original C word2vec tool, for compatibility.
        """
        logger.info("storing %sx%s projection weights into %s" %
                    (self.parameters.vocab_size, self.parameters.embedding_size, fname))
        with open(fname, 'wb') as fout:
            fout.write("%s %s\n" % self.parameters.embeddings.shape)
            # Store in sorted order: most frequent words at the top.
            for word, count in sorted(vocabulary.words, key=lambda item: -item[1]):
                index = vocabulary.id(word)
                row = self.parameters.embeddings[index]
                if binary:
                    # The C tool stores raw float32 bytes after the word.
                    fout.write("%s %s\n" % (word, row.astype(numpy.float32).tostring()))
                else:
                    fout.write("%s %s\n" % (word, ' '.join("%f" % val for val in row)))