
Commit 215909d

Switch over to BFGS code we wrote ourselves, instead of using scipy
1 parent b091f80 commit 215909d

File tree

3 files changed: +320 -15 lines

.gitignore
scripts/internal/bfgs.py
scripts/optimize_metaparameters.py


.gitignore

+3 -2

@@ -5,9 +5,10 @@
 !/src/*.*
 !/src/Makefile
 
-# compiled object files, and directories automatically created on Darwin OS
-# for debug symbols.
+# compiled object files and python files, and directories automatically created
+# on Darwin OS for debug symbols.
 *.o
+*.pyc
 *.dSYM
 
 # emacs saves

scripts/internal/bfgs.py

+298
@@ -0,0 +1,298 @@
#!/usr/bin/env python

from __future__ import print_function
import math, sys
import numpy as np
import numpy.linalg as la


"""
This is a version of BFGS specialized for the case where the function
is constrained to a particular convex region via a barrier function,
and where we can efficiently evaluate (via calling f_finite(x), which
returns bool) whether the function is finite at the given point.

x0                The value to start the optimization at.
f                 The function being minimized.  f(x) returns a pair (value, gradient).
f_finite          f_finite(x) returns true if f(x) would be finite, and false otherwise.
init_inv_hessian  This gives you a way to specify a "better guess" at the initial
                  inverse Hessian.
return value      Returns a 4-tuple (x, f(x), f'(x), inverse-hessian-approximation).

"""
def Bfgs(x0, f, f_finite, init_inv_hessian = None):
    b = __bfgs(x0, f, f_finite, init_inv_hessian)
    return b.Minimize()



class __bfgs:
    def __init__(self, x0, f, f_finite, init_inv_hessian = None,
                 gradient_tolerance = 0.0005, progress_tolerance = 1.0e-06,
                 progress_tolerance_num_iters = 3):
        self.c1 = 1.0e-04   # constant used in the line search
        self.c2 = 0.9       # constant used in the line search
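        # c1 and c2 parameterize the strong Wolfe conditions that the line
        # search enforces (eqs. 3.7a and 3.7b of Nocedal and Wright):
        #   f(x + alpha p) <= f(x) + c1 * alpha * (p . grad f(x))    [sufficient decrease]
        #   |p . grad f(x + alpha p)| <= c2 * |p . grad f(x)|        [curvature]
        # with 0 < c1 < c2 < 1.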
        assert len(x0.shape) == 1
        self.dim = x0.shape[0]
        self.f = f
        self.f_finite = f_finite
        self.gradient_tolerance = gradient_tolerance
        self.progress_tolerance = progress_tolerance
        assert progress_tolerance_num_iters >= 1
        self.progress_tolerance_num_iters = progress_tolerance_num_iters

        if not self.f_finite(x0):
            self.LogMessage("Function is not finite at initial point {0}".format(x0))
            sys.exit(1)

        # cached_evaluations will be a list of 3-tuples (x, function-value f(x),
        # function-derivative f'(x)).  It's written to and read from by the
        # function self.FunctionValueAndDerivative().
        self.cached_evaluations = [ ]

        self.x = [ x0 ]
        (value0, deriv0) = self.FunctionValueAndDerivative(x0)
        self.value = [ value0 ]
        self.deriv = [ deriv0 ]

        deriv_magnitude = math.sqrt(np.dot(deriv0, deriv0))
        self.LogMessage("On iteration 0, value is {0}, deriv-magnitude {1}".format(
            value0, deriv_magnitude))

        # note: self.inv_hessian is referred to as H_k in the Nocedal
        # and Wright textbook.
        if init_inv_hessian is None:
            self.inv_hessian = np.identity(self.dim)
        else:
            self.inv_hessian = init_inv_hessian

    def Minimize(self):
        while not self.Converged():
            self.Iterate()
        self.FinalDebugOutput()
        return (self.x[-1], self.value[-1], self.deriv[-1], self.inv_hessian)


    def FinalDebugOutput(self):
        # currently this does nothing.
        pass

    # This does one iteration of the BFGS update.
    def Iterate(self):
        self.p = - np.dot(self.inv_hessian, self.deriv[-1])
        alpha = self.LineSearch()
        if alpha is None:
            self.LogMessage("Restarting BFGS with unit Hessian since line search failed")
            self.inv_hessian = np.identity(self.dim)
            return
        cur_x = self.x[-1]
        next_x = cur_x + alpha * self.p
        (next_value, next_deriv) = self.FunctionValueAndDerivative(next_x)
        next_deriv_magnitude = math.sqrt(np.dot(next_deriv, next_deriv))
        self.LogMessage("On iteration {0}, value is {1}, deriv-magnitude {2}".format(
            len(self.x), next_value, next_deriv_magnitude))

        # obtain s_k = x_{k+1} - x_k and y_k = gradient_{k+1} - gradient_k;
        # see eq. 6.5 in Nocedal and Wright.
        self.x.append(next_x)
        self.value.append(next_value)
        self.deriv.append(next_deriv)
        s_k = alpha * self.p
        y_k = self.deriv[-1] - self.deriv[-2]
        ysdot = np.dot(s_k, y_k)
        if not ysdot > 0:
            self.LogMessage("Restarting BFGS with unit Hessian since curvature "
                            "condition failed [likely a bug in the optimization code]")
            self.inv_hessian = np.identity(self.dim)
            return
        rho_k = 1.0 / ysdot  # eq. 6.14 in Nocedal and Wright.
        # the next equation is eq. 6.17 in Nocedal and Wright.
        # we don't bother rearranging it for efficiency because the dimension is small.
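        # Written out, eq. 6.17 is
        #   H_{k+1} = (I - rho_k s_k y_k^T) H_k (I - rho_k y_k s_k^T) + rho_k s_k s_k^T,
        # where rho_k = 1 / (y_k^T s_k); note that these are matrix products.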
        I = np.identity(self.dim)
        self.inv_hessian = (np.dot(np.dot(I - np.outer(s_k, y_k) * rho_k, self.inv_hessian),
                                   I - np.outer(y_k, s_k) * rho_k)
                            + np.outer(s_k, s_k) * rho_k)
        # todo: maybe make the lines above more efficient.

    # The function LineSearch is to be called after you have set self.x and
    # self.p.  It returns an alpha value satisfying the strong Wolfe conditions,
    # or None if the line search failed.  It is Algorithm 3.5 of Nocedal and
    # Wright.
    def LineSearch(self):
        alpha_max = 1.0e+10
        alpha1 = self.GetDefaultAlpha()
        increase_factor = 2.0  # amount by which we increase alpha if
                               # needed... after the 1st time we make it 4.
        if alpha1 is None:
            self.LogMessage("Line search failed unexpectedly in making sure "
                            "f(x) is finite.")
            return None

        alpha = [ 0.0, alpha1 ]
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        phi = [ phi_0 ]
        phi_dash = [ phi_dash_0 ]

        if phi_dash_0 >= 0.0:
            self.LogMessage("Line search failed unexpectedly: not a descent "
                            "direction")

        while True:
            i = len(phi)
            alpha_i = alpha[-1]
            (phi_i, phi_dash_i) = self.FunctionValueAndDerivativeForAlpha(alpha_i)
            phi.append(phi_i)
            phi_dash.append(phi_dash_i)
            if (phi_i > phi_0 + self.c1 * alpha_i * phi_dash_0 or
                (i > 1 and phi_i > phi[-2])):
                return self.Zoom(alpha[-2], alpha_i)
            if abs(phi_dash_i) <= -self.c2 * phi_dash_0:
                self.LogMessage("Line search: accepting default alpha = {0}".format(alpha_i))
                return alpha_i
            if phi_dash_i >= 0:
                return self.Zoom(alpha_i, alpha[-2])

            # the algorithm says "choose alpha_{i+1} in (alpha_i, alpha_max)";
            # the rest of this block implements that.
            next_alpha = alpha_i * increase_factor
            increase_factor = 4.0  # after we double once, we get more aggressive.
            if next_alpha > alpha_max:
                # something went wrong if alpha needed to get this large;
                # most likely we'll restart BFGS.
                self.LogMessage("Line search failed unexpectedly, went "
                                "past the max.")
                return None
            # make sure the function is finite at the next alpha, if possible.
            # we don't need to worry about efficiency too much, as this check
            # for finiteness is very fast.
            while next_alpha > alpha_i * 1.2 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.9
            while next_alpha > alpha_i * 1.02 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.99
            self.LogMessage("Increasing alpha from {0} to {1} in line search".format(
                alpha_i, next_alpha))
            alpha.append(next_alpha)

    # This function, from Nocedal and Wright (alg. 3.6), is called from
    # LineSearch.  It returns an alpha value satisfying the strong Wolfe
    # conditions, or None if there was an error.
    def Zoom(self, alpha_lo, alpha_hi):
        # these function evaluations don't really happen; we use caching.
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        (phi_lo, phi_dash_lo) = self.FunctionValueAndDerivativeForAlpha(alpha_lo)
        (phi_hi, phi_dash_hi) = self.FunctionValueAndDerivativeForAlpha(alpha_hi)

        min_diff = 1.0e-10
        while True:
            if abs(alpha_lo - alpha_hi) < min_diff:
                self.LogMessage("Line search failed, interval is too small: [{0},{1}]".format(
                    alpha_lo, alpha_hi))
                return None

            # the algorithm says "interpolate (using quadratic, cubic or
            # bisection) to find a trial step length between alpha_lo and
            # alpha_hi".  We basically choose bisection, but because alpha_lo is
            # guaranteed to always have a "better" (lower) function value than
            # alpha_hi, we actually want to be a little bit closer to alpha_lo,
            # so we go one third of the distance from alpha_lo to alpha_hi.
            alpha_j = alpha_lo + 0.3333 * (alpha_hi - alpha_lo)
            (phi_j, phi_dash_j) = self.FunctionValueAndDerivativeForAlpha(alpha_j)
            if phi_j > phi_0 + self.c1 * alpha_j * phi_dash_0 or phi_j >= phi_lo:
                (alpha_hi, phi_hi, phi_dash_hi) = (alpha_j, phi_j, phi_dash_j)
            else:
                if abs(phi_dash_j) <= - self.c2 * phi_dash_0:
                    self.LogMessage("Acceptable alpha is {0}".format(alpha_j))
                    return alpha_j
                if phi_dash_j * (alpha_hi - alpha_lo) >= 0.0:
                    (alpha_hi, phi_hi, phi_dash_hi) = (alpha_lo, phi_lo, phi_dash_lo)
                (alpha_lo, phi_lo, phi_dash_lo) = (alpha_j, phi_j, phi_dash_j)


    # The function GetDefaultAlpha(), called from LineSearch(), is to be called
    # after you have set self.x and self.p.  It normally returns 1.0, but it
    # will reduce it by factors of 0.9 until the function evaluated at 2*alpha
    # is finite.  This is because, generally speaking, approaching the edge of
    # the barrier function too rapidly will lead to poor function values.  Note:
    # evaluating whether the function is finite is very efficient.
    # If the function was not finite even at a very tiny alpha, then something
    # probably went wrong; we'll restart BFGS in this case.
    def GetDefaultAlpha(self):
        min_alpha = 1.0e-10
        alpha = 1.0
        while alpha > min_alpha and not self.IsFiniteForAlpha(alpha * 2.0):
            alpha *= 0.9
        return alpha if alpha > min_alpha else None

    # this function, called from LineSearch(), returns true if the function is
    # finite at the given alpha value.
    def IsFiniteForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        return self.f_finite(x)
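
    # FunctionValueAndDerivativeForAlpha returns the pair (phi(alpha), phi'(alpha)),
    # where phi(alpha) = f(x_k + alpha * p) is the one-dimensional function the
    # line search works with; phi'(alpha) is its derivative p . grad f(x_k + alpha * p).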
    def FunctionValueAndDerivativeForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        (value, deriv) = self.FunctionValueAndDerivative(x)
        return (value, np.dot(self.p, deriv))

    def Converged(self):
        # we say that we're converged if either the gradient magnitude is below
        # the gradient tolerance, or the objective-function change per iteration,
        # amortized over the last few iterations, is below the progress tolerance.
        current_gradient = self.deriv[-1]
        gradient_magnitude = math.sqrt(np.dot(current_gradient, current_gradient))
        if gradient_magnitude < self.gradient_tolerance:
            self.LogMessage("BFGS converged on iteration {0} due to gradient magnitude {1} "
                            "less than gradient tolerance {2}".format(
                len(self.x), gradient_magnitude, self.gradient_tolerance))
            return True
        n = self.progress_tolerance_num_iters
        if len(self.x) > n:
            cur_value = self.value[-1]
            prev_value = self.value[-1 - n]
            # the following will be nonnegative.
            change_per_iter_amortized = (prev_value - cur_value) / n
            if change_per_iter_amortized < self.progress_tolerance:
                self.LogMessage("BFGS converged on iteration {0} due to objf-change per "
                                "iteration amortized over {1} iterations = {2} < "
                                "threshold = {3}.".format(
                    len(self.x), n, change_per_iter_amortized, self.progress_tolerance))
                return True
        return False

    # this returns the function value and derivative for x, as a tuple; it
    # does caching.
    def FunctionValueAndDerivative(self, x):
        for i in range(len(self.cached_evaluations)):
            if np.array_equal(x, self.cached_evaluations[i][0]):
                return (self.cached_evaluations[i][1],
                        self.cached_evaluations[i][2])
        # we didn't find it cached, so we need to actually evaluate the
        # function.  This is where it gets slow.
        (value, deriv) = self.f(x)
        self.cached_evaluations.append((x, value, deriv))
        return (value, deriv)

    def LogMessage(self, message):
        print(sys.argv[0] + ": " + message, file=sys.stderr)




def __TestFunction(x):
    dim = 15
    a = np.array(range(1, dim + 1))
    B = np.diag(range(5, dim + 5))

    # define a function f(x) = x . a + x^T B x
    value = np.dot(x, a) + np.dot(x, np.dot(B, x))

    # its derivative is a + 2 B x.
    deriv = a + np.dot(B, x) * 2.0
    return (value, deriv)


def __TestBfgs():
    dim = 15
    x0 = np.array(range(10, dim + 10))
    (a, b, c, d) = Bfgs(x0, __TestFunction, lambda x: True)

#__TestBfgs()
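
The test function at the bottom of bfgs.py is a strictly convex quadratic, so it has a closed-form minimizer that can be used to sanity-check the optimizer: setting the gradient a + 2 B x to zero gives x* = -0.5 * B^{-1} a. The standalone sketch below is not part of the commit; it assumes scripts/internal is on the Python path so that "import bfgs" works, mirrors __TestFunction, and compares the BFGS result against that closed form.

#!/usr/bin/env python
# Sketch of a sanity check for bfgs.py; assumes scripts/internal is on sys.path.
from __future__ import print_function
import numpy as np
import bfgs

dim = 15
a = np.array(range(1, dim + 1), dtype=float)
B = np.diag(np.arange(5, dim + 5, dtype=float))

def f(x):
    # f(x) = x . a + x^T B x, as in __TestFunction; its gradient is a + 2 B x.
    return (np.dot(x, a) + np.dot(x, np.dot(B, x)), a + 2.0 * np.dot(B, x))

x0 = np.array(range(10, dim + 10), dtype=float)
(x, value, deriv, inv_hessian) = bfgs.Bfgs(x0, f, lambda x: True)

# closed-form minimizer: grad f(x) = a + 2 B x = 0  =>  x* = -0.5 * B^{-1} a.
x_star = -0.5 * np.linalg.solve(B, a)
print("max abs difference from closed form:", np.max(np.abs(x - x_star)))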

scripts/optimize_metaparameters.py

+19 -13
@@ -3,11 +3,15 @@
 # we're using python 3.x style print but want it to work in python 2.x,
 from __future__ import print_function
 import numpy as np
-from scipy.optimize import minimize
 
 import re, os, argparse, sys, math, warnings, shutil
 from math import log
 
+# we need to add the ./internal/ subdirectory to the pythonpath before importing
+# 'bfgs'.
+sys.path = [ os.path.abspath(os.path.dirname(sys.argv[0])) + "/internal" ] + sys.path
+import bfgs
+
 parser = argparse.ArgumentParser(description="Optimizes metaparameters for LM estimation; "
                                  "this utility uses derivatives from get_objf_and_derivs.py or "
                                  "get_objf_and_derivs_split.py");
@@ -171,7 +175,8 @@ def ModifyWithBarrierFunction(x, objf, derivs):
 
 
 # this will return a 2-tuple (objf, deriv).  note, the objective function and
-# derivative are both negated because scipy only supports minimization.
+# derivative are both negated because conventionally optimization problems are
+# framed as minimization problems.
 def GetObjfAndDeriv(x):
     global iteration
     if not MetaparametersAreAllowed(x):
@@ -242,10 +247,8 @@ def GetObjfAndDeriv(x):
     print("Iteration {0}: objf={1}, deriv-magnitude={2} (with barrier function)".format(
         iteration, objf, math.sqrt(np.vdot(derivs, derivs))), file=sys.stderr)
 
-    # we need to negate the objective function and derivatives, since
-    # scipy only supports minimizing functions and we want to maximize.
-    # we actually apply a negative scale not equal to -1.0, as otherwise
-    # the first step is too small.
+    # we need to negate the objective function and derivatives, since we are
+    # minimizing.
     scale = -1.0
     return (objf * scale, derivs * scale)
 
@@ -256,20 +259,23 @@ def GetObjfAndDeriv(x):
 # and derivatives.
 iteration = 0
 
-result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
-                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
 
-print("result is ", result, file=sys.stderr)
+(x, value, deriv, inv_hessian) = bfgs.Bfgs(x0, GetObjfAndDeriv, MetaparametersAreAllowed)
+
+
+#result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
+#                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
+
+print("optimize_metaparameters: final x value is ", x, file=sys.stderr)
 
-WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir),
-                    result.x)
+WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir), x)
 
 old_objf = ReadObjf("{0}/0.objf".format(args.optimize_dir))
-new_objf = ReadObjf("{0}/{1}.objf".format(args.optimize_dir, iteration - 1))
+new_objf = -1.0 * value
 
 print("optimize_metaparameters.py: log-prob on dev data increased "
       "from {0} to {1} over {2} passes of derivative estimation (perplexity: {3}->{4}".format(
-    old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
+          old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
       file=sys.stderr)
 
 print("optimize_metaparameters.py: do `diff -y {0}/{{0,final}}.metaparams` "

0 commit comments
