
Commit 215909d

Switch over to BFGS code we wrote ourselves, instead of using scipy
1 parent b091f80 commit 215909d

File tree

3 files changed: +320 -15 lines

.gitignore
scripts/internal/bfgs.py
scripts/optimize_metaparameters.py


.gitignore

+3 -2

@@ -5,9 +5,10 @@
 !/src/*.*
 !/src/Makefile
 
-# compiled object files, and directories automatically created on Darwin OS
-# for debug symbols.
+# compiled object files and python files, and directories automatically created
+# on Darwin OS for debug symbols.
 *.o
+*.pyc
 *.dSYM
 
 # emacs saves

scripts/internal/bfgs.py

+298
@@ -0,0 +1,298 @@
#!/usr/bin/env python

from __future__ import print_function
import math, sys
import numpy as np
import numpy.linalg as la


"""
This is a version of BFGS specialized for the case where the function
is constrained to a particular convex region via a barrier function,
and where we can efficiently evaluate (via calling f_finite(x), which
returns bool) whether the function is finite at the given point.

x0                The value to start the optimization at.
f                 The function being minimized.  f(x) returns a pair (value, gradient).
f_finite          f_finite(x) returns true if f(x) would be finite, and false otherwise.
init_inv_hessian  This gives you a way to specify a "better guess" at the initial
                  inverse Hessian.
return value      Returns a 4-tuple (x, f(x), f'(x), inverse-hessian-approximation).

"""
def Bfgs(x0, f, f_finite, init_inv_hessian = None):
    b = __bfgs(x0, f, f_finite, init_inv_hessian)
    return b.Minimize()



class __bfgs:
    def __init__(self, x0, f, f_finite, init_inv_hessian = None,
                 gradient_tolerance = 0.0005, progress_tolerance = 1.0e-06,
                 progress_tolerance_num_iters = 3):
        self.c1 = 1.0e-04   # constant used in the line search
        self.c2 = 0.9       # constant used in the line search
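        # c1 and c2 parameterize the strong Wolfe conditions that the line
        # search enforces (eqs. 3.7a and 3.7b of Nocedal and Wright):
        #   f(x + alpha p) <= f(x) + c1 * alpha * (p . grad f(x))    [sufficient decrease]
        #   |p . grad f(x + alpha p)| <= c2 * |p . grad f(x)|        [curvature]
        # with 0 < c1 < c2 < 1.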
        assert len(x0.shape) == 1
        self.dim = x0.shape[0]
        self.f = f
        self.f_finite = f_finite
        self.gradient_tolerance = gradient_tolerance
        self.progress_tolerance = progress_tolerance
        assert progress_tolerance_num_iters >= 1
        self.progress_tolerance_num_iters = progress_tolerance_num_iters

        if not self.f_finite(x0):
            self.LogMessage("Function is not finite at initial point {0}".format(x0))
            sys.exit(1)

        # cached_evaluations will be a list of 3-tuples (x, function-value f(x),
        # function-derivative f'(x)).  It's written to and read from by the
        # function self.FunctionValueAndDerivative().
        self.cached_evaluations = [ ]

        self.x = [ x0 ]
        (value0, deriv0) = self.FunctionValueAndDerivative(x0)
        self.value = [ value0 ]
        self.deriv = [ deriv0 ]

        deriv_magnitude = math.sqrt(np.dot(deriv0, deriv0))
        self.LogMessage("On iteration 0, value is {0}, deriv-magnitude {1}".format(
            value0, deriv_magnitude))

        # note: self.inv_hessian is referred to as H_k in the Nocedal
        # and Wright textbook.
        if init_inv_hessian is None:
            self.inv_hessian = np.identity(self.dim)
        else:
            self.inv_hessian = init_inv_hessian

    def Minimize(self):
        while not self.Converged():
            self.Iterate()
        self.FinalDebugOutput()
        return (self.x[-1], self.value[-1], self.deriv[-1], self.inv_hessian)


    def FinalDebugOutput(self):
        # currently this does nothing.
        pass

    # This does one iteration of the BFGS update.
    def Iterate(self):
        self.p = - np.dot(self.inv_hessian, self.deriv[-1])
        alpha = self.LineSearch()
        if alpha is None:
            self.LogMessage("Restarting BFGS with unit Hessian since line search failed")
            self.inv_hessian = np.identity(self.dim)
            return
        cur_x = self.x[-1]
        next_x = cur_x + alpha * self.p
        (next_value, next_deriv) = self.FunctionValueAndDerivative(next_x)
        next_deriv_magnitude = math.sqrt(np.dot(next_deriv, next_deriv))
        self.LogMessage("On iteration {0}, value is {1}, deriv-magnitude {2}".format(
            len(self.x), next_value, next_deriv_magnitude))

        # obtain s_k = x_{k+1} - x_k and y_k = gradient_{k+1} - gradient_k;
        # see eq. 6.5 in Nocedal and Wright.
        self.x.append(next_x)
        self.value.append(next_value)
        self.deriv.append(next_deriv)
        s_k = alpha * self.p
        y_k = self.deriv[-1] - self.deriv[-2]
        ysdot = np.dot(s_k, y_k)
        if not ysdot > 0:
            self.LogMessage("Restarting BFGS with unit Hessian since curvature "
                            "condition failed [likely a bug in the optimization code]")
            self.inv_hessian = np.identity(self.dim)
            return
        rho_k = 1.0 / ysdot  # eq. 6.14 in Nocedal and Wright.
        # the next equation is eq. 6.17 in Nocedal and Wright.
        # we don't bother rearranging it for efficiency because the dimension is small.
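        # Written out, eq. 6.17 is
        #   H_{k+1} = (I - rho_k s_k y_k^T) H_k (I - rho_k y_k s_k^T) + rho_k s_k s_k^T,
        # where rho_k = 1 / (y_k^T s_k); note that these are matrix products.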
        I = np.identity(self.dim)
        self.inv_hessian = (np.dot(np.dot(I - np.outer(s_k, y_k) * rho_k, self.inv_hessian),
                                   I - np.outer(y_k, s_k) * rho_k)
                            + np.outer(s_k, s_k) * rho_k)
        # todo: maybe make the lines above more efficient.

    # The function LineSearch is to be called after you have set self.x and
    # self.p.  It returns an alpha value satisfying the strong Wolfe conditions,
    # or None if the line search failed.  It is Algorithm 3.5 of Nocedal and
    # Wright.
    def LineSearch(self):
        alpha_max = 1.0e+10
        alpha1 = self.GetDefaultAlpha()
        increase_factor = 2.0  # amount by which we increase alpha if
                               # needed... after the 1st time we make it 4.
        if alpha1 is None:
            self.LogMessage("Line search failed unexpectedly in making sure "
                            "f(x) is finite.")
            return None

        alpha = [ 0.0, alpha1 ]
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        phi = [ phi_0 ]
        phi_dash = [ phi_dash_0 ]

        if phi_dash_0 >= 0.0:
            self.LogMessage("Line search failed unexpectedly: not a descent "
                            "direction")

        while True:
            i = len(phi)
            alpha_i = alpha[-1]
            (phi_i, phi_dash_i) = self.FunctionValueAndDerivativeForAlpha(alpha_i)
            phi.append(phi_i)
            phi_dash.append(phi_dash_i)
            if (phi_i > phi_0 + self.c1 * alpha_i * phi_dash_0 or
                (i > 1 and phi_i > phi[-2])):
                return self.Zoom(alpha[-2], alpha_i)
            if abs(phi_dash_i) <= -self.c2 * phi_dash_0:
                self.LogMessage("Line search: accepting default alpha = {0}".format(alpha_i))
                return alpha_i
            if phi_dash_i >= 0:
                return self.Zoom(alpha_i, alpha[-2])

            # the algorithm says "choose alpha_{i+1} in (alpha_i, alpha_max)";
            # the rest of this block implements that.
            next_alpha = alpha_i * increase_factor
            increase_factor = 4.0  # after we double once, we get more aggressive.
            if next_alpha > alpha_max:
                # something went wrong if alpha needed to get this large;
                # most likely we'll restart BFGS.
                self.LogMessage("Line search failed unexpectedly, went "
                                "past the max.")
                return None
            # make sure the function is finite at the next alpha, if possible.
            # we don't need to worry about efficiency too much, as this check
            # for finiteness is very fast.
            while next_alpha > alpha_i * 1.2 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.9
            while next_alpha > alpha_i * 1.02 and not self.IsFiniteForAlpha(next_alpha):
                next_alpha *= 0.99
            self.LogMessage("Increasing alpha from {0} to {1} in line search".format(
                alpha_i, next_alpha))
            alpha.append(next_alpha)

    # This function, from Nocedal and Wright (alg. 3.6), is called from
    # LineSearch.  It returns an alpha value satisfying the strong Wolfe
    # conditions, or None if there was an error.
    def Zoom(self, alpha_lo, alpha_hi):
        # these function evaluations don't really happen; we use caching.
        (phi_0, phi_dash_0) = self.FunctionValueAndDerivativeForAlpha(0.0)
        (phi_lo, phi_dash_lo) = self.FunctionValueAndDerivativeForAlpha(alpha_lo)
        (phi_hi, phi_dash_hi) = self.FunctionValueAndDerivativeForAlpha(alpha_hi)

        min_diff = 1.0e-10
        while True:
            if abs(alpha_lo - alpha_hi) < min_diff:
                self.LogMessage("Line search failed, interval is too small: [{0},{1}]".format(
                    alpha_lo, alpha_hi))
                return None

            # the algorithm says "interpolate (using quadratic, cubic or
            # bisection) to find a trial step length between alpha_lo and
            # alpha_hi".  We basically choose bisection, but because alpha_lo is
            # guaranteed to always have a "better" (lower) function value than
            # alpha_hi, we actually want to be a little bit closer to alpha_lo,
            # so we go one third of the distance from alpha_lo to alpha_hi.
            alpha_j = alpha_lo + 0.3333 * (alpha_hi - alpha_lo)
            (phi_j, phi_dash_j) = self.FunctionValueAndDerivativeForAlpha(alpha_j)
            if phi_j > phi_0 + self.c1 * alpha_j * phi_dash_0 or phi_j >= phi_lo:
                (alpha_hi, phi_hi, phi_dash_hi) = (alpha_j, phi_j, phi_dash_j)
            else:
                if abs(phi_dash_j) <= - self.c2 * phi_dash_0:
                    self.LogMessage("Acceptable alpha is {0}".format(alpha_j))
                    return alpha_j
                if phi_dash_j * (alpha_hi - alpha_lo) >= 0.0:
                    (alpha_hi, phi_hi, phi_dash_hi) = (alpha_lo, phi_lo, phi_dash_lo)
                (alpha_lo, phi_lo, phi_dash_lo) = (alpha_j, phi_j, phi_dash_j)


    # The function GetDefaultAlpha(), called from LineSearch(), is to be called
    # after you have set self.x and self.p.  It normally returns 1.0, but it
    # will reduce it by factors of 0.9 until the function evaluated at 2*alpha
    # is finite.  This is because, generally speaking, approaching the edge of
    # the barrier function too rapidly will lead to poor function values.  Note:
    # evaluating whether the function is finite is very efficient.
    # If the function was not finite even at a very tiny alpha, then something
    # probably went wrong; we'll restart BFGS in this case.
    def GetDefaultAlpha(self):
        min_alpha = 1.0e-10
        alpha = 1.0
        while alpha > min_alpha and not self.IsFiniteForAlpha(alpha * 2.0):
            alpha *= 0.9
        return alpha if alpha > min_alpha else None

    # this function, called from LineSearch(), returns true if the function is
    # finite at the given alpha value.
    def IsFiniteForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        return self.f_finite(x)
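
    # FunctionValueAndDerivativeForAlpha returns the pair (phi(alpha), phi'(alpha)),
    # where phi(alpha) = f(x_k + alpha * p) is the one-dimensional function the
    # line search works with; phi'(alpha) is its derivative p . grad f(x_k + alpha * p).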
    def FunctionValueAndDerivativeForAlpha(self, alpha):
        x = self.x[-1] + self.p * alpha
        (value, deriv) = self.FunctionValueAndDerivative(x)
        return (value, np.dot(self.p, deriv))

    def Converged(self):
        # we say that we're converged if either the gradient magnitude is below
        # the gradient tolerance, or the objective-function change per iteration,
        # amortized over the last few iterations, is below the progress tolerance.
        current_gradient = self.deriv[-1]
        gradient_magnitude = math.sqrt(np.dot(current_gradient, current_gradient))
        if gradient_magnitude < self.gradient_tolerance:
            self.LogMessage("BFGS converged on iteration {0} due to gradient magnitude {1} "
                            "less than gradient tolerance {2}".format(
                len(self.x), gradient_magnitude, self.gradient_tolerance))
            return True
        n = self.progress_tolerance_num_iters
        if len(self.x) > n:
            cur_value = self.value[-1]
            prev_value = self.value[-1 - n]
            # the following will be nonnegative.
            change_per_iter_amortized = (prev_value - cur_value) / n
            if change_per_iter_amortized < self.progress_tolerance:
                self.LogMessage("BFGS converged on iteration {0} due to objf-change per "
                                "iteration amortized over {1} iterations = {2} < "
                                "threshold = {3}.".format(
                    len(self.x), n, change_per_iter_amortized, self.progress_tolerance))
                return True
        return False

    # this returns the function value and derivative for x, as a tuple; it
    # does caching.
    def FunctionValueAndDerivative(self, x):
        for i in range(len(self.cached_evaluations)):
            if np.array_equal(x, self.cached_evaluations[i][0]):
                return (self.cached_evaluations[i][1],
                        self.cached_evaluations[i][2])
        # we didn't find it cached, so we need to actually evaluate the
        # function.  This is where it gets slow.
        (value, deriv) = self.f(x)
        self.cached_evaluations.append((x, value, deriv))
        return (value, deriv)

    def LogMessage(self, message):
        print(sys.argv[0] + ": " + message, file=sys.stderr)




def __TestFunction(x):
    dim = 15
    a = np.array(range(1, dim + 1))
    B = np.diag(range(5, dim + 5))

    # define a function f(x) = x . a + x^T B x
    value = np.dot(x, a) + np.dot(x, np.dot(B, x))

    # its derivative is a + 2 B x.
    deriv = a + np.dot(B, x) * 2.0
    return (value, deriv)


def __TestBfgs():
    dim = 15
    x0 = np.array(range(10, dim + 10))
    (a, b, c, d) = Bfgs(x0, __TestFunction, lambda x: True)

#__TestBfgs()
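
The test function at the bottom of bfgs.py is a strictly convex quadratic, so it has a closed-form minimizer that can be used to sanity-check the optimizer: setting the gradient a + 2 B x to zero gives x* = -0.5 * B^{-1} a. The standalone sketch below is not part of the commit; it assumes scripts/internal is on the Python path so that "import bfgs" works, mirrors __TestFunction, and compares the BFGS result against that closed form.

#!/usr/bin/env python
# Sketch of a sanity check for bfgs.py; assumes scripts/internal is on sys.path.
from __future__ import print_function
import numpy as np
import bfgs

dim = 15
a = np.array(range(1, dim + 1), dtype=float)
B = np.diag(np.arange(5, dim + 5, dtype=float))

def f(x):
    # f(x) = x . a + x^T B x, as in __TestFunction; its gradient is a + 2 B x.
    return (np.dot(x, a) + np.dot(x, np.dot(B, x)), a + 2.0 * np.dot(B, x))

x0 = np.array(range(10, dim + 10), dtype=float)
(x, value, deriv, inv_hessian) = bfgs.Bfgs(x0, f, lambda x: True)

# closed-form minimizer: grad f(x) = a + 2 B x = 0  =>  x* = -0.5 * B^{-1} a.
x_star = -0.5 * np.linalg.solve(B, a)
print("max abs difference from closed form:", np.max(np.abs(x - x_star)))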

scripts/optimize_metaparameters.py

+19 -13
@@ -3,11 +3,15 @@
 # we're using python 3.x style print but want it to work in python 2.x,
 from __future__ import print_function
 import numpy as np
-from scipy.optimize import minimize
 
 import re, os, argparse, sys, math, warnings, shutil
 from math import log
 
+# we need to add the ./internal/ subdirectory to the pythonpath before importing
+# 'bfgs'.
+sys.path = [ os.path.abspath(os.path.dirname(sys.argv[0])) + "/internal" ] + sys.path
+import bfgs
+
 parser = argparse.ArgumentParser(description="Optimizes metaparameters for LM estimation; "
                                  "this utility uses derivatives from get_objf_and_derivs.py or "
                                  "get_objf_and_derivs_split.py");
@@ -171,7 +175,8 @@ def ModifyWithBarrierFunction(x, objf, derivs):
 
 
 # this will return a 2-tuple (objf, deriv).  note, the objective function and
-# derivative are both negated because scipy only supports minimization.
+# derivative are both negated because conventionally optimization problems are
+# framed as minimization problems.
 def GetObjfAndDeriv(x):
     global iteration
     if not MetaparametersAreAllowed(x):
@@ -242,10 +247,8 @@ def GetObjfAndDeriv(x):
     print("Iteration {0}: objf={1}, deriv-magnitude={2} (with barrier function)".format(
         iteration, objf, math.sqrt(np.vdot(derivs, derivs))), file=sys.stderr)
 
-    # we need to negate the objective function and derivatives, since
-    # scipy only supports minimizing functions and we want to maximize.
-    # we actually apply a negative scale not equal to -1.0, as otherwise
-    # the first step is too small.
+    # we need to negate the objective function and derivatives, since we are
+    # minimizing.
     scale = -1.0
     return (objf * scale, derivs * scale)
 
@@ -256,20 +259,23 @@ def GetObjfAndDeriv(x):
 # and derivatives.
 iteration = 0
 
-result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
-                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
 
-print("result is ", result, file=sys.stderr)
+(x, value, deriv, inv_hessian) = bfgs.Bfgs(x0, GetObjfAndDeriv, MetaparametersAreAllowed)
+
+
+#result = minimize(GetObjfAndDeriv, x0, method='BFGS', jac=True,
+#                  options={'disp': True, 'gtol': args.gradient_tolerance, 'mls':50})
+
+print("optimize_metaparameters: final x value is ", x, file=sys.stderr)
 
-WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir),
-                    result.x)
+WriteMetaparameters("{0}/final.metaparams".format(args.optimize_dir), x)
 
 old_objf = ReadObjf("{0}/0.objf".format(args.optimize_dir))
-new_objf = ReadObjf("{0}/{1}.objf".format(args.optimize_dir, iteration - 1))
+new_objf = -1.0 * value
 
 print("optimize_metaparameters.py: log-prob on dev data increased "
       "from {0} to {1} over {2} passes of derivative estimation (perplexity: {3}->{4}".format(
-    old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
+          old_objf, new_objf, iteration, math.exp(-old_objf), math.exp(-new_objf)),
       file=sys.stderr)
 
 print("optimize_metaparameters.py: do `diff -y {0}/{{0,final}}.metaparams` "

0 commit comments
