# -*- coding: utf-8 -*-
"""Gradient and Hessian utilities for logistic regression, computed
directly with NumPy/SciPy instead of TensorFlow's autograd.
"""
import numpy as np
from scipy import sparse

# Logistic regression
# This scale factor makes Newton-CG converge faster; it is generally
# selected as lambda ~ 1 / (C * nr_sample). Here nr_sample ~ 10^7 and
# C = 0.1, so lambda ~ 1 / (0.1 * 10^7) = 1e-6.
def batch_grad_logloss_lr(label, ypred, x, weight_ar=None, C=0.03, has_l2=True, scale_factor=1.0):
    """Return the per-sample gradient on a batch.
    Args:
        label, ypred: arrays of shape [None,]
        x: an array or sparse.csc_matrix of shape [None,n]
        has_l2: if set False, the weight_ar will be ignored.
    Return:
        batch_grad: gradient of each sample w.r.t. the parameters,
            of shape [None,n]
    """
    if weight_ar is not None:
        # clip the feature indices which are not seen in the training set;
        # this must happen before the gradient is computed to take effect
        weight_len = weight_ar.shape[0]
        if x.shape[1] > weight_len:
            x = x[:, :weight_len]
    diffs = ypred - label
    if isinstance(x, np.ndarray):
        diffs = diffs.reshape(-1, 1)
        batch_grad = x * diffs
    else:
        diffs = sparse.diags(diffs)
        batch_grad = x.T.dot(diffs).T
    if weight_ar is not None:
        # note: the data term is scaled by C only when weight_ar is given
        if has_l2:
            batch_grad = C * batch_grad + weight_ar
        else:
            batch_grad = sparse.csr_matrix(C * batch_grad)
    return scale_factor * batch_grad
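
# --- Hypothetical usage sketch (not part of the original module) ---
# The dense and sparse code paths above should produce the same per-sample
# gradients; the helper below checks that on random data. All names in it
# are illustrative assumptions, not repo API.
def _check_batch_grad_dense_sparse_agree(n_samples=6, n_features=4, seed=0):
    rng = np.random.RandomState(seed)
    x = rng.rand(n_samples, n_features)
    label = rng.randint(0, 2, size=n_samples).astype(float)
    ypred = 1.0 / (1.0 + np.exp(-x.dot(rng.rand(n_features))))
    dense = batch_grad_logloss_lr(label, ypred, x)
    sp = batch_grad_logloss_lr(label, ypred, sparse.csc_matrix(x))
    # the sparse path returns a sparse matrix; densify before comparing
    return np.allclose(dense, sp.toarray())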

def grad_logloss_theta_lr(label, ypred, x, weight_ar=None, C=0.03, has_l2=True, scale_factor=1.0):
    """Return d l_i / d theta = (d l_i / d ypred) * (d ypred / d theta).
    Args:
        label: a scalar (one sample) or an array of shape [None,]
        ypred: a scalar (one sample) or an array of shape [None,]
        x: an array of shape [None,n], or a sparse.csc_matrix object.
        weight_ar: an array of shape [n,]; required when has_l2 is True.
        C: the weight of the data term in the objective function.
    Return:
        grad_logloss_theta: gradient on theta, shape [n,]
    """
    # For a sigmoid output the chain rule collapses to x^T (ypred - label):
    #   d l / d ypred = (1 - label) / (1 - ypred) - label / ypred
    #   d ypred / d theta = ypred * (1 - ypred) * x
    # and their product simplifies to (ypred - label) * x.
    # if there is only one sample in this batch
    if not isinstance(label, np.ndarray) or not isinstance(ypred, np.ndarray):
        label = np.array(label).flatten()
        ypred = np.array(ypred).flatten()
    if weight_ar is not None:
        # clip the feature indices which are not seen in the training set
        weight_len = weight_ar.shape[0]
        if x.shape[1] > weight_len:
            x = x[:, :weight_len]
    if has_l2:
        grad_logloss_theta = weight_ar + C * x.T.dot(ypred - label)
    else:
        grad_logloss_theta = C * x.T.dot(ypred - label)
    return scale_factor * grad_logloss_theta
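
# --- Hypothetical sanity check (not part of the original module) ---
# grad_logloss_theta_lr appears to implement the gradient of
#     f(theta) = 0.5 * ||theta||^2 + C * sum_i logloss(y_i, sigmoid(x_i . theta))
# with weight_ar playing the role of theta. The helper below compares the
# closed form against a central finite difference of f; names are illustrative.
def _check_grad_finite_diff(C=0.03, eps=1e-6, seed=0):
    rng = np.random.RandomState(seed)
    x = rng.rand(8, 3)
    label = rng.randint(0, 2, size=8).astype(float)
    theta = rng.rand(3)

    def f(t):
        p = 1.0 / (1.0 + np.exp(-x.dot(t)))
        ll = -label * np.log(p) - (1 - label) * np.log(1 - p)
        return 0.5 * t.dot(t) + C * ll.sum()

    ypred = 1.0 / (1.0 + np.exp(-x.dot(theta)))
    grad = grad_logloss_theta_lr(label, ypred, x, weight_ar=theta, C=C)
    fd = np.array([(f(theta + eps * e) - f(theta - eps * e)) / (2 * eps)
                   for e in np.eye(3)])
    return np.allclose(grad, fd, atol=1e-4)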

def grad_logloss_theta_lr_weighted(label, ypred, x, weight_ar=None, C=0.03, has_l2=True, scale_factor=1.0, weights=None):
    """Weighted variant of grad_logloss_theta_lr.
    Args:
        label: a scalar (one sample) or an array of shape [None,]
        ypred: a scalar (one sample) or an array of shape [None,]
        x: an array of shape [None,n], or a sparse.csc_matrix object.
        weight_ar: an array of shape [n,]; required when has_l2 is True.
        C: the weight of the data term in the objective function.
        weights: optional per-sample weights of shape [None,].
    Return:
        grad_logloss_theta: gradient on theta, shape [n,]
    """
    # if there is only one sample in this batch
    if not isinstance(label, np.ndarray) or not isinstance(ypred, np.ndarray):
        label = np.array(label).flatten()
        ypred = np.array(ypred).flatten()
    if weight_ar is not None:
        # clip the feature indices which are not seen in the training set
        weight_len = weight_ar.shape[0]
        if x.shape[1] > weight_len:
            x = x[:, :weight_len]
    diffs = ypred - label
    if weights is not None:
        diffs = weights * diffs
    if has_l2:
        grad_logloss_theta = weight_ar + C * x.T.dot(diffs)
    else:
        grad_logloss_theta = C * x.T.dot(diffs)
    return scale_factor * grad_logloss_theta
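
# --- Hypothetical usage note (not part of the original module) ---
# With weights of all ones the weighted gradient reduces to the unweighted
# one; the helper below verifies that. Names are illustrative.
def _check_weighted_grad_reduces(seed=0):
    rng = np.random.RandomState(seed)
    x = rng.rand(5, 3)
    label = rng.randint(0, 2, size=5).astype(float)
    ypred = 1.0 / (1.0 + np.exp(-x.dot(rng.rand(3))))
    theta = rng.rand(3)
    g_plain = grad_logloss_theta_lr(label, ypred, x, weight_ar=theta)
    g_ones = grad_logloss_theta_lr_weighted(label, ypred, x, weight_ar=theta,
                                            weights=np.ones(5))
    return np.allclose(g_plain, g_ones)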

def hessian_logloss_theta_lr(label, ypred, x, C=0.03, has_l2=True, scale_factor=1.0):
    """Get the Hessian matrix of the logloss w.r.t. theta.
    Args:
        label: ground-truth labels of x, shape [None,]
        ypred: predictions made by logistic regression, shape [None,]
        x: features, an array or sparse matrix of shape [None,n]
        has_l2: if True, the l2 term contributes an identity matrix to the
            Hessian; keeping it on is recommended because it helps the
            newton_cg method converge in practice.
    """
    assert C >= 0.0
    # if there is only one sample in this batch
    if not isinstance(label, np.ndarray) or not isinstance(ypred, np.ndarray):
        label = np.array(label).flatten()
        ypred = np.array(ypred).flatten()
    # with D = diag(ypred * (1 - ypred)), the Hessian is C * X^T D X (+ I)
    d = (ypred * (1 - ypred)).reshape(-1, 1)
    if sparse.issparse(x):
        h = x.multiply(d).tocsr()
        hessian = C * h.T.dot(x)
    else:
        # dense features: np.ndarray has no .multiply, use broadcasting
        h = x * d
        hessian = C * np.matmul(h.T, x)
    if has_l2:
        diag_idx = np.arange(hessian.shape[0])
        hessian[diag_idx, diag_idx] += 1.0
    return scale_factor * hessian
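
# --- Hypothetical usage sketch (not part of the original module) ---
# One Newton step for the l2-regularized objective, using the explicit
# Hessian above. Assumes dense features and weight_ar == theta; the
# function name and signature are illustrative only.
def _newton_step(theta, x, label, C=0.03):
    ypred = 1.0 / (1.0 + np.exp(-x.dot(theta)))
    g = grad_logloss_theta_lr(label, ypred, x, weight_ar=theta, C=C)
    H = hessian_logloss_theta_lr(label, ypred, x, C=C)
    # theta_new = theta - H^{-1} g
    return theta - np.linalg.solve(H, g)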

def hessian_vector_product_lr(label, ypred, x, v, C=0.03, has_l2=True, scale_factor=1.0):
    """Get the implicit hessian-vector product without building the Hessian.
    H v = C * X^T (D (X v)) + v,  where D = diag(ypred * (1 - ypred));
    the "+ v" term comes from the identity matrix of the l2 part.
    """
    xv = x.dot(v)
    D = ypred * (1 - ypred)
    dxv = xv * D
    if has_l2:
        hvp = C * x.T.dot(dxv) + v
    else:
        hvp = C * x.T.dot(dxv)
    return scale_factor * hvp
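
# --- Hypothetical smoke test (not part of the original module) ---
# The original file ended by calling an undefined `main()`; this guarded
# block instead verifies that the implicit hessian-vector product matches
# the explicit Hessian and runs the illustrative checks defined above.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    x = rng.rand(8, 4)
    label = rng.randint(0, 2, size=8).astype(float)
    ypred = 1.0 / (1.0 + np.exp(-x.dot(rng.rand(4))))
    v = rng.rand(4)
    hess = hessian_logloss_theta_lr(label, ypred, x)
    hvp = hessian_vector_product_lr(label, ypred, x, v)
    assert np.allclose(hess.dot(v), hvp)
    assert _check_batch_grad_dense_sparse_agree()
    assert _check_grad_finite_diff()
    assert _check_weighted_grad_reduces()
    print('all sanity checks passed')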