#%%
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
import statsmodels.api as sm
from tsmoothie.smoother import ConvolutionSmoother
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
#read the data and combine it into a single dataframe
x = pd.read_csv('data/xvalsSine.csv')
sine = pd.read_csv('data/cleanSine.csv')
data = pd.DataFrame({'x': x.iloc[:, 0], 'sin(x)': sine.iloc[:, 0]})
#randomly split the data into train and test sets (70/30)
X_train, X_test, y_train, y_test = train_test_split(data['x'], data['sin(x)'],
shuffle = True,
test_size=0.3,
random_state=1)
#use a degree-three polynomial basis to approximate the sine curve
polynomial_features = PolynomialFeatures(degree=3)
X_train_poly = polynomial_features.fit_transform(X_train.values.reshape(-1, 1))
X_test_poly = polynomial_features.transform(X_test.values.reshape(-1, 1)) #transform only; the features are fit on the training data
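#(added sketch) inspect the generated polynomial terms to confirm the basis;
#get_feature_names_out assumes scikit-learn >= 1.0 (older versions use get_feature_names)
print(polynomial_features.get_feature_names_out())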
#fit an OLS model on the polynomial features
model = sm.OLS(y_train.values, X_train_poly).fit()
print(model.summary())
print(f'The OLS R-squared is equal to {model.rsquared}')
y_predicted = model.predict(X_test_poly)
ols_mse = mean_squared_error(y_test, y_predicted) #held-out MSE (model.mse_model is the explained mean square, not a prediction error)
print(f'The OLS test MSE is {ols_mse}')
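#(added sketch) visualize the OLS fit against the held-out data; assumes the
#matplotlib import above, with the test points sorted by x so the fitted curve plots smoothly
order = np.argsort(X_test.values)
plt.scatter(X_test, y_test, s=10, label='test data')
plt.plot(X_test.values[order], y_predicted[order], color='red', label='OLS degree-3 fit')
plt.xlabel('x')
plt.ylabel('sin(x)')
plt.legend()
plt.show()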
#bonus: fit the same polynomial regression with stochastic gradient descent;
#the features are standardized first, since SGD is sensitive to feature scale
n_iter = 100
sgd = make_pipeline(StandardScaler(), SGDRegressor(max_iter=n_iter))
sgd.fit(X_train_poly, y_train)
y_predicted_sgd = sgd.predict(X_test_poly)
sgd_mse = mean_squared_error(y_test, y_predicted_sgd)
print(f'The SGD MSE is {sgd_mse}')
#part ii: filter the noisy sine data
noisy_sine = pd.read_csv('data/noisySine.csv')
noisy_sine.columns = ['noisy sin(x)']
#smooth with a moving-average (convolution) filter; the window length can be tuned for further improvement
smoother = ConvolutionSmoother(window_len=50, window_type='ones')
smoother.smooth(noisy_sine.values.ravel()) #pass a 1-D array so tsmoothie treats it as a single series
filtered_noisy_sine = pd.Series(smoother.smooth_data[0], name='Filtered_sin(x)')
data = data.join(filtered_noisy_sine)
#calculate the sum of squared errors between the clean and smoothed series
errors = data['sin(x)'] - data['Filtered_sin(x)']
sse = (errors ** 2).sum()
print(f'The sum of squared errors between the sin(x) dataset and the smoothed "noisy" sin(x) dataset is {round(sse, 3)}')
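#(added sketch) compare the noisy, smoothed, and clean series visually;
#assumes the columns created above and that the x values in the CSV are in increasing order
plt.plot(data['x'], noisy_sine['noisy sin(x)'], alpha=0.3, label='noisy sin(x)')
plt.plot(data['x'], data['Filtered_sin(x)'], label='smoothed')
plt.plot(data['x'], data['sin(x)'], label='clean sin(x)')
plt.xlabel('x')
plt.legend()
plt.show()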
#bonus: predict 10 samples with a hand-rolled Kalman filter
#initial state and model parameters
x_k = np.asarray([x.iloc[0, 0], noisy_sine.iloc[0, 0]]) #first state estimate
Q = np.asarray([[0.001, 0], [0, 0.001]]) #process noise covariance (took a guess here)
A = np.asarray([[1, 0], [0, 1]]) #transition matrix
R = np.asarray([[0, 0], [0, np.var(noisy_sine.values - sine.values)]]) #measurement noise covariance (variance, not mean, of the observed noise)
H = np.asarray([[1, 0], [0, 1]]) #observation matrix
P = np.asarray([[0, 0], [0, 0]]) #initial error covariance (zero: full confidence in the first estimate)
estimation = []
for k in range(10):
    z_k = np.asarray([x.iloc[k, 0], noisy_sine.iloc[k, 0]]) #current measurement
    x_k = A.dot(x_k) #predict estimate
    P = (A.dot(P)).dot(A.T) + Q #predict error covariance
    K = (P.dot(H.T)).dot(np.linalg.inv(H.dot(P).dot(H.T) + R)) #update Kalman gain
    x_k = x_k + K.dot(z_k - H.dot(x_k)) #update estimate with the measurement
    P = (np.identity(2) - K.dot(H)).dot(P) #update error covariance
    estimation.append((x_k[0], x_k[1])) #store the (x, sin(x)) estimate
print(f'From the Kalman filter, the first 10 sample estimates are: {estimation}')
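#(added sketch) check the filtered estimates against the clean signal for the
#same 10 samples; `estimation` holds (x, sin(x)) pairs from the loop above
kalman_estimates = np.asarray(estimation)
true_first10 = sine.iloc[:10, 0].values
kalman_mse = mean_squared_error(true_first10, kalman_estimates[:, 1])
print(f'Kalman filter MSE on the first 10 samples: {kalman_mse}')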
# %%