-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBoston_Housing_Prediction.m
133 lines (82 loc) · 3.16 KB
/
Boston_Housing_Prediction.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
% Clear previous variables
clear; clc;
% Load all the Boston housing data
data = xlsread('train.xls');
%%%%%%%%%%%%%%%%%%% Separate the data into labels and features %%%%%%%%%%%%%%%%%%%%%
labels = data(:, 1);
% Add the intercept vector
theta_0 = ones(size(data, 1), 1);
features = [theta_0, data(:, 2:end)]; % Added theta_0 to features to obtain the intercept
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Regression %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
parameters = (features'*features)\features'*labels;
% Predicted median housing prices
y_hat = parameters' * features';
%%%%%%%%%%%%%%%%%%%%%%%%%% Significance Calc %%%%%%%%%%%%%%%%%%%%%%%%%%
n = size(data, 1);
p = size(parameters, 1);
k = size(parameters, 1) - 1;
% Sum of squares residual, measures the amount of variance explained
error = y_hat - mean(labels);
SSR = error * error';
MSR = SSR / (k);
% Sum of squares error, measures the amount of variance not explained
% Calculates the amount of prediction error
error = labels - y_hat';
SSE = error' * error;
MSE = SSE / (n - p);
% Test for significance using F0 at a 95% confidence
F0 = MSR/MSE;
f_value = icdf('f', 1-0.05, k, n - p);
if F0 > f_value
fprintf('Regression is significant \n');
else
fprintf('Regression is insignificant \n');
end
%%%%%%%%%%%%%%%%%%%%%%%% Coefficient of Determination %%%%%%%%%%%%%%%%%%%%%
% Total variability within the data
error = labels - mean(labels);
Syy = error' * error;
% Coefficient of Determination
R2 = 1 - (SSE / Syy);
R2_adjusted = 1 - ((SSE / Syy) * ((n - 1) / (n - p)));
fprintf('The R2 and R2_adjusted are:')
disp([R2, R2_adjusted]);
variance = SSE / (n - p);
covariance = variance * inv(features' * features);
%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Confidence Intervals %%%%%%%%%%%%%%%%%%%%%%%%
alpha = 0.05 / 2; % Alpha for a 2 tailed test.
z_scores = icdf('normal',[alpha (1 - alpha)], 0, 1);
confidence_intervals = zeros([size(parameters, 1), 2]);
for i = 1:size(parameters, 1)
confidence_intervals(i, :) = [
parameters(i, 1) + sqrt(covariance(i, i)) / sqrt(size(data, 1)) * z_scores(1), ...
parameters(i, 1) + sqrt(covariance(i, i)) / sqrt(size(data, 1)) * z_scores(2) ...
];
end
% Plot the predicted and actual data
y_hat_plot = scatter(1:n, y_hat, 'b');
hold on
labels_plot = scatter(1:n, labels, 'r');
% upper_confidence = features * confidence_intervals(:, 1);
% lower_confidence = features * confidence_intervals(:, 2);
% plot_upper = plot(upper_confidence, 'g--');
% plot_lower = plot(lower_confidence, 'r--');
ylim([-10 80])
hold off
legend('Predicted Median Housing Price, $1000s', 'Real Median Housing Price, $1000s')
%%%%%%%%%%%%%%%%%%%%%%% Validating our model %%%%%%%%%%%%%%%%%%%%%%
% Load the data
validation = xlsread('test.xls');
n = numel(validation_labels); % Amount of data in validation labels
% Split the labels and features
validation_labels = validation(:, 1);
theta_0 = ones(size(validation, 1), 1);
validation_features = [theta_0, validation(:, 2:end)];
% Make the predictions
validation_yhat = validation_features * parameters;
% Plot the predictions
figure()
scatter(1:n, validation_yhat, 'b')
hold on
scatter(1:n, validation_labels, 'r')
ylim([-15, 40])