Commit a5c5f95 ("done")
SoniSiddharth committed May 23, 2021
1 parent 3d1c6be
Showing 25 changed files with 1,087 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
2 changes: 2 additions & 0 deletions .gitignore
@@ -0,0 +1,2 @@
*.pyc
.DS_Store
29 changes: 29 additions & 0 deletions Makefile
@@ -0,0 +1,29 @@
help:
	@echo "make regression : For running the gradient descent regression model on a random dataset"
	@echo "make polynomial_features : For testing the transformation built to mirror sklearn's polynomial preprocessing"
	@echo "make normal_regression : To check how theta varies with degree"
	@echo "make poly_theta : To check how theta varies with degree using polynomial features"
	@echo "make contour : For generating a contour plot of gradient descent"
	@echo "make compare_time : For comparing the time taken by normal regression and gradient descent"
	@echo "make collinear : For checking feature dependency (collinear features)"

regression:
	@python linear_regression_test.py

polynomial_features:
	@python poly_features_test.py

normal_regression:
	@python Normal_regression.py

poly_theta:
	@python degreevstheta.py

contour:
	@python plot_contour.py

compare_time:
	@python compare_time.py

collinear:
	@python collinear_dataset.py
31 changes: 31 additions & 0 deletions Normal_regression.py
@@ -0,0 +1,31 @@
import numpy as np
import matplotlib.pyplot as plt
from preprocessing.polynomial_features import PolynomialFeatures

np.random.seed(10)  # set seed for reproducibility
x = np.array([i*np.pi/180 for i in range(60, 300, 4)])
y = 4*x + 7 + np.random.normal(0, 3, len(x))

def normal_regression(X, y):
    # Closed-form least squares: theta = (X^T X)^{-1} X^T y
    X_transpose = np.transpose(X)
    A = np.linalg.inv(X_transpose.dot(X))
    B = X_transpose.dot(y)
    return A.dot(B)

arr_norm = []
degrees = [i+1 for i in range(9)]
x = x.reshape(-1, 1)  # column vector for the polynomial transformer

include_bias = True
for degree in degrees:
    poly = PolynomialFeatures(degree, include_bias=include_bias)
    X = poly.transform(x)
    coeff = normal_regression(X, y)
    arr_norm.append(np.linalg.norm(coeff))

plt.plot(degrees, arr_norm)
plt.xlabel("Degree of the polynomial")
plt.ylabel("Magnitude of coefficients (theta)")
plt.savefig('./images/q5plot.png')
plt.show()
144 changes: 143 additions & 1 deletion README.md
@@ -1 +1,143 @@
# ML-Linear-Regression-from-scratch
# Linear Regression ⭐⭐

## Directory Structure 📁

```
│ collinear_dataset.py
│ compare_time.py
│ contour_plot.gif
│ degreevstheta.py
│ gif1.gif
│ gif2.gif
│ linear_regression_test.py
│ line_plot.gif
│ Makefile
│ metrics.py
│ Normal_regression.py
│ plot_contour.py
│ poly_features_test.py
│ README.md
│ surface_plot.gif
├───images
│ q5plot.png
│ q6plot.png
│ q8features.png
│ q8samples.png
├───linearRegression
│ │ linearRegression.py
│ │ __init__.py
│ │
│ └───__pycache__
│ linearRegression.cpython-37.pyc
│ __init__.cpython-37.pyc
├───preprocessing
│ │ polynomial_features.py
│ │ __init__.py
│ │
│ └───__pycache__
│ polynomial_features.cpython-37.pyc
│ __init__.cpython-37.pyc
├───temp_images
└───__pycache__
metrics.cpython-37.pyc
```

## Instructions to run 🏃

```make help```<br>
```make regression```<br>
```make polynomial_features```<br>
```make normal_regression```<br>
```make poly_theta```<br>
```make contour```<br>
```make compare_time```<br>
```make collinear```<br>

## Stochastic GD (Batch size = 1) ☝️

- Learning rate type = constant
  - RMSE: 0.9119624181584616
  - MAE: 0.7126923090787688

- Learning rate type = inverse
  - RMSE: 0.9049599308106121
  - MAE: 0.7098334683036919

## Vanilla GD (Batch size = N) ✋

- Learning rate type = constant
  - RMSE: 0.9069295672718122
  - MAE: 0.7108301179089876

- Learning rate type = inverse
  - RMSE: 0.9607329070540364
  - MAE: 0.7641616657610887

## Mini-Batch GD (Batch size between 1 and N; 5 here) 🤘

- Learning rate type = constant
  - RMSE: 0.9046502501334435
  - MAE: 0.7102161700019564

- Learning rate type = inverse
  - RMSE: 0.9268357442221973
  - MAE: 0.7309246821952116
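
For reference, a minimal standalone sketch of the mini-batch update behind these runs (hedged: the repo's actual implementation is `fit_vectorised` in `linearRegression/linearRegression.py`; the function below and its parameters are illustrative assumptions, not that API):

```python
import numpy as np

def fit_minibatch(X, y, batch_size, n_iter=100, lr=0.01, lr_type='constant'):
    """Mini-batch GD for linear regression (illustrative sketch).
    batch_size=1 -> stochastic GD; batch_size=N -> vanilla GD."""
    N, D = X.shape
    theta = np.zeros(D)
    for t in range(1, n_iter + 1):
        # 'inverse' decays the step size as lr/t; 'constant' keeps it fixed
        step = lr / t if lr_type == 'inverse' else lr
        idx = np.random.permutation(N)
        for start in range(0, N, batch_size):
            batch = idx[start:start + batch_size]
            Xb, yb = X[batch], y[batch]
            # gradient of the batch mean squared error: (2/B) X^T (X theta - y)
            grad = 2 * Xb.T.dot(Xb.dot(theta) - yb) / len(batch)
            theta -= step * grad
    return theta

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    X = np.column_stack([np.ones(30), rng.standard_normal(30)])  # bias + 1 feature
    y = 4 * X[:, 1] + 7 + rng.normal(0, 3, 30)
    print(fit_minibatch(X, y, batch_size=5))  # roughly [7, 4]
```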

## Polynomial Feature Transformation 🔰

- The output for [[1, 2]] is [[1, 1, 2, 1, 2, 4]]

- The output for [[1, 2, 3]] is [[1, 1, 2, 3, 1, 2, 3, 4, 6, 9]]

- The outputs match sklearn's PolynomialFeatures fit_transform (a sketch of the idea follows below).
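
A minimal sketch of the idea (hedged: the repo's actual implementation lives in `preprocessing/polynomial_features.py` and may differ; `polynomial_transform` below is an illustrative name): emit every monomial of the input features up to the given degree, in degree order.

```python
from itertools import combinations_with_replacement
import numpy as np

def polynomial_transform(X, degree=2, include_bias=True):
    """All monomials of the input features up to `degree` (illustrative sketch)."""
    X = np.asarray(X)
    n_samples, n_features = X.shape
    start = 0 if include_bias else 1
    columns = []
    for d in range(start, degree + 1):
        for combo in combinations_with_replacement(range(n_features), d):
            # the empty combination (d = 0) yields the bias column of ones
            columns.append(np.prod(X[:, list(combo)], axis=1))
    return np.column_stack(columns)

print(polynomial_transform([[1, 2]]))     # -> [[1 1 2 1 2 4]]
print(polynomial_transform([[1, 2, 3]]))  # -> [[1 1 2 3 1 2 3 4 6 9]]
```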

## Theta vs degree 📈

![alt text](images/q5plot.png?raw=true)

- **Conclusion**: As the degree of the polynomial increases, the norm of theta increases because higher-degree models overfit the data.

## L2 Norm of Theta vs Degree of Polynomial for varying Sample size 📈

![alt text](images/q6plot.png?raw=true)

**Conclusion**

- As the degree increases, the magnitude of theta increases due to overfitting.
- At the same degree, however, the magnitude of theta decreases as the number of samples increases, because more samples reduce overfitting to some extent.

## Linear Regression line fit 🔥
![alt text](line_plot.gif?raw=true)

## Linear Regression Surface plot 🔥
![alt text](surface_plot.gif?raw=true)

## Linear Regression Contour plot 🔥
![alt text](contour_plot.gif?raw=true)

## Time Complexities ⏳

- Theoretical time complexity of Normal equation is **O(D^2N) + O(D^3)**
- Theoretical time complexity of Gradient Descent equation is **O((t+N)D^2)**
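
These terms come from the closed-form (normal equation) solution:

```latex
\hat{\theta} = (X^{T} X)^{-1} X^{T} y
```

Forming the D×D matrix XᵀX from N samples costs O(D^2 N), and inverting it costs O(D^3); gradient descent avoids the D^3 inversion entirely.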

## Time vs Number of Features ⏳📊

![alt text](images/q8features.png?raw=true)

When the number of samples is kept constant, the normal-equation solution takes more time because its complexity carries a D^3 factor, whereas gradient descent carries only a D^2 factor.

## Time vs Number of Samples ⏳📊

![alt text](images/q8samples.png?raw=true)

When the number of features is kept constant and the number of samples is varied, the normal equation still takes longer than gradient descent because of its higher computational cost.

## Multicollinearity in Dataset ❗ ❗

- The gradient descent implementation still works in the presence of multicollinearity.
- However, as the multiplication factor increases, the RMSE and MAE values shoot up sharply.
- Multicollinearity reduces the precision of the estimated coefficients (see the sketch below).
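
One way to see where that precision goes (an illustrative sketch, not part of this repo): when one column is an exact multiple of another, XᵀX becomes singular, which its condition number makes visible.

```python
import numpy as np
import pandas as pd

np.random.seed(42)
X = pd.DataFrame(np.random.randn(30, 4))
X[4] = X[3] * 6  # same construction as collinear_dataset.py

XtX = X.to_numpy().T @ X.to_numpy()
# A huge condition number means (X^T X) is numerically singular, so the
# estimated coefficients are extremely sensitive to noise in y.
print(np.linalg.cond(XtX))
```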
37 changes: 37 additions & 0 deletions collinear_dataset.py
@@ -0,0 +1,37 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linearRegression.linearRegression import LinearRegression
from metrics import *

np.random.seed(42)

N = 30

print("----------------------------------- Multi collinear ----------------------------------")

P = 4
X = pd.DataFrame(np.random.randn(N, P))
y = pd.Series(np.random.randn(N))

X[P] = X[P-1] * 6  # extra column: an exact multiple of column P-1 (perfect collinearity)

LR = LinearRegression(fit_intercept=True)
LR.fit_vectorised(X, y)
y_hat = LR.predict(X)
print('RMSE: ', rmse(y_hat, y))
print('MAE: ', mae(y_hat, y))

print("----------------------------------------- Normal dataset -------------------------------------")

P = 5
Xnew = pd.DataFrame(np.random.randn(N, P))
ynew = pd.Series(np.random.randn(N))

LRnew = LinearRegression(fit_intercept=True)
LRnew.fit_vectorised(Xnew, ynew)
y_hatnew = LRnew.predict(Xnew)
print('RMSE: ', rmse(y_hatnew, ynew))
print('MAE: ', mae(y_hatnew, ynew))
63 changes: 63 additions & 0 deletions compare_time.py
@@ -0,0 +1,63 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from linearRegression.linearRegression import LinearRegression
import time

# np.random.seed(42)

grad = []
normal = []
num_features = []

N = 30
for i in range(50, 1000, 5):
    X = pd.DataFrame(np.random.randn(N, i))
    y = pd.Series(np.random.randn(N))

    LR = LinearRegression(fit_intercept=True)
    start = time.time()
    LR.fit_vectorised(X, y)
    grad.append(time.time() - start)

    LR_normal = LinearRegression(fit_intercept=True)
    start_time = time.time()
    LR_normal.fit_normal(X, y)
    normal.append(time.time() - start_time)
    num_features.append(i)

plt.plot(num_features, grad, label='Gradient Descent')
plt.plot(num_features, normal, label='Normal Equation')
plt.xlabel('Number of features')
plt.ylabel('Time in seconds')
plt.legend(loc='best')
plt.savefig('./images/q8features.png')
plt.show()

grad = []
normal = []
num_samples = []
P = 20

for i in range(50, 2000, 5):
    X = pd.DataFrame(np.random.randn(i, P))
    y = pd.Series(np.random.randn(i))

    LR = LinearRegression(fit_intercept=True)
    start = time.time()
    LR.fit_vectorised(X, y)
    grad.append(time.time() - start)

    LR_normal = LinearRegression(fit_intercept=True)
    start_time = time.time()
    LR_normal.fit_normal(X, y)
    normal.append(time.time() - start_time)
    num_samples.append(i)

plt.figure()  # start a fresh figure for the samples plot
plt.plot(num_samples, grad, label='Gradient Descent')
plt.plot(num_samples, normal, label='Normal Equation')
plt.xlabel('Number of samples')
plt.ylabel('Time in seconds')
plt.legend(loc='best')
plt.savefig('./images/q8samples.png')
plt.show()
Binary file added contour_plot.gif
39 changes: 39 additions & 0 deletions degreevstheta.py
@@ -0,0 +1,39 @@
import numpy as np
import matplotlib.pyplot as plt
from preprocessing.polynomial_features import PolynomialFeatures

np.random.seed(42)

def normal_regression(X, y):
    # Closed-form least squares: theta = (X^T X)^{-1} X^T y
    X_transpose = np.transpose(X)
    A = np.linalg.inv(X_transpose.dot(X))
    B = X_transpose.dot(y)
    return A.dot(B)

lst = []
degrees = [1, 3, 5, 7, 9]
sample_size = []
l = 0
for N in range(10, 200, 40):
    x = np.array([i*np.pi/180 for i in range(N, 300, 4)])
    y = 4*x + 7 + np.random.normal(0, 3, len(x))
    x = x.reshape(-1, 1)  # column vector for the polynomial transformer
    temp = []
    for degree in degrees:
        poly = PolynomialFeatures(degree, include_bias=True)
        X = poly.transform(x)
        coeff = normal_regression(X, y)
        temp.append(np.log(np.linalg.norm(np.array(coeff))))
    lst.append(temp)
    l += 1
    sample_size.append(len(x))

for i in range(1, l+1):
    plt.plot(degrees, lst[i-1], label='Num of samples: ' + str(sample_size[i-1]))
plt.xlabel("Degree of the polynomial")
plt.ylabel("Log of L2 norm of coefficients")
plt.legend(loc='best')
plt.savefig('./images/q6plot.png')
plt.show()
Binary file added gif1.gif
Binary file added gif2.gif
Binary file added images/q5plot.png
Binary file added images/q6plot.png
Binary file added images/q8features.png
Binary file added images/q8samples.png
Binary file added line_plot.gif
1 change: 1 addition & 0 deletions linearRegression/__init__.py
@@ -0,0 +1 @@
