# week4_assi2_sol2.R

# # -*- coding: utf-8 -*-
# """Week4 Assi2 Sol2.ipynb
# 
# Automatically generated by Colaboratory.
# 
# Original file is located at
#     https://colab.research.google.com/drive/1sloDXcEBLZDNisk0usmXCOtsFb1c8DgI
# """
###########################################################################
## Week-4, Homework-2, Sol-2 
## Sreya Dhar 
## Created: Feb 25, 2021
## Edited: Mar 03, 2021
###########################################################################

# Session setup.
# The workspace is deliberately left untouched and the working directory is not
# changed: `rm(list = ls())` would wipe whatever the caller already has loaded,
# and `setwd()` to a machine-specific absolute path aborts the script on every
# other computer. Nothing in this script reads or writes local files, so neither
# call is required. Run the script from the project directory instead.

## installing all the libraries in R kernel

# install.packages("arules")
# install.packages("Hmisc")
# install.packages("funModeling")
# install.packages("PerformanceAnalytics")
# install.packages("corrplot")
# install.packages("MASS")
# install.packages("ggplot2")
# install.packages("dplyr")
# install.packages("tidyverse")
# install.packages("tidyr")
# install.packages("repr")
# install.packages("ggstatsplot")
# install.packages("psych")
# install.packages("gplots")
# install.packages("rsample")

library(psych)
library("gplots")
library(rsample)

## importing the libraries in R kernel
library(arules)
library(MASS)
library(Hmisc)
library(ggplot2)
library(dplyr)
library(funModeling) 
library(tidyverse)
library(tidyr)
library(PerformanceAnalytics)
library(corrplot)
library(repr)
# library(ggstatsplot)
library(psych)
library("gplots")
library(rsample)
library(glmnet)
library(arulesViz)
library(colorspace)

# """**Data Processing or Exploratory Data Analysis on 'Boston' Dataset**"""

## Load the Boston housing data shipped with MASS and print a first profile.
data("Boston", package = "MASS")
Boston %>% head()

## Compact column-by-column overview of the Boston data (types + first values).
Boston %>% glimpse()

## Numeric profiling table from funModeling.
Boston %>% profiling_num()

## Storage class of every column.
vapply(Boston, class, character(1))

## Per-variable description of the whole dataset.
Boston %>% describe()

## Per-column status table from funModeling.
Boston %>% status()

## Base-R five-number summary of every column.
Boston %>% summary()

## Working copy that the later steps discretise; `Boston` itself stays numeric.
Boston_C <- Boston

#a

# dim(Boston_C)
## Histogram of every numeric variable (Boston_C is still fully numeric here).
options(repr.plot.width = 8, repr.plot.height = 8, repr.plot.res = 200)
plot_num(Boston_C)

## Correlation chart: pairwise correlations combined with histograms and
## scatter plots of the individual features.
options(repr.plot.width = 10, repr.plot.height = 10, repr.plot.res = 200)
chart.Correlation(Boston_C, histogram = TRUE, pch = 15)

## NOTE(review): this downloads and executes remote R code over plain HTTP on
## every run. It presumably defines rquery.cormat(), which is used just below.
## Consider vendoring a reviewed copy of the file next to this script.
source("http://www.sthda.com/upload/rquery_cormat.r")

## Heatmap of the lower triangle of the correlation matrix.
options(repr.plot.width = 5, repr.plot.height = 5, repr.plot.res = 200)
rquery.cormat(Boston_C, graphType = "heatmap", type = "lower")

## Clustered heatmap of the z-scored observations.
## (The scaled copy was previously called `Auto_S`, a leftover name from a
## different dataset; it holds Boston data, so it is named accordingly.)
options(repr.plot.width = 7, repr.plot.height = 7, repr.plot.res = 200)
Boston_Z <- as.data.frame(scale(Boston_C, center = TRUE, scale = TRUE))
heatmap.2(as.matrix(Boston_Z), scale = "none", col = bluered(100),
          trace = "none", density.info = "none")

## correlation and p values between any two variables
# rquery.cormat(Boston_C, type="flatten", graph=FALSE)

head(Boston_C)

summary(Boston_C)
# 
## Discretise every Boston variable into an ordered factor so that the data
## frame can later be coerced to arules transactions.

# Bin a numeric vector at its own quantiles.
#
# Args:
#   x:      numeric vector to discretise.
#   probs:  increasing probabilities handed to quantile(); start at 0 and end
#           at 1 so that the bins cover the whole observed range.
#   labels: one label per bin, i.e. length(probs) - 1 of them.
# Returns:
#   An ordered factor of the same length as `x`.
bin_by_quantile <- function(x, probs, labels) {
  cut(
    x,
    breaks = quantile(x, probs = probs),
    labels = labels,
    # cut() builds left-open intervals, so without this flag the minimum of
    # `x` falls outside the first bin and silently becomes NA.
    include.lowest = TRUE,
    ordered_result = TRUE
  )
}

Boston_C$crim <- bin_by_quantile(
  Boston_C$crim, c(0, 0.25, 0.60, 0.85, 1),
  c("Low", "Middle", "High", "critical")
)
unique(Boston_C$crim)

Boston_C$zn <- bin_by_quantile(
  Boston_C$zn, c(0, 0.75, 0.85, 0.95, 1),
  c("no_zn", "acceptable", "moderate", "critical")
)

Boston_C$indus <- bin_by_quantile(
  Boston_C$indus, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "moderate", "High", "Excess")
)

# chas is a 0/1 indicator, so it is only relabelled, not binned.
Boston_C$chas <- ordered(Boston_C$chas, labels = c("off-river", "near-river"))

Boston_C$nox <- bin_by_quantile(
  Boston_C$nox, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "moderate", "High", "Excess")
)
unique(Boston_C$nox)

# rm uses fixed cut points (rooms per dwelling) instead of quantiles.
Boston_C$rm <- cut(
  Boston_C$rm,
  breaks = c(3, 5, 7, 9),
  labels = c("small", "Moderate", "big"),
  ordered_result = TRUE
)

Boston_C$black <- bin_by_quantile(
  Boston_C$black, c(0, 0.02, 0.08, 0.25, 1),
  c("Low", "moderate", "High", "Excess")
)

# age uses fixed cut points as well.
Boston_C$age <- cut(
  Boston_C$age,
  breaks = c(0, 25, 45, 65, 100),
  labels = c("Young", "Middle-aged", "Senior", "Elderly"),
  ordered_result = TRUE
)

Boston_C$dis <- bin_by_quantile(
  Boston_C$dis, c(0, 0.55, 0.9, 1),
  c("Low", "moderate", "High")
)
unique(Boston_C$dis)

# The top break is the maximum (prob = 1) rather than the 0.8 quantile used
# before: any value above the last break would turn into NA. On the stock
# Boston data the 0.8 quantile of rad appears to equal the maximum already, so
# the bins should be unchanged -- TODO confirm.
Boston_C$rad <- bin_by_quantile(
  Boston_C$rad, c(0, 0.25, 0.5, 1),
  c("Low", "moderate", "High")
)
unique(Boston_C$rad)

Boston_C$tax <- bin_by_quantile(
  Boston_C$tax, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "Middle", "High", "Excess")
)
unique(Boston_C$tax)

Boston_C$ptratio <- bin_by_quantile(
  Boston_C$ptratio, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "moderate", "High", "Excess")
)
unique(Boston_C$ptratio)


Boston_C$lstat <- bin_by_quantile(
  Boston_C$lstat, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "moderate", "High", "Excess")
)

unique(Boston_C$lstat)

Boston_C$medv <- bin_by_quantile(
  Boston_C$medv, c(0, 0.25, 0.65, 0.9, 1),
  c("Low", "moderate", "High", "Excess")
)
unique(Boston_C$medv)


### Replace any NA left in a binned column with that column's default label ###
# Column name -> label used for its missing entries, in processing order.
na_defaults <- c(
  crim = "Low",
  age = "Young",
  zn = "no_zn",
  indus = "Low",
  nox = "Low",
  black = "Low",
  dis = "Low",
  rad = "Low",
  ptratio = "Low",
  medv = "Low",
  lstat = "Low",
  rm = "small",
  tax = "Low"
)

for (column_name in names(na_defaults)) {
  Boston_C[[column_name]] <- replace_na(
    Boston_C[[column_name]],
    na_defaults[[column_name]]
  )
}

### Convert every binned column to a plain (unordered) factor ###
# Going through as.character() drops the "ordered" class and rebuilds the
# levels from the labels actually present. crim previously went through a bare
# as.factor(), which returns an existing factor unchanged, so it was the only
# column left ordered; it is now treated like all the others.
binned_cols <- c(
  "crim", "zn", "indus", "nox", "black", "dis", "rad",
  "ptratio", "medv", "lstat", "age", "tax", "rm"
)
Boston_C[binned_cols] <- lapply(
  Boston_C[binned_cols],
  function(column) as.factor(as.character(column))
)

# Drop chas and rm before mining. They are columns 4 and 6 of MASS::Boston;
# selecting them by name keeps the step correct if the column order changes and
# makes it safe to re-run (a second positional drop would remove two more
# columns).
Boston_C <- Boston_C[, !(names(Boston_C) %in% c("chas", "rm"))]
summary(Boston_C)

# Coerce the all-factor data frame to an arules transactions object and take a
# first look at it: the first 20 rows, then a random sample of 100 rows.
Boston_trans <- as(Boston_C, "transactions")
summary(Boston_trans)
image(Boston_trans[1:20])
image(sample(Boston_trans, 100))

# (b) Item frequencies and association-rule mining.

# Relative frequency of every item with support of at least 0.3.
# `cex.names` is spelled out in full; the earlier `cex.name` only worked through
# partial argument matching.
options(repr.plot.width = 12, repr.plot.height = 8, repr.plot.res = 200)
itemFrequencyPlot(Boston_trans, support = 0.3, type = "relative",
                  cex.names = 0.8)

# options(repr.plot.width=12, repr.plot.height=8, repr.plot.res = 200)
# itemFrequencyPlot(Boston_trans,support=0.2,  cex.name= 0.8)

# Apriori with minimum support 0.02 and minimum confidence 0.7.
rules <- apriori(Boston_trans,
                 parameter = list(support = 0.02, confidence = 0.7))
# summary(rules)
rules

# Top five rules by lift, then by confidence.
inspect(head(sort(rules, by = "lift"), n = 5))
inspect(head(sort(rules, by = "confidence"), n = 5))



# (c) A student is interested in a low crime area, but wants to be as close to
# the city as possible (as measured by "dis"). What can you advise on this
# matter through the mining of association rules?

# Keep rules whose antecedent contains dis=Low, whose consequent is crim=Low
# and whose lift exceeds 2. All three conditions must sit inside the single
# `subset` expression: `lift>2` was previously passed as a separate argument,
# which subset() does not use as a filter.
rulescrimLow <- subset(
  rules,
  subset = lhs %in% "dis=Low" & rhs %in% "crim=Low" & lift > 2
)
rulescrimLow

inspect(head(sort(rulescrimLow, by = "support"), n = 5))
plot(rulescrimLow)
plot(rulescrimLow, shading = "order", control = list(main = "Two-key plot"))
# plot(rulescrimLow, measure=c("lift", "confidence"), control=list(reorder=TRUE, col=sequential_hcl(200)))
# plot(rulescrimLow,  measure="support", control=list(col=sequential_hcl(100)))



# (d) A family is moving to the area, and has made schooling a priority. They
# want schools with low pupil-teacher ratios. What can you advise on this
# matter through the mining of association rules?

# Keep rules that conclude ptratio=Low and have lift above 2. The lift
# condition belongs inside the `subset` expression; passed as a separate
# argument (as before) it is not applied as a filter.
rulesptratioLow <- subset(rules, subset = rhs %in% "ptratio=Low" & lift > 2)
rulesptratioLow

inspect(head(sort(rulesptratioLow, by = "support"), n = 10))
plot(rulesptratioLow, measure = c("support", "lift"), shading = "confidence")
plot(rulesptratioLow, shading = "order",
     control = list(main = "Two-key plot"), color = 'black')

#########################
### Extra Credits    ####
#########################

### regression model ####
# Min-max scale the numeric Boston data to [0, 1] before regression.
# chas and rm (columns 4 and 6 of MASS::Boston) are left out, mirroring the
# columns dropped before rule mining; they are selected by name so the step
# does not depend on column order.
Boston_R <- Boston[, !(names(Boston) %in% c("chas", "rm"))]

# Per-column range. The vectors get their own names instead of `max`/`min`,
# which would shadow the base functions for the rest of the session.
col_min <- vapply(Boston_R, min, numeric(1))
col_max <- vapply(Boston_R, max, numeric(1))
Boston_S <- as.data.frame(
  scale(Boston_R, center = col_min, scale = col_max - col_min)
)

head(Boston_S)

## Split the scaled data into training (75%) and test (25%) sets with rsample.
set.seed(4444) # fixed seed so the split is reproducible
Boston_r <- initial_split(Boston_S, prop = 0.75)
data_train <- training(Boston_r)
data_test <- testing(Boston_r)

## Ordinary least squares: ptratio regressed on every other column.
reg_fit <- lm(ptratio ~ ., data = data_train)
summary(reg_fit)

## Standard lm diagnostic plots on a 2 x 2 grid.
## (The stray empty argument in `plot(reg_fit, )` has been removed.)
options(repr.plot.width = 6, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(2, 2))
plot(reg_fit)

## Cook's distance for every training observation.
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 1)) # reset the 2 x 2 grid so this plot gets the full device
plot(cooks.distance(reg_fit), pch = 16, col = "blue", ylim = c(0, 0.06))

anova(reg_fit)["Residuals", "Mean Sq"] # residual mean square from the ANOVA table
sigma(reg_fit) # residual standard error

## Mean squared error on the training and on the held-out data.
predict_train <- predict(reg_fit, data_train)
predict_test <- predict(reg_fit, data_test)
mse_train <- mean((predict_train - data_train$ptratio)^2)
mse_test <- mean((predict_test - data_test$ptratio)^2)
mse_train
mse_test

## Observed vs. predicted ptratio, train (left) and test (right); the dashed
## line is the identity y = x.
options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)
par(mfrow = c(1, 2))

plot(data_train$ptratio, predict_train, col = "blue",
     xlab = "Original", ylab = "Predicted",
     xlim = c(0, 1), ylim = c(0, 1), main = "Prediction on train set")
abline(a = 0, b = 1, lty = 2)

plot(data_test$ptratio, predict_test, col = "red",
     xlab = "Original", ylab = "Predicted",
     xlim = c(0, 1), ylim = c(0, 1), main = "Prediction on test set")
abline(a = 0, b = 1, lty = 2)


###############################################################################################
################################ Ridge Regression #############################################
###############################################################################################

## Design matrices for glmnet, built from the train/test split made for the
## linear model. Both "train" and "test" were previously taken from the full
## Boston_S, so the test metrics were computed on the training rows and the
## row counts no longer matched data_train / data_test.
predictor_cols <- setdiff(names(data_train), "ptratio")
X_train <- as.matrix(data_train[, predictor_cols])
Y_train <- as.matrix(data_train[["ptratio"]])
X_test <- as.matrix(data_test[, predictor_cols])
Y_test <- as.matrix(data_test[["ptratio"]])

## Lambda grid from 10^2 down to 10^-3.
lam_ridge <- 10^seq(2, -3, by = -.1)
## alpha = 0 selects the ridge penalty. `nlambda` is not passed because an
## explicit `lambda` sequence is supplied.
ridge_mod <- glmnet(X_train, Y_train, alpha = 0, family = 'gaussian',
                    lambda = lam_ridge)

summary(ridge_mod)

ridge_mod$dev.ratio

ridge_mod$lambda

## Coefficient paths against log(lambda) and against the L2 norm.
options(repr.plot.width = 12, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(1, 2))
plot(ridge_mod, xvar = "lambda", ylab = "Standardised coefficients",
     label = TRUE)
plot(ridge_mod, ylab = "Standardised coefficients", xlab = "L2 norm",
     label = TRUE)
#plot(ridge_mod$lambda, ridge_mod$coefficients , ylab="Standardised coefficients", xlab="lambda")

# Cross-validation over the same grid to find the optimal lambda value.
cvglm_ridge <- cv.glmnet(X_train, Y_train, alpha = 0, lambda = lam_ridge)

options(repr.plot.width = 8, repr.plot.height = 4, repr.plot.res = 200)

# plot(cvglm_ridge, xvar="lambda",  ylab="Standardised coefficients", label=TRUE)
opt_lam <- cvglm_ridge$lambda.min
opt_lam

par(mfrow = c(1, 2))
plot(cvglm_ridge, ylab = "MSE from CV in Ridge")
# `lwd` (line width) was misspelled `ldw`, which is not a graphical parameter.
abline(v = log(opt_lam), col = "green", lty = 2, lwd = 3)

# Refit the ridge model at the selected lambda.
ridge_best <- glmnet(X_train, Y_train, alpha = 0, lambda = opt_lam)
# Print the fitted ridge coefficients (without the intercept).
ridge_best$beta

# Compute MSE and R^2 (in percent) from observed and predicted values.
#
# Args:
#   original:  observed responses; a numeric vector or a one-column matrix.
#   predicted: predictions with the same number of elements as `original`.
# Returns:
#   A one-row data.frame with columns `MSE` and `Rsquare_percent`.
eval_results <- function(original, predicted) {
  # Guard against silent recycling when the two inputs disagree in size.
  stopifnot(length(original) == length(predicted), length(original) > 0)

  SSE <- sum((predicted - original)^2)
  SST <- sum((original - mean(original))^2)
  R_square <- (1 - SSE / SST) * 100 ## in percentage
  # NROW() also works for plain vectors, where nrow() returns NULL and the
  # data.frame() call below would fail.
  MSE <- SSE / NROW(original)

  # Model performance metrics
  data.frame(
    MSE = MSE,
    Rsquare_percent = R_square
  )
}

# Ridge coefficients at the selected lambda, intercept included. The previous
# index `0:length(beta)+1` evaluates to 1:(p + 1), i.e. every row, so the single
# coefficient column is taken directly.
ridge_coef <- predict(ridge_best, type = "coefficients", s = opt_lam)[, 1]
# Print the non-zero coefficients.
as.data.frame(ridge_coef[ridge_coef != 0])

# Prediction and evaluation on train data.
ridge_pred_train <- predict(ridge_best, s = opt_lam, newx = X_train)
# MSE and R2 on train data.
eval_results(Y_train, ridge_pred_train)
# RMSE against Y_train, the response that belongs to X_train. Comparing with
# data_train$ptratio is only valid when X_train was built from data_train.
rmse_ridge_train <- sqrt(mean((ridge_pred_train - Y_train)^2))
rmse_ridge_train

# Prediction and evaluation on test data.
ridge_pred_test <- predict(ridge_best, s = opt_lam, newx = X_test)
# MSE and R2 on test data.
eval_results(Y_test, ridge_pred_test)

rmse_ridge_test <- sqrt(mean((ridge_pred_test - Y_test)^2))
rmse_ridge_test


###################################################################################################
################################ Lasso Regression #############################################
######################################################################################################

## Lambda grid from 10^2 down to 10^-4.
lam_lasso <- 10^seq(2, -4, by = -.1)

# Setting alpha = 1 implements lasso regression
lasso_mod <- glmnet(X_train, Y_train, alpha = 1, lambda = lam_lasso)
sum_lasso <- summary(lasso_mod)

sum_lasso

## Coefficient paths against log(lambda) and against the default x-axis.
options(repr.plot.width = 12, repr.plot.height = 6, repr.plot.res = 200)
par(mfrow = c(1, 2))
plot(lasso_mod, xvar = "lambda", label = TRUE, cex = 5)
plot(lasso_mod, label = TRUE)

## Cross-validation over the same grid.
cvglm_lasso <- cv.glmnet(X_train, Y_train, alpha = 1, lambda = lam_lasso)
options(repr.plot.width = 4, repr.plot.height = 4, repr.plot.res = 200)

# Best lambda selection
opt_las <- cvglm_lasso$lambda.min
opt_las

par(mfrow = c(1, 1))
plot(cvglm_lasso, ylab = "MSE from CV in Lasso")
# `lwd` (line width) was misspelled `ldw`, which is not a graphical parameter.
abline(v = log(opt_las), col = "green", lty = 2, lwd = 3)

# Refit the lasso model at the selected lambda.
lasso_best <- glmnet(X_train, Y_train, alpha = 1, lambda = opt_las)
# Print the fitted lasso coefficients (without the intercept).
lasso_best$beta

# Lasso coefficients at the selected lambda; the first row (the intercept) is
# dropped, leaving one entry per predictor.
lasso_coef <- predict(lasso_best, type = "coefficients", s = opt_las)[-1, ]
# Show only the predictors the lasso kept (non-zero coefficients).
as.data.frame(lasso_coef[lasso_coef != 0])

# Fitted values and metrics (MSE, R2) on the training matrices.
lasso_pred_train <- predict(lasso_best, newx = X_train, s = opt_las)
eval_results(Y_train, lasso_pred_train)


# Fitted values and metrics (MSE, R2) on the test matrices.
lasso_pred_test <- predict(lasso_best, newx = X_test, s = opt_las)
eval_results(Y_test, lasso_pred_test)

### end ###