-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathStage 1.Rmd
117 lines (93 loc) · 2.56 KB
/
Stage 1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
---
title: "5310 Project"
author: "rzou3444 / 500709979"
date: "26/09/2020"
output: html_document
---
```{r setup, include=FALSE}
# Set `echo = TRUE` as the default option for every later chunk
# (knitr's `echo` option controls whether chunk source appears in the output).
knitr::opts_chunk$set(echo = TRUE)
```
```{r}
library(tidyr)
library(ggplot2 )
library(readr)
library(dplyr)
library(corrplot)
library(GGally)
```
```{r read}
# Load the raw data set. `col_names = FALSE` tells readr that the first row is
# data, not a header, so the column names are attached by hand afterwards.
HTRU_2 <- read_csv(file = "HTRU_2_UCI.csv", col_names = FALSE)
names(HTRU_2) <- c(
  "Mean_IP", "SD_IP", "EK_IP", "S_IP",
  "Mean_C", "SD_C", "EK_C", "S_C",
  "Class"
)
dim(HTRU_2) # 17898 candidates & 9 attributes
HTRU_2[1:5, ] # first 5 candidates
```
```{r}
# Clean the data: discard rows with missing values, switch to a plain
# data.frame, and turn the "Class" column into a factor of labels.
htru <- as.data.frame(drop_na(HTRU_2))
vapply(htru, anyNA, logical(1)) # every entry should be FALSE after drop_na()
htru$Class <- factor(htru$Class) # class labels become factor levels
vapply(htru, class, character(1)) # data type of each column
dim(htru)
str(htru)
summary(htru[, 1:8]) # summary of the 8 attributes (label column excluded)
```
```{r}
# Correlation between attributes.
# Computed on the cleaned `htru` rather than the raw `HTRU_2`, so rows with
# missing values (already dropped from `htru`) cannot turn coefficients into NA.
# cor() requires numeric columns, so the Class factor is converted back to its
# numeric labels on a copy; `htru` itself is left unchanged.
# NOTE(review): assumes the Class labels are numeric codes (e.g. 0/1) - confirm.
cor_input <- htru
cor_input$Class <- as.numeric(as.character(cor_input$Class))
correlation <- round(cor(cor_input), 2)
correlation
corrplot(correlation, method = "square", type = "upper")
# Result shows the attribute EK_IP has significant correlation with Class.
```
```{r}
# Split the cleaned data into the attribute columns and the class labels;
# both objects are reused by the plotting chunks further down.
htru.data <- htru[, -9, drop = FALSE] # the 8 attributes (column 9 removed)
classes <- factor(htru[, 9]) # class labels only
levels(htru$Class) # 2 levels
summary(htru$Class) # 1639 positive candidates & 16259 negative candidates
```
```{r}
# Count positive and negative candidates.
# `fill` is mapped to the bare column name: aes() looks names up in the data
# passed to ggplot(), whereas `htru$Class` bypasses that data argument and
# labels the legend "htru$Class".
ggplot(htru, aes(x = Class, fill = Class)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Numbers of Positive and Negative Candidates")
```
```{r}
# Box plots of the 8 attributes drawn side by side on one shared axis.
# Earlier variants, kept commented out for reference:
#par(mfrow=c(1,2))
#boxplot(htru[,1:7])
boxplot(htru[1:8], boxwex = 0.8, col = "blue")
```
```{r}
# Distribution of the 8 attributes: one histogram per panel of a 3 x 3 grid.
# par() returns the previous settings, which are restored afterwards so the
# grid layout does not leak into later base-graphics plots.
old_par <- par(mfrow = c(3, 3))
for (feature in names(htru)[1:8]) {
  # Label each panel with the plain column name instead of the default
  # "htru$<name>" expression text.
  hist(htru[[feature]], main = paste("Histogram of", feature), xlab = feature)
}
par(old_par)
```
```{r}
# Matrix of plots showing scatters of each feature pair, coloured by class.
# Colour is mapped to the `Class` column of `htru` itself rather than to the
# separate `classes` vector, so the colouring stays aligned with the plotted
# rows even if `htru` is later filtered or reordered.
# NOTE(review): `alpha = 0.1` inside aes() is treated as a mapped aesthetic,
# not a fixed transparency - confirm this is the intended appearance.
ggpairs(
  htru,
  aes(colour = Class, alpha = 0.1),
  lower = list(continuous = wrap("smooth", size = 0.1, alpha = 0.1)),
  upper = list(continuous = wrap("cor", size = 2))
)
```
```{r}
# Base-graphics scatterplot matrix of the 8 attributes.
# The point fill is picked by the integer code of each class label:
# first level -> "blue", second level -> "red".
pairs(
  htru.data,
  main = "Features of HTRU_2 Data",
  pch = 21,
  bg = c("blue", "red")[as.integer(classes)]
)
```
```{r}
# Inspect outliers: box plot of Mean_IP per class, with outlying points drawn
# in red using point shape 1.
ggplot(data = htru, mapping = aes(x = Class, y = Mean_IP)) +
  geom_boxplot(outlier.shape = 1, outlier.colour = "red")
```
```{r}
# Normalisation
# Min-max rescaling: maps the smallest value of `x` to 0 and the largest to 1.
#
# x: numeric vector. A data frame of numeric columns is also accepted, in which
#    case a single minimum/maximum over ALL of its values is used.
# Returns an object of the same shape as `x`.
# If every value is identical the range is zero; zeros are returned instead of
# the NaN that a 0/0 division would produce.
# Missing values are not removed: any NA in `x` makes the whole result NA.
normalise <- function(x) {
  bounds <- range(x)
  span <- bounds[2] - bounds[1]
  # isTRUE() keeps an NA span (NA present in `x`) out of the if() condition.
  if (isTRUE(span == 0)) {
    return(x - bounds[1])
  }
  (x - bounds[1]) / span
}
# Rescale each of the 8 attributes separately. Passing the whole data frame to
# normalise() would take one minimum/maximum across all columns and squash the
# attributes onto a shared scale; lapply() hands it one column at a time so
# every attribute spans [0, 1] on its own.
htru[1:8] <- lapply(htru[1:8], normalise)
summary(htru)
```
```{r}
# Write the processed data set to "htru.csv" without a row-name column
write.csv(x = htru, file = "htru.csv", row.names = FALSE)
```