-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathData Science's Toolboxes.R
158 lines (134 loc) · 6.95 KB
/
Data Science's Toolboxes.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# Toolbox ----
# 1. Getting help
?rnorm                 # open the help page for rnorm directly
help.search("rnorm")   # fuzzy-search installed help files (was misspelled "help.serach"); same as ??rnorm
??rnorm                # shorthand for help.search("rnorm")
args("rnorm")          # show a function's argument list; args() accepts a function or a character name
# 2. readLines: read text lines from a connection
con <- url("http://www.jhsph.edu", "r")
# Connection constructors: file, pipe, fifo, url, gzfile, bzfile, xzfile, unz, socketConnection
# file: description is a path to the file, a complete URL, "" (the default), or "clipboard"
# url: description is a complete URL, including scheme (http://, ftp:// or file://)
# gzfile: description is the path to a gzip-compressed file; it can also read uncompressed
#   files and those compressed by bzip2, xz or lzma
# Open modes:
#   "r"/"rt":   read, text mode (URLs can only be opened for reading)
#   "w"/"wt":   write, text mode
#   "a"/"at":   append, text mode
#   "r+"/"r+b": read and write
#   "w+"/"w+b": read and write, truncating the file initially
#   "a+"/"a+b": read and append
x <- readLines(con)
close(con)  # connections are a finite resource: always close when done (was missing)
head(x)
# 3. Getting and Cleaning Data content
# Sources: Excel, XML, JSON, MySQL, HDF5, Web
if (!require("RMySQL")) {
  install.packages("RMySQL")
}
library("RMySQL")
# Connect to the public UCSC genome browser MySQL server (read-only "genome" account)
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
# dbGetQuery needs a connection and a SQL statement (original call had no arguments)
result <- dbGetQuery(ucscDb, "show databases;")
dbDisconnect(ucscDb)  # variable name now matches the one assigned above (was "uscDb")
result
# 4. Merging data
# merge() joins on the given key columns; all = TRUE keeps non-matching rows (full outer join)
mergedData2 <- merge(reviews, solutions, by.x = "solution_id", by.y = "id", all = TRUE)
head(mergedData2[, 1:6], 3)
reviews[1, 1:6]
# 5. EDA
# Find relationships you didn't know about. EDA is usually not the final say.
# EDA alone should not be used for generalizing/predicting.
# Topics: exploratory graphs, plotting systems in R (base, lattice, ggplot2),
#   hierarchical clustering, k-means clustering, dimension reduction
# ggplot2:
if (!require("ggplot2")) {
  install.packages("ggplot2")
}
library("ggplot2")
# qplot, not "gplot": quick scatter plot of engine displacement vs highway mpg with a smoother
qplot(displ, hwy, data = mpg, geom = c("point", "smooth"))
# k-means clustering:
set.seed(1234)             # make the simulated clusters reproducible
par(mar = c(0, 0, 0, 0))   # mar: margins c(bottom, left, top, right)
x <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
y <- rnorm(12, mean = rep(c(1, 2, 1), each = 4), sd = 0.2)
plot(x, y, col = "blue", pch = 19, cex = 2)
text(x + 0.05, y + 0.05, label = as.character(1:12))
# 6. Statistical Inference
# sample -> population; the sampling scheme is important (Monte Carlo).
# Topics: asymptotics, bootstrapping, non-parametric tests, Bayesian statistics
# Bootstrapping:
B <- 1000                 # number of bootstrap resamples
n <- length(gmVol)        # gmVol: gray-matter volume data supplied elsewhere — TODO confirm source
# draw n * B observations with replacement; arrange as B rows of n (one resample per row)
resamples <- matrix(sample(gmVol, n * B, replace = TRUE), B, n)
# was assigned to "median" (masking stats::median) while later lines used "medians"
medians <- apply(resamples, 1, median)  # apply(X, MARGIN, FUN): MARGIN 1 = rows, 2 = columns
sd(medians)                             # bootstrap standard error of the sample median
quantile(medians, c(0.025, 0.975))      # 95% bootstrap confidence interval
#7. Regression Models
#Causal Analysis
#If X predicts Y, it doesn't mean that X causes Y. --> Confounding means that there may be other variables that cause the apparent correlation between the two variables of interest.
#Methods to minimize Confounding:Restriction, Matching, Randomization, stratification, adjustment, multivariate analysis
#Splines, Machine Learning Via regression, permutation tests, Weighted regression, Mixed Models(random intercepts)
#Three ways to deal with confounding: (1) fix a variable; (2) if you can't fix it, stratify samples; (3) if you can't fix or stratify a variable, randomize it
###probability sampling(sampling frame is required):
#simple random sampling: randomly pick out # of samples.(more easily implemented for natural and manufacturing populations)(unbiased and representative)
#stratified sampling: (1)group(stratify) samples on certain characteristics (grouping like cluster sampling, groups are called strata) (2)take SRS or systematic sampling in each stratum.
#systematic sampling: starting point is random, and then systematically take objects at a certain number apart. Easier to administer than simple random samples. A good approximation of a random sample — but biased if there is any periodic pattern in the ordering of the population.
#cluster sampling: More convenient than simple random sampling. Can lead to bias and non-representative samples if the clusters differ from each other.
#multi-stage sampling:any combination of random sampling. like use both stratified sampling and cluster sampling
###non-probability sampling includes convenience sampling and quota sampling(sampling frame isn't required)
#convenience sampling: biased such as self-selection but quick and cheap. Like Online poll, Survey your best friends, asking for volunteers at the mall.
#quota sampling: Quota sampling is the non probability version of stratified sampling. Stratified sampling utilizes a simple random sampling once the categories are created; quota sampling utilizes availability sampling.
# 8. Practical Machine Learning
# Topics: caret package, correlated predictors, prediction with regression,
#   prediction with trees, boosting, bagging, model blending, forecasting
# Correlated predictors:
for (pkg in c("caret", "kernlab")) {      # kernlab provides kernel methods and the spam data
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
  }
}
library("lattice")
library("caret")    # required packages: lattice & ggplot2
library("kernlab")
data(spam)          # dim(spam): 4601 * 58
# createDataPartition(y, times = 1, p = 0.5, list = TRUE, groups = min(5, length(y))):
#   creates a series of test/training partitions
#   times: number of partitions to create; p: percentage of data that goes to training
#   list: return a list (TRUE) or a matrix with floor(p * length(y)) rows and "times" columns
# Related helpers:
#   createResample(y, times = 10, list = TRUE): one or more bootstrap samples
#   createFolds(y, k = 10, list = TRUE, returnTrain = FALSE): split the data into k groups
#   createTimeSlices(y, initialWindow, horizon = 1, fixedWindow = TRUE): cross-validation
#     sample information for time series data
inTrain <- createDataPartition(y = spam$type, p = 0.75, list = FALSE)  # class(inTrain) = matrix
# spam$type has levels: nonspam, spam
training <- spam[inTrain, ]
testing <- spam[-inTrain, ]
corMat <- abs(cor(training[, -58]))     # |correlation| between every pair of the 57 predictors
diag(corMat) <- 0                       # zero the self-correlations so they are not flagged
which(corMat > 0.8, arr.ind = TRUE)
# arr.ind: logical; should array (row, col) indices be returned when x is an array
#9. Building Data Products
#R packages: devtools, roxygen2, testthat
#Marketing report: rCharts
#slidify
#shiny: interactive web app
##################################################
#Git: version control tools
#set up Git Bash:
#git config --global user.name "aeoluseros"
#git config --global user.email "[email protected]"
#git config --list
#mkdir ~/R-programming
#cd ~/R-programming
#git init
#git remote add origin https://github.com/aeoluseros/R-programming.git
# make a local copy:
#git clone https://github.com/aeoluseros/R-programming.git
#update local repo:
#git add . #adds all new files
#git add -u #update all changed or deleted files
#git add -A #above two
#git commit -m "message"
#push changes to GitHub:
#git push
#sometimes we need to "git pull origin master" first.
#sometimes we don't want to edit a version. we could create a branch:
#git checkout -b branchname
#to see which branch you are on, type:
#git branch
#to switch back to the master branch, type: