## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
## Create and clean a Corpus from tweets for analysis.
##
## By Matteo DOT Redaelli AT gmail DOT com
## http://www.redaelli.org/matteo/
## 2011-08-01
##
## Ideas adapted from
## http://heuristically.wordpress.com/2011/04/08/text-data-mining-twitter-r/
## and
## Earl F Glynn, Franklin Center for Government & Public Integrity
library(tm)        # Corpus, tm_map, stopwords, removeWords
library(methods)   # ensure S4 methods are available under Rscript
## Collapse runs of dots, commas and semicolons into a single space.
RemoveDots <- function(tweet) {
  gsub("[.,;]+", " ", tweet)
}
## Strip links: match "http" (which also covers "https") up to the
## next whitespace.
RemoveLinks <- function(tweet) {
  gsub("http[^[:space:]]+", "", tweet)
}
## Strip @mentions.
RemoveAtPeople <- function(tweet) {
  gsub("@\\w+", "", tweet)
}
## Apply all cleaning steps; since gsub is vectorized, this works on a
## single tweet or on a whole character vector of tweets.
CleanTweet <- function(tweet) {
  RemoveDots(RemoveAtPeople(RemoveLinks(tweet)))
}
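## Illustrative example (not part of the pipeline):
##   CleanTweet("RT @user read http://t.co/abc nice.")
## drops the link and the @mention and replaces the dot with a space,
## leaving "RT  read  nice " (extra whitespace is harmless downstream).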
## The search term: first trailing command line argument, defaulting
## to "twitter". It is also removed as a stopword below.
args <- commandArgs(trailingOnly = TRUE)
q <- if (length(args) >= 1) args[1] else "twitter"

## fetch.Rdata is expected to provide the character vector `tweets`
## (produced by the companion fetch script).
load("fetch.Rdata")
tweets <- CleanTweet(tweets)
## build a corpus from the cleaned tweets
mydata.corpus <- Corpus(VectorSource(tweets))
## make each letter lowercase; content_transformer() wraps tolower so
## the result stays a valid tm corpus with current versions of tm
mydata.corpus <- tm_map(mydata.corpus, content_transformer(tolower))
## remove punctuation
mydata.corpus <- tm_map(mydata.corpus, removePunctuation)
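## Optional sanity check (illustrative, not in the original pipeline):
##   inspect(mydata.corpus[1:3])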
## remove generic stopwords plus the search term itself (it occurs in
## every tweet, so it carries no information)
stopwords_1 <- c(stopwords('english'),
                 stopwords('italian'),
                 q)
mydata.corpus <- tm_map(mydata.corpus, removeWords, stopwords_1)
## a second batch of stopwords, split into a separate call, presumably
## to keep the single pattern removeWords builds from growing too large
stopwords_2 <- c(stopwords('spanish'),
                 stopwords('portuguese'))
mydata.corpus <- tm_map(mydata.corpus, removeWords, stopwords_2)
save(mydata.corpus, file="normalize.Rdata")
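## Downstream sketch (assumption: a separate analysis script; the calls
## below are illustrative tm usage, not part of this file's job):
##   load("normalize.Rdata")
##   tdm <- TermDocumentMatrix(mydata.corpus)
##   findFreqTerms(tdm, lowfreq = 10)   # terms occurring at least 10 times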