-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelpers.R
99 lines (83 loc) · 3.51 KB
/
helpers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Load the published articles and attach author information.
#
# Reads three header-less CSV files relative to the working directory:
#   data/articles.csv, data/profile_article.csv, data/profiles.csv
#
# Returns a data frame with one row per article id and the columns
# id, title, content, published_date, section_id, profile_id, author_name.
# Only rows with published == 1 and more than 50 space-separated tokens of
# tag-stripped content are kept.
readData <- function() {
  # library() stops right away when dplyr is not installed, whereas require()
  # only warns and lets the function fail later with an unrelated error.
  # dplyr is still attached, so code that relies on that keeps working.
  library("dplyr")

  # Read the raw articles; the file has no header row.
  articles <- read.csv("data/articles.csv", header = FALSE)
  names(articles) <- c(
    "id", "title", "content", "clicks", "created_date", "published",
    "published_date", "updated_date", "legacy_id", "position", "issue_id",
    "section_id"
  )
  articles <- dplyr::filter(articles, published == 1)
  articles <- articles[
    , c("id", "title", "content", "published_date", "section_id")
  ]

  # Normalise column types.
  articles$title <- as.character(articles$title)
  articles$content <- as.character(articles$content)
  articles$id <- as.numeric(articles$id)
  articles$published_date <- as.Date(
    articles$published_date,
    format = "%Y-%m-%d %H:%M:%S"
  )

  # Strip HTML tags. gsub() is vectorised, so no per-element loop is needed
  # and an empty input stays a character vector.
  articles$content <- gsub("<.*?>", "", articles$content)

  # Drop short articles: keep those with more than 50 space-separated tokens.
  word.counts <- lengths(strsplit(articles$content, " "))
  articles <- articles[word.counts > 50, ]

  # Link authors to their articles.
  profile.article <- read.csv(
    "data/profile_article.csv",
    header = FALSE
  )[, c(2, 3)]
  names(profile.article) <- c("article_id", "profile_id")
  profiles <- read.csv("data/profiles.csv", header = FALSE)[, c(1, 3)]
  names(profiles) <- c("profile_id", "author_name")
  profile.article <- dplyr::left_join(
    profile.article, profiles,
    by = "profile_id"
  )
  names(profile.article)[1] <- "id"
  articles <- dplyr::left_join(articles, profile.article, by = "id")

  # An article with several authors produces several rows; keep the first.
  articles[!duplicated(articles$id), ]
}
# Score new text against a fitted topic model.
#
# content:     text to classify; it becomes the `content` column of a data
#              frame whose `id` column is fixed to 10000.
# topic.model: fitted model object accepted by posterior().
#
# The text is lower-cased, stripped of punctuation, filtered against the
# "english" and "SMART" stop word lists and stemmed before a document-term
# matrix is built from it.
#
# Returns the `topics` element of posterior(topic.model, <document-term matrix>).
classifyNewArticle <- function(content, topic.model) {
  doc.frame <- data.frame(id = 10000, content = content)
  tabular.reader <- readTabular(mapping = list(content = "content", id = "id"))
  corpus <- VCorpus(
    DataframeSource(doc.frame),
    readerControl = list(reader = tabular.reader)
  )

  # Text clean-up, applied in the same order as before.
  stop.list <- c(stopwords("english"), stopwords("SMART"))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stop.list)
  corpus <- tm_map(corpus, stemDocument, lazy = TRUE)

  doc.terms <- DocumentTermMatrix(corpus)
  posterior(topic.model, doc.terms)$topics
}
# Return the highest-weighted topics for a piece of text.
#
# content:     text to classify, forwarded to classifyNewArticle().
# topic.model: fitted topic model, forwarded to classifyNewArticle().
#
# Returns a data frame with the columns Topic.ID (position of the weight in
# the flattened probability matrix) and Weight, sorted by descending Weight
# and limited to at most three rows.
classifyResult <- function(content, topic.model) {
  topic.probs <- classifyNewArticle(content, topic.model)
  weights <- as.numeric(t(topic.probs))

  # seq_along() copes with an empty result, where seq(1, 0, 1) would error.
  ranked <- data.frame(
    Topic.ID = as.numeric(seq_along(weights)),
    Weight = weights
  )
  ranked <- ranked[order(-ranked$Weight), ]

  # Clamp to the rows that exist: indexing 1:3 on a shorter table would pad
  # the result with all-NA rows.
  ranked[seq_len(min(3L, nrow(ranked))), ]
}
# Extract the numeric tokens embedded in one or more strings.
#
# str: character vector to scan.
#
# Every run of digits, commas and dots (optionally preceded by an opening
# parenthesis) is pulled out and passed through as.numeric(). Matches from
# all input elements are flattened into a single numeric vector; tokens that
# as.numeric() cannot parse become NA.
toNumber <- function(str) {
  match.positions <- gregexpr('\\(?[0-9,.]+', str)
  tokens <- regmatches(str, match.positions)
  as.numeric(unlist(tokens))
}
# Build a table of topic assignments with one row per document.
#
# topic.model: fitted model object accepted by topics().
# topic.num:   second argument forwarded to topics().
#
# The result of topics() is converted to a data frame and transposed; the
# row names of the transposed table are then parsed with toNumber() into an
# integer `id` column, and the row names themselves are cleared.
get.articles.topics <- function(topic.model, topic.num) {
  assignments <- data.frame(t(data.frame(topics(topic.model, topic.num))))

  # Derive the ids from the row labels before the labels are discarded.
  row.labels <- rownames(assignments)
  assignments$id <- as.integer(sapply(row.labels, toNumber))
  rownames(assignments) <- NULL

  assignments
}
# Look up the topic assignments of a single article.
#
# topic.model: fitted model, forwarded to get.articles.topics() with a topic
#              count of 3.
# article.id:  value matched against the `id` column.
#
# Returns the matching row(s) without the id column; the remaining columns
# are renamed "1st", "2nd" and "3rd".
get.article.topic <- function(topic.model, article.id) {
  all.assignments <- get.articles.topics(topic.model, 3)
  matched <- dplyr::filter(all.assignments, id == article.id)
  ranked <- select(matched, -id)
  names(ranked) <- c("1st", "2nd", "3rd")
  ranked
}
# Fetch terms from a topic model.
#
# topic.model: model object accepted by terms().
# term.num:    second argument forwarded to terms().
#
# Returns whatever terms(topic.model, term.num) returns. The value is
# returned directly rather than through an assignment to an unused local,
# so it is visible (auto-printed) at the console.
get.topic.terms <- function(topic.model, term.num) {
  terms(topic.model, term.num)
}
# Attach topic assignments to an articles table.
#
# articles:    data frame with an `id` column.
# topic.model: fitted model, forwarded to get.articles.topics().
# topic.num:   topic count, forwarded to get.articles.topics().
#
# Returns the inner join of `articles` and the topic table on `id`, so rows
# without a match on either side are dropped.
join.articles.with.topics <- function(articles, topic.model, topic.num) {
  dplyr::inner_join(
    articles,
    get.articles.topics(topic.model, topic.num),
    by = "id"
  )
}
# Convert an epoch timestamp to a Date.
#
# seconds: numeric timestamp counted from 1970-01-01. The value is divided by
#          1000 before conversion, i.e. it is treated as milliseconds despite
#          the parameter name.
#
# Returns a Date derived from the GMT timestamp.
convertSecondsToDate <- function(seconds) {
  epoch.seconds <- seconds / 1000
  stamp <- as.POSIXct(epoch.seconds, origin = "1970-01-01", tz = "GMT")
  as.Date(stamp)
}