
Commit c905062

R script for text mining abstracts
Analyzes topics for abstracts from the AMiner dataset using Latent Dirichlet Allocation. I then ran a quasi-experiment to see whether articles with different abstract topics have different citation counts.
1 parent d170295 commit c905062

1 file changed (+85, −0)

AbstractTopicAndCitations.R

library(tm)
library(topicmodels)
library(slam)
library(ggplot2)
library(tidyverse)
library(tidytext)

citation_data <- read.csv("../AMiner.csv")
head(citation_data)
str(citation_data)
summary(citation_data)

# Remove duplicate author rows: the original parsing of the data stored a new
# entry for every author on a paper
citation_data <- citation_data[!duplicated(citation_data$author), ]

# Now remove every entry that does not have an abstract
citation_data <- citation_data[!(is.na(citation_data$abstract) | citation_data$abstract == ""), ]

# Since we are only interested in citation counts and abstracts, keep just
# those two columns, then work with a random sample of 100,000 papers.
# set.seed() makes the sample reproducible (the seed value is arbitrary).
citation_data <- citation_data[, c('citations', 'abstract')]
set.seed(42)
citation_data <- citation_data[sample(nrow(citation_data), 100000), ]

# Isolate the abstract texts and prepare them for text processing
abstracts <- Corpus(VectorSource(citation_data$abstract))

# Clean all the abstracts: collapse whitespace, strip punctuation, lowercase,
# remove English stopwords, and stem
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords('en'))
  corpus <- tm_map(corpus, stemDocument, language = "english")
  return(corpus)
}

clean_abstracts <- clean_corpus(abstracts)

# Create the DTM (wordLengths is the current tm name for the old
# minWordLength control option)
abstracts_dtm <- DocumentTermMatrix(clean_abstracts, control = list(wordLengths = c(1, Inf)))
abstracts_dtm

# Next, we will use the tf-idf metric to get rid of terms that are unlikely
# to be good distinguishing words
term_tfidf <- tapply(abstracts_dtm$v / row_sums(abstracts_dtm)[abstracts_dtm$i],
                     abstracts_dtm$j, mean) *
  log2(nDocs(abstracts_dtm) / col_sums(abstracts_dtm > 0))
abstracts_dtm <- abstracts_dtm[, term_tfidf >= 0.1]
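# In words: for each term, average (over the documents containing it) the
# term's share of that document's tokens, then multiply by
# log2(nDocs / document frequency). Terms with mean tf-idf below the 0.1
# cutoff are dropped; 0.1 is a heuristic, not a tuned value.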

# Drop any documents (i.e. rows) that no longer have any entries, and keep
# citation_data aligned with the DTM
ui <- unique(abstracts_dtm$i)
abstracts_dtm <- abstracts_dtm[ui, ]
citation_data <- citation_data[ui, ]

# Now fit an LDA model. We use 10 topics; this choice is largely arbitrary
# on my part
abstracts_lda <- LDA(abstracts_dtm, k = 10)
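
# Since k = 10 is arbitrary, one optional sanity check (a sketch, not part of
# the main pipeline) is to compare perplexity across a few candidate values of
# k; lower is better. Refitting LDA once per k is slow on a corpus this size,
# hence the if (FALSE) guard.
if (FALSE) {
  candidate_k <- c(5, 10, 20)
  sapply(candidate_k, function(k)
    perplexity(LDA(abstracts_dtm, k = k), abstracts_dtm))
}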

# Per-topic word probabilities (beta) in tidy form. top_n() needs an explicit
# weight column and a grouping to return the top terms within each topic
abstract_top_words <- tidy(abstracts_lda, matrix = 'beta')
abstract_top_words %>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# The 5 most probable terms in each topic
terms(abstracts_lda, 5)
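
# A common tidytext follow-up (sketch): plot each topic's top 10 stems.
# reorder_within() and scale_y_reordered() are tidytext helpers that order
# the bars within each facet.
abstract_top_words %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term)) +
  geom_col() +
  scale_y_reordered() +
  facet_wrap(~ topic, scales = "free_y")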

# Next, we will regress citation counts on topics. We would like to see if
# the topic of an abstract is a good predictor of how many citations a paper
# is likely to get

abstract_topics <- posterior(abstracts_lda, abstracts_dtm)
# Assign each paper its most probable topic: which.max() returns the topic
# index, whereas max() would return the probability itself
citation_data$topic <- as.factor(apply(abstract_topics$topics, 1, which.max))

# Topic is a categorical label, so it enters the model as a factor
regMod <- lm(citations ~ topic, data = citation_data)
summary(regMod)

# With a categorical topic on the x-axis, a boxplot is more informative than
# a scatter plot with a fitted line
ggplot(citation_data, aes(x = topic, y = citations)) +
  geom_boxplot(outlier.size = 0.7, outlier.shape = 23)

# One-way ANOVA: do mean citation counts differ across topics?
abs_anova <- aov(citations ~ topic, data = citation_data)
summary(abs_anova)
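
# Optional follow-up (sketch): if the ANOVA suggests differences, Tukey's HSD
# shows which topic pairs differ. Citation counts are skewed, overdispersed
# counts, so a count model such as MASS::glm.nb would also be worth trying
# instead of OLS/ANOVA.
TukeyHSD(abs_anova)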
