library(tm)
library(topicmodels)
library(slam)
library(ggplot2)
library(tidyverse)
library(tidytext)

citation_data <- read.csv("../AMiner.csv")
head(citation_data)
str(citation_data)
summary(citation_data)

# Remove duplicate rows: the original parsing of the data stored a new entry
# for every author of a paper
citation_data <- citation_data[!duplicated(citation_data$author), ]

# Now remove every entry that does not have an abstract
citation_data <- citation_data[!(is.na(citation_data$abstract) | citation_data$abstract == ""), ]

# Since we are only interested in citation counts and abstracts, we will just
# keep those columns and work with a random sample of 100,000 papers
citation_data <- citation_data[, c('citations', 'abstract')]
citation_data <- citation_data[sample(nrow(citation_data), 100000), ]
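# Note: the sample() call above is not seeded, so the 100,000-paper subset (and
# everything downstream) will differ between runs; calling set.seed() before it
# would make the analysis reproducible.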

# Isolate the abstract texts and prepare them for text processing
abstracts <- Corpus(VectorSource(citation_data$abstract))

# Clean all the abstracts and create a DTM
clean_corpus <- function(corpus){
  corpus <- tm_map(corpus, stripWhitespace)                    # collapse extra whitespace
  corpus <- tm_map(corpus, removePunctuation)                  # drop punctuation
  corpus <- tm_map(corpus, content_transformer(tolower))       # lowercase everything
  corpus <- tm_map(corpus, removeWords, stopwords('en'))       # remove English stop words
  corpus <- tm_map(corpus, stemDocument, language = "english") # stem words to their roots
  return(corpus)
}

clean_abstracts <- clean_corpus(abstracts)
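
# Spot-check the cleaning on a single document (assumes the corpus is
# non-empty; prints the processed text of the first abstract)
as.character(clean_abstracts[[1]])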

# Newer versions of tm control term length via `wordLengths` (the older
# `minWordLength` option is no longer recognized); keep all terms of length >= 1
abstracts_dtm <- DocumentTermMatrix(clean_abstracts, control = list(wordLengths = c(1, Inf)))
abstracts_dtm
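
# A quick sanity-check sketch of the vocabulary (the 1000 threshold is
# arbitrary): terms occurring at least 1000 times across all abstracts are
# mostly generic academic vocabulary
dim(abstracts_dtm)
findFreqTerms(abstracts_dtm, lowfreq = 1000)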

# Next, we will use the mean tf-idf of each term to drop terms that are
# unlikely to help distinguish between documents

term_tfidf <- tapply(abstracts_dtm$v / row_sums(abstracts_dtm)[abstracts_dtm$i],
                     abstracts_dtm$j, mean) *
  log2(nDocs(abstracts_dtm) / col_sums(abstracts_dtm > 0))
abstracts_dtm <- abstracts_dtm[, term_tfidf >= 0.1]
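
# A quick check of where the 0.1 cutoff falls in the distribution of mean
# tf-idf values, to see how aggressive the pruning above is
summary(term_tfidf)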

# We will next drop out any documents (i.e. rows) that no longer have any entries
ui <- unique(abstracts_dtm$i)
abstracts_dtm <- abstracts_dtm[ui, ]
citation_data <- citation_data[ui, ]

# Now fit an LDA model to the data. We will use k = 10 topics; this choice is
# largely arbitrary on my part
abstracts_lda <- LDA(abstracts_dtm, k = 10)
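
# If the number of topics matters downstream, one way to check the arbitrary
# k = 10 is to compare model perplexity (lower is better) across a few values
# of k. This is only a sketch and is left commented out, since refitting LDA
# several times on ~100k abstracts is slow.
# ks <- c(5, 10, 20, 30)
# perps <- sapply(ks, function(k) perplexity(LDA(abstracts_dtm, k = k)))
# plot(ks, perps, type = "b", xlab = "number of topics", ylab = "perplexity")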

# Per-topic word probabilities (beta) in tidy form
abstract_top_words <- tidy(abstracts_lda, matrix = 'beta')
top_n(abstract_top_words, 20, beta)

terms(abstracts_lda, 5)
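
# A per-topic view of the top terms (a sketch using tidytext's reorder_within()
# and scale_y_reordered() helpers; the "topic", "term" and "beta" columns come
# from tidy() above)
abstract_top_words %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  ggplot(aes(beta, reorder_within(term, beta, topic), fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  labs(y = NULL)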

# Next, we will fit a regression model of citation counts on topics. We would
# like to see whether the topic of an abstract is a good predictor of how many
# citations a paper is likely to receive

# Per-document topic distributions
abstract_topics <- posterior(abstracts_lda, abstracts_dtm)
# Assign each paper its single most likely topic (which.max returns the topic
# index; max would only return the probability itself)
citation_data$topic <- apply(abstract_topics$topics, 1, which.max)

regMod <- lm(citations ~ topic, data = citation_data)
summary(regMod)
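
# The model above treats the hard topic assignment as a single numeric
# predictor, which discards most of the posterior information and treats the
# arbitrary topic indices as ordered. An alternative sketch: regress citations
# on the full topic-proportion matrix, dropping one topic since the proportions
# sum to 1 (the T1..T10 column names are made up here for illustration).
topic_props <- as.data.frame(abstract_topics$topics)
names(topic_props) <- paste0("T", seq_len(ncol(topic_props)))
propMod <- lm(citation_data$citations ~ ., data = topic_props[, -1])
summary(propMod)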

ggplot(citation_data, aes(x = topic, y = citations)) +
  geom_point(size = 0.7, shape = 23) +
  geom_smooth(method = lm)

# An ANOVA treating the topic label as a categorical grouping variable
abs_anova <- aov(citations ~ factor(topic), data = citation_data)
summary(abs_anova)
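
# If the ANOVA indicates differences between topics, Tukey's HSD (a standard
# follow-up, shown here as a sketch) identifies which pairs of topics differ
# in mean citation count
TukeyHSD(abs_anova)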