Updated twitter collection and reddit text cleaning (#11)
* Removed SaveCredential and LoadCredential functions
* Reddit text encoding stopgap
* Added twitter collection summary to output max status_id
* Added timer and elapsed time output for the Collect function
* Fixed a bug in passing additional twitter api params
* Moved reddit HTML decoding to Collect function
bryn-g authored Mar 10, 2019
1 parent 1e2211c commit 14060c6
Showing 47 changed files with 543 additions and 261 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,13 +1,13 @@
Package: vosonSML
-Version: 0.26.3
+Version: 0.27.0
Title: Collecting Social Media Data and Generating Networks for Analysis
Description: A suite of tools for collecting and constructing networks from social media data.
Provides easy-to-use functions for collecting data across popular platforms (Twitter, YouTube
and Reddit) and generating different types of networks for analysis.
Type: Package
Imports: tm, stringr, RCurl, igraph (>= 1.2.2), Hmisc, data.table, httpuv, methods, httr,
magrittr, dplyr (>= 0.7.8), rlang (>= 0.3.0.1), RedditExtractoR (>= 2.1.2),
-rtweet (>= 0.6.8)
+rtweet (>= 0.6.8), textutils, tictoc
Depends: R (>= 3.2.0)
Encoding: UTF-8
Author: Timothy Graham, Robert Ackland, Chung-hong Chan, Bryan Gertzel
8 changes: 5 additions & 3 deletions NAMESPACE
@@ -20,14 +20,12 @@ S3method(Create,default)
S3method(Create,semantic)
S3method(Create,semantic.default)
S3method(Create,semantic.twitter)
-export(AddUserData.twitter)
+export(AddTwitterUserData)
export(Authenticate)
export(Collect)
export(Create)
export(GetYoutubeVideoIDs)
export(ImportData)
-export(LoadCredential)
-export(SaveCredential)
import(RCurl)
import(data.table)
import(httpuv)
@@ -38,6 +36,7 @@ importFrom(Hmisc,escapeRegex)
importFrom(RedditExtractoR,reddit_content)
importFrom(RedditExtractoR,user_network)
importFrom(dplyr,anti_join)
+importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
importFrom(dplyr,coalesce)
importFrom(dplyr,distinct)
@@ -75,6 +74,9 @@ importFrom(rtweet,users_data)
importFrom(stringr,str_extract)
importFrom(stringr,str_match_all)
importFrom(stringr,str_replace_all)
+importFrom(textutils,HTMLdecode)
+importFrom(tictoc,tic)
+importFrom(tictoc,toc)
importFrom(utils,"capture.output")
importFrom(utils,"flush.console")
importFrom(utils,"install.packages")
24 changes: 24 additions & 0 deletions NEWS.md
@@ -1,3 +1,27 @@
+# vosonSML 0.27.0
+
+## Bug Fixes
+- Fixed a bug in `Collect.twitter` in which additional Twitter API parameters,
+  e.g. `lang` or `until`, were not being passed properly to `rtweet::search_tweets`.
+  This resulted in the additional parameters being ignored.
+
+## Major Changes
+- Removed the `SaveCredential` and `LoadCredential` functions, as well as the `useCachedToken`
+  parameter for `Authenticate.twitter`. These were simply calling the `saveRDS` and `readRDS`
+  functions and not performing any additional processing. Using `saveRDS` and `readRDS` directly
+  to save and load an `Authenticate` credential object to file is simpler.
+- Changed the way the `cleanText` parameter works in `Create.actor.reddit` so that it is
+  more permissive. It addresses encoding issues with apostrophes and pound symbols and removes
+  unicode characters not permitted by the XML 1.0 standard as used in `graphml` files. This is a
+  best-effort fix and does not resolve all `reddit` text encoding issues.
+
+## Minor Changes
+- Added `Collect.twitter` summary information that includes the earliest (min) and latest (max)
+  tweet `status_id` collected, with timestamps. The `status_id` values can be used to frame
+  subsequent collections as `since_id` or `max_id` parameter values. If the `until` date
+  parameter was used, the timestamp can also serve as a quick confirmation.
+- Added elapsed time output to the `Collect` method.
+
# vosonSML 0.26.3

## Bug Fixes
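
With `SaveCredential` and `LoadCredential` removed, a credential is persisted with base R directly. A minimal sketch, assuming `twitterAuth` is the credential object returned by `Authenticate` and the file name is arbitrary:

```r
# save the Authenticate credential object to file (replaces SaveCredential)
saveRDS(twitterAuth, file = "twitter_auth.rds")

# load it back in a later session (replaces LoadCredential)
twitterAuth <- readRDS("twitter_auth.rds")
```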
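
For the `cleanText` change, one way to express the XML 1.0 character restriction mentioned in the entry above is a regex over the permitted ranges. This is an illustrative sketch only, not necessarily the package's exact rule:

```r
# strip characters outside the XML 1.0 permitted set: tab, newline,
# carriage return and the non-surrogate BMP ranges (supplementary
# planes omitted in this sketch), so text survives graphml serialisation
cleanXml10 <- function(x) {
  gsub("[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD]", "", x, perl = TRUE)
}

cleanXml10("comment text\u0001 with a stray control character")
#> [1] "comment text with a stray control character"
```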
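
The new collection summary makes incremental collection straightforward. A sketch, where `#auspol` and `prevMaxId` are placeholders (`prevMaxId` taken from the "Max ID" row of an earlier `Collect.twitter` summary) and `since_id` is assumed to pass through to `rtweet::search_tweets` via `...`:

```r
# frame the next collection with the previous run's maximum status_id
# so that only newer tweets are returned
newData <- Collect(twitterAuth, searchTerm = "#auspol", since_id = prevMaxId)
```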
12 changes: 6 additions & 6 deletions R/AddUserData.twitter.R → R/AddTwitterUserData.R
@@ -26,17 +26,17 @@
#' # add additional twitter user profile information to actor network graph as node attributes
#' # requires twitterAuth from Authenticate, twitterData from Collect and actorNetwork from
#' # Create actor network
-#' actorNetWithUserAttr <- AddUserData.twitter(twitterData, actorNetwork,
-#'                                             lookupUsers = TRUE,
-#'                                             twitterAuth = twitterAuth, writeToFile = TRUE)
+#' actorNetWithUserAttr <- AddTwitterUserData(twitterData, actorNetwork,
+#'                                            lookupUsers = TRUE,
+#'                                            twitterAuth = twitterAuth, writeToFile = TRUE)
#' # igraph object
#' # actorNetWithUserAttr$graph
#' }
#'
-#' @aliases AddUserData.twitter
-#' @name vosonSML::AddUserData.twitter
+#' @aliases AddTwitterUserData
+#' @name vosonSML::AddTwitterUserData
#' @export
-AddUserData.twitter <- function(collectData, networkData, lookupUsers = TRUE, twitterAuth = NULL, writeToFile = FALSE) {
+AddTwitterUserData <- function(collectData, networkData, lookupUsers = TRUE, twitterAuth = NULL, writeToFile = FALSE) {

dfCollect <- collectData
dfRelations <- networkData$relations
24 changes: 2 additions & 22 deletions R/Authenticate.twitter.R
@@ -9,9 +9,6 @@
#' @param apiSecret Character string. API secret to authenticate.
#' @param accessToken Character string. API access token to authenticate.
#' @param accessTokenSecret Character string. API access token secret to authenticate.
-#' @param useCachedToken Logical. Use cached access token if found otherwise create one. The cached token file is
-#' named \code{.twitter-oauth} and saved to the working directory.To refresh the cached token this file has to be
-#' manually deleted. Default is \code{TRUE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return A \code{credential} object containing an access token \code{$auth} and social media type descriptor
@@ -26,12 +23,11 @@
#'
#' twitterAuth <- Authenticate("twitter", appName = myKeys$appName,
#' apiKey = myKeys$apiKey, apiSecret = myKeys$apiSecret, accessToken = myKeys$accessToken,
-#' accessTokenSecret = myKeys$accessTokenSecret, useCachedToken = TRUE)
+#' accessTokenSecret = myKeys$accessTokenSecret)
#' }
#'
#' @export
-Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, accessToken, accessTokenSecret,
-                                 useCachedToken = TRUE, ...) {
+Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, accessToken, accessTokenSecret, ...) {

if (missing(apiKey) || missing(apiSecret) || missing(accessToken) || missing(accessTokenSecret)) {
stop("Missing one or more twitter API keys.", call. = FALSE)
@@ -47,18 +43,6 @@ Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, access
credential <- list(socialmedia = "twitter", auth = NULL)
class(credential) <- append(class(credential), c("credential", "twitter"))

-  if (useCachedToken) {
-    if (file.exists(token_file_name)) {
-      cat("Cached twitter token was found (using cached token).\n")
-      twitter_oauth <- LoadCredential(token_file_name)
-      # todo: check loaded token is valid before returning
-      credential$auth <- twitter_oauth
-      return(credential)
-    } else {
-      cat("OAuth token not found. A token will be created and saved to the working directory.\n")
-    }
-  }
-
twitter_oauth <- rtweet::create_token(
app = appName,
consumer_key = apiKey,
@@ -67,10 +51,6 @@ Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, access
access_secret = accessTokenSecret,
set_renv = FALSE)

-  if (useCachedToken) {
-    SaveCredential(twitter_oauth, file = token_file_name)
-  }
-
credential$auth <- twitter_oauth

return(credential)
11 changes: 10 additions & 1 deletion R/Collect.R
@@ -13,12 +13,21 @@
#'
#' @export
Collect <- function(credential, ...) {
+  # set the environment encoding to UTF-8 for data collection
+  save_enc <- getOption("encoding")
+  on.exit({
+    tictoc::toc(quiet = FALSE, func.toc = collectTocOutput)
+    options(encoding = save_enc)
+  }, add = TRUE)
+  options(encoding = "UTF-8")
+  tictoc::tic(msg = "Elapsed time")
+
  # searches the class list of credential for matching method
  UseMethod("Collect", credential)
}

# default function
#' @export
Collect.default <- function(credential, ...) {
stop("Unknown social media type passed to collect.", call. = FALSE)
stop("Unknown social media type passed to collect.", call. = FALSE)
}
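
The timer in this hunk wraps the whole dispatch: `tic` starts before `UseMethod` and the `on.exit` handler fires `toc` after whichever platform method returns, so every `Collect` variant is timed without per-method code. `collectTocOutput` is a package-internal formatter; a stand-alone sketch of the same pattern, with a hypothetical `elapsedMsg` formatter and assuming tictoc's `func.toc(tic, toc, msg, ...)` callback convention:

```r
library(tictoc)

# hypothetical stand-in for the internal collectTocOutput formatter;
# toc() calls it and prints the string it returns
elapsedMsg <- function(tic, toc, msg, ...) {
  paste0(msg, ": ", round(toc - tic, 2), " secs")
}

timedCollect <- function() {
  # registered first so it runs even if the work below errors out
  on.exit(tictoc::toc(quiet = FALSE, func.toc = elapsedMsg), add = TRUE)
  tictoc::tic(msg = "Elapsed time")
  Sys.sleep(1)  # stands in for the actual collection work
  invisible(NULL)
}

timedCollect()
#> Elapsed time: ~1 secs
```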
22 changes: 6 additions & 16 deletions R/Collect.reddit.R
@@ -31,36 +31,26 @@ Collect.reddit <- function(credential, threadUrls, waitTime = 5, writeToFile = F

cat("Collecting thread data for reddit urls...\n")

-  # reddit_content uses a progress bar that defaults to option width
-  # set to be much smaller than page
-  # save_width <- getOption("width")
-
-  # progress_width <- save_width - 40
-  # if (progress_width >= 20) {
-  #   options("width" = progress_width)
-  # }
-
-  # options("width" = 60)

threads_df <- NULL

# cat(paste0("encoding: ", getOption("encoding"), "\n"))

# make the get request for the reddit thread url
tryCatch({
capture.output(threads_df <- RedditExtractoR::reddit_content(threadUrls, waitTime), type = c("output"))
# RedditExtractoR::reddit_content(threadUrls, waitTime)
}, error = function(e) {
stop(gsub("^Error:\\s", "", paste0(e)), call. = FALSE)
-  }, finally = {
-    # reset width
-    # options("width" = save_width)
-  })
+  }, finally = { })

if (!is.null(threads_df)) {
if (nrow(threads_df) > 0) {
# add thread id to df, extracted from url
threads_df$thread_id <- gsub("^(.*)?/comments/([0-9A-Za-z]{6})?/.*?(/)?$", "\\2",
threads_df$URL, ignore.case = TRUE, perl = TRUE)

cat("HTML decoding comments.\n")
threads_df$comment <- textutils::HTMLdecode(threads_df$comment)

# summary
results_df <- threads_df %>%
dplyr::group_by(.data$thread_id) %>%
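
Reddit comment text arrives with HTML entities intact; `textutils::HTMLdecode`, now applied in `Collect.reddit` above, converts them back to plain characters. A small illustration (the example string is illustrative):

```r
library(textutils)

# named and numeric HTML entities are converted back to characters,
# covering the apostrophe and pound-symbol cases noted in the NEWS entry
HTMLdecode("I can&#39;t believe it cost &pound;5 &amp; change")
#> [1] "I can't believe it cost £5 & change"
```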
15 changes: 14 additions & 1 deletion R/Collect.twitter.R
@@ -81,10 +81,23 @@ Collect.twitter <- function(credential, searchTerm = "", searchType = "recent",

# additional twitter api params
dots <- substitute(...())
-  search_params[['...']] <- dots
+  # search_params[['...']] <- dots
+  search_params <- append(search_params, dots)

tweets_df <- do.call(rtweet::search_tweets, search_params)

+  # summary
+  if (nrow(tweets_df) > 0) {
+    results_df <- tweets_df %>% dplyr::filter(.data$status_id %in% c(min(.data$status_id), max(.data$status_id))) %>%
+      dplyr::mutate(tweet = ifelse(.data$status_id == min(.data$status_id), "Min ID", "Max ID"),
+                    created = as.character(.data$created_at)) %>%
+      dplyr::select(.data$tweet, .data$status_id, .data$created, .data$screen_name) %>%
+      dplyr::arrange(.data$status_id)
+
+    results_df$screen_name <- paste0("@", results_df$screen_name)
+
+    printResultTable(results_df)
+  }
cat(paste0("Collected ", nrow(tweets_df), " tweets.\n"))

# rds chosen over csv to avoid flattening lists in the data
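
The one-line fix in this hunk matters because assigning the captured dots to a literal `'...'` element creates a single list entry named `"..."` that `do.call` never expands into real arguments; appending merges them as individually named parameters. A minimal illustration of the difference, with a hypothetical function `f` and hypothetical argument names:

```r
f <- function(...) {
  search_params <- list(q = "term")
  dots <- substitute(...())  # named pairlist of the additional arguments

  # buggy: one element literally named "..." that do.call cannot expand
  broken <- search_params
  broken[["..."]] <- dots

  # fixed: lang and until become top-level named parameters
  fixed <- append(search_params, dots)

  list(broken = names(broken), fixed = names(fixed))
}

f(lang = "en", until = "2019-03-10")
#> $broken
#> [1] "q"   "..."
#>
#> $fixed
#> [1] "q"     "lang"  "until"
```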
2 changes: 1 addition & 1 deletion R/Collect.youtube.R
@@ -246,7 +246,7 @@ Collect.youtube <- function(credential, videoIDs, verbose = FALSE, writeToFile =
}

  if (writeToFile) { writeOutputFile(dataCombined, "csv", "YoutubeData") }
-
+  cat("Done.\n")
flush.console()

Expand Down