Updated twitter collection and reddit text cleaning (#11)
* Removed SaveCredential and LoadCredential functions
* Reddit text encoding stopgap
* Added twitter collection summary to output max status_id
* Added timer and elapsed time output for the Collect function
* Fixed a bug in passing additional twitter api params
* Moved reddit HTML decoding to Collect function
bryn-g authored Mar 10, 2019
1 parent 1e2211c commit 14060c6
Showing 47 changed files with 543 additions and 261 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,13 +1,13 @@
Package: vosonSML
-Version: 0.26.3
+Version: 0.27.0
Title: Collecting Social Media Data and Generating Networks for Analysis
Description: A suite of tools for collecting and constructing networks from social media data.
Provides easy-to-use functions for collecting data across popular platforms (Twitter, YouTube
and Reddit) and generating different types of networks for analysis.
Type: Package
Imports: tm, stringr, RCurl, igraph (>= 1.2.2), Hmisc, data.table, httpuv, methods, httr,
magrittr, dplyr (>= 0.7.8), rlang (>= 0.3.0.1), RedditExtractoR (>= 2.1.2),
-rtweet (>= 0.6.8)
+rtweet (>= 0.6.8), textutils, tictoc
Depends: R (>= 3.2.0)
Encoding: UTF-8
Author: Timothy Graham, Robert Ackland, Chung-hong Chan, Bryan Gertzel
8 changes: 5 additions & 3 deletions NAMESPACE
@@ -20,14 +20,12 @@ S3method(Create,default)
S3method(Create,semantic)
S3method(Create,semantic.default)
S3method(Create,semantic.twitter)
-export(AddUserData.twitter)
+export(AddTwitterUserData)
export(Authenticate)
export(Collect)
export(Create)
export(GetYoutubeVideoIDs)
export(ImportData)
-export(LoadCredential)
-export(SaveCredential)
import(RCurl)
import(data.table)
import(httpuv)
@@ -38,6 +36,7 @@ importFrom(Hmisc,escapeRegex)
importFrom(RedditExtractoR,reddit_content)
importFrom(RedditExtractoR,user_network)
importFrom(dplyr,anti_join)
+importFrom(dplyr,arrange)
importFrom(dplyr,bind_rows)
importFrom(dplyr,coalesce)
importFrom(dplyr,distinct)
@@ -75,6 +74,9 @@ importFrom(rtweet,users_data)
importFrom(stringr,str_extract)
importFrom(stringr,str_match_all)
importFrom(stringr,str_replace_all)
+importFrom(textutils,HTMLdecode)
+importFrom(tictoc,tic)
+importFrom(tictoc,toc)
importFrom(utils,"capture.output")
importFrom(utils,"flush.console")
importFrom(utils,"install.packages")
24 changes: 24 additions & 0 deletions NEWS.md
@@ -1,3 +1,27 @@
+# vosonSML 0.27.0
+
+## Bug Fixes
+- Fixed a bug in `Collect.twitter` in which additional Twitter API parameters,
+  e.g. `lang` or `until`, were not being passed properly to `rtweet::search_tweets`.
+  This resulted in the additional parameters being ignored.
+
+## Major Changes
+- Removed the `SaveCredential` and `LoadCredential` functions, as well as the `useCachedToken`
+  parameter for `Authenticate.twitter`. These were simply calling the `saveRDS` and `readRDS`
+  functions and not performing any additional processing. Using `saveRDS` and `readRDS` directly
+  to save and load an `Authenticate` credential object to file is simpler.
+- Changed the way the `cleanText` parameter works in `Create.actor.reddit` so that it is
+  more permissive. It addresses encoding issues with apostrophes and pound symbols and removes
+  unicode characters not permitted by the XML 1.0 standard as used in `graphml` files. This is a
+  best-effort fix and does not resolve all `reddit` text encoding issues.
+
+## Minor Changes
+- Added `Collect.twitter` summary information that includes the earliest (min) and latest (max)
+  tweet `status_id` collected, with timestamps. The `status_id` values can be used to frame
+  subsequent collections as `since_id` or `max_id` parameter values. If the `until` date
+  parameter was used, the timestamp can also serve as a quick confirmation.
+- Added elapsed time output to the `Collect` method.
+
# vosonSML 0.26.3

## Bug Fixes
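
With `SaveCredential` and `LoadCredential` removed, a credential is persisted with base R directly. A minimal sketch, assuming `twitterAuth` is the credential object returned by `Authenticate` and the file name is arbitrary:

```r
# save the Authenticate credential object to file (replaces SaveCredential)
saveRDS(twitterAuth, file = "twitter_auth.rds")

# load it back in a later session (replaces LoadCredential)
twitterAuth <- readRDS("twitter_auth.rds")
```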
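
For the `cleanText` change, one way to express the XML 1.0 character restriction mentioned in the entry above is a regex over the permitted ranges. This is an illustrative sketch only, not necessarily the package's exact rule:

```r
# strip characters outside the XML 1.0 permitted set: tab, newline,
# carriage return and the non-surrogate BMP ranges (supplementary
# planes omitted in this sketch), so text survives graphml serialisation
cleanXml10 <- function(x) {
  gsub("[^\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD]", "", x, perl = TRUE)
}

cleanXml10("comment text\u0001 with a stray control character")
#> [1] "comment text with a stray control character"
```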
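
The new collection summary makes incremental collection straightforward. A sketch, where `#auspol` and `prevMaxId` are placeholders (`prevMaxId` taken from the "Max ID" row of an earlier `Collect.twitter` summary) and `since_id` is assumed to pass through to `rtweet::search_tweets` via `...`:

```r
# frame the next collection with the previous run's maximum status_id
# so that only newer tweets are returned
newData <- Collect(twitterAuth, searchTerm = "#auspol", since_id = prevMaxId)
```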
12 changes: 6 additions & 6 deletions R/AddUserData.twitter.R → R/AddTwitterUserData.R
@@ -26,17 +26,17 @@
#' # add additional twitter user profile information to actor network graph as node attributes
#' # requires twitterAuth from Authenticate, twitterData from Collect and actorNetwork from
#' # Create actor network
-#' actorNetWithUserAttr <- AddUserData.twitter(twitterData, actorNetwork,
-#'                                             lookupUsers = TRUE,
-#'                                             twitterAuth = twitterAuth, writeToFile = TRUE)
+#' actorNetWithUserAttr <- AddTwitterUserData(twitterData, actorNetwork,
+#'                                            lookupUsers = TRUE,
+#'                                            twitterAuth = twitterAuth, writeToFile = TRUE)
#' # igraph object
#' # actorNetWithUserAttr$graph
#' }
#'
-#' @aliases AddUserData.twitter
-#' @name vosonSML::AddUserData.twitter
+#' @aliases AddTwitterUserData
+#' @name vosonSML::AddTwitterUserData
#' @export
-AddUserData.twitter <- function(collectData, networkData, lookupUsers = TRUE, twitterAuth = NULL, writeToFile = FALSE) {
+AddTwitterUserData <- function(collectData, networkData, lookupUsers = TRUE, twitterAuth = NULL, writeToFile = FALSE) {

dfCollect <- collectData
dfRelations <- networkData$relations
24 changes: 2 additions & 22 deletions R/Authenticate.twitter.R
@@ -9,9 +9,6 @@
#' @param apiSecret Character string. API secret to authenticate.
#' @param accessToken Character string. API access token to authenticate.
#' @param accessTokenSecret Character string. API access token secret to authenticate.
-#' @param useCachedToken Logical. Use cached access token if found otherwise create one. The cached token file is
-#' named \code{.twitter-oauth} and saved to the working directory.To refresh the cached token this file has to be
-#' manually deleted. Default is \code{TRUE}.
#' @param ... Additional parameters passed to function. Not used in this method.
#'
#' @return A \code{credential} object containing an access token \code{$auth} and social media type descriptor
@@ -26,12 +23,11 @@
#'
#' twitterAuth <- Authenticate("twitter", appName = myKeys$appName,
#' apiKey = myKeys$apiKey, apiSecret = myKeys$apiSecret, accessToken = myKeys$accessToken,
-#' accessTokenSecret = myKeys$accessTokenSecret, useCachedToken = TRUE)
+#' accessTokenSecret = myKeys$accessTokenSecret)
#' }
#'
#' @export
-Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, accessToken, accessTokenSecret,
-                                 useCachedToken = TRUE, ...) {
+Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, accessToken, accessTokenSecret, ...) {

if (missing(apiKey) || missing(apiSecret) || missing(accessToken) || missing(accessTokenSecret)) {
stop("Missing one or more twitter API keys.", call. = FALSE)
@@ -47,18 +43,6 @@ Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, access
credential <- list(socialmedia = "twitter", auth = NULL)
class(credential) <- append(class(credential), c("credential", "twitter"))

-  if (useCachedToken) {
-    if (file.exists(token_file_name)) {
-      cat("Cached twitter token was found (using cached token).\n")
-      twitter_oauth <- LoadCredential(token_file_name)
-      # todo: check loaded token is valid before returning
-      credential$auth <- twitter_oauth
-      return(credential)
-    } else {
-      cat("OAuth token not found. A token will be created and saved to the working directory.\n")
-    }
-  }
-
twitter_oauth <- rtweet::create_token(
app = appName,
consumer_key = apiKey,
@@ -67,10 +51,6 @@ Authenticate.twitter <- function(socialmedia, appName, apiKey, apiSecret, access
access_secret = accessTokenSecret,
set_renv = FALSE)

-  if (useCachedToken) {
-    SaveCredential(twitter_oauth, file = token_file_name)
-  }
-
credential$auth <- twitter_oauth

return(credential)
11 changes: 10 additions & 1 deletion R/Collect.R
@@ -13,12 +13,21 @@
#'
#' @export
Collect <- function(credential, ...) {
+  # set the environment encoding to UTF-8 for data collection
+  save_enc <- getOption("encoding")
+  on.exit({
+    tictoc::toc(quiet = FALSE, func.toc = collectTocOutput)
+    options(encoding = save_enc)
+  }, add = TRUE)
+  options(encoding = "UTF-8")
+  tictoc::tic(msg = "Elapsed time")
+
  # searches the class list of credential for matching method
  UseMethod("Collect", credential)
}

# default function
#' @export
Collect.default <- function(credential, ...) {
stop("Unknown social media type passed to collect.", call. = FALSE)
stop("Unknown social media type passed to collect.", call. = FALSE)
}
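
The timer in this hunk wraps the whole dispatch: `tic` starts before `UseMethod` and the `on.exit` handler fires `toc` after whichever platform method returns, so every `Collect` variant is timed without per-method code. `collectTocOutput` is a package-internal formatter; a stand-alone sketch of the same pattern, with a hypothetical `elapsedMsg` formatter and assuming tictoc's `func.toc(tic, toc, msg, ...)` callback convention:

```r
library(tictoc)

# hypothetical stand-in for the internal collectTocOutput formatter;
# toc() calls it and prints the string it returns
elapsedMsg <- function(tic, toc, msg, ...) {
  paste0(msg, ": ", round(toc - tic, 2), " secs")
}

timedCollect <- function() {
  # registered first so it runs even if the work below errors out
  on.exit(tictoc::toc(quiet = FALSE, func.toc = elapsedMsg), add = TRUE)
  tictoc::tic(msg = "Elapsed time")
  Sys.sleep(1)  # stands in for the actual collection work
  invisible(NULL)
}

timedCollect()
#> Elapsed time: ~1 secs
```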
22 changes: 6 additions & 16 deletions R/Collect.reddit.R
@@ -31,36 +31,26 @@ Collect.reddit <- function(credential, threadUrls, waitTime = 5, writeToFile = F

cat("Collecting thread data for reddit urls...\n")

-  # reddit_content uses a progress bar that defaults to option width
-  # set to be much smaller than page
-  # save_width <- getOption("width")
-
-  # progress_width <- save_width - 40
-  # if (progress_width >= 20) {
-  #   options("width" = progress_width)
-  # }
-
-  # options("width" = 60)

threads_df <- NULL

# cat(paste0("encoding: ", getOption("encoding"), "\n"))

# make the get request for the reddit thread url
tryCatch({
capture.output(threads_df <- RedditExtractoR::reddit_content(threadUrls, waitTime), type = c("output"))
# RedditExtractoR::reddit_content(threadUrls, waitTime)
}, error = function(e) {
stop(gsub("^Error:\\s", "", paste0(e)), call. = FALSE)
-  }, finally = {
-    # reset width
-    # options("width" = save_width)
-  })
+  }, finally = { })

if (!is.null(threads_df)) {
if (nrow(threads_df) > 0) {
# add thread id to df, extracted from url
threads_df$thread_id <- gsub("^(.*)?/comments/([0-9A-Za-z]{6})?/.*?(/)?$", "\\2",
threads_df$URL, ignore.case = TRUE, perl = TRUE)

cat("HTML decoding comments.\n")
threads_df$comment <- textutils::HTMLdecode(threads_df$comment)

# summary
results_df <- threads_df %>%
dplyr::group_by(.data$thread_id) %>%
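
Reddit comment text arrives with HTML entities intact; `textutils::HTMLdecode`, now applied in `Collect.reddit` above, converts them back to plain characters. A small illustration (the example string is illustrative):

```r
library(textutils)

# named and numeric HTML entities are converted back to characters,
# covering the apostrophe and pound-symbol cases noted in the NEWS entry
HTMLdecode("I can&#39;t believe it cost &pound;5 &amp; change")
#> [1] "I can't believe it cost £5 & change"
```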
15 changes: 14 additions & 1 deletion R/Collect.twitter.R
@@ -81,10 +81,23 @@ Collect.twitter <- function(credential, searchTerm = "", searchType = "recent",

# additional twitter api params
dots <- substitute(...())
-  search_params[['...']] <- dots
+  # search_params[['...']] <- dots
+  search_params <- append(search_params, dots)

tweets_df <- do.call(rtweet::search_tweets, search_params)

+  # summary
+  if (nrow(tweets_df) > 0) {
+    results_df <- tweets_df %>% dplyr::filter(.data$status_id %in% c(min(.data$status_id), max(.data$status_id))) %>%
+      dplyr::mutate(tweet = ifelse(.data$status_id == min(.data$status_id), "Min ID", "Max ID"),
+                    created = as.character(.data$created_at)) %>%
+      dplyr::select(.data$tweet, .data$status_id, .data$created, .data$screen_name) %>%
+      dplyr::arrange(.data$status_id)
+
+    results_df$screen_name <- paste0("@", results_df$screen_name)
+
+    printResultTable(results_df)
+  }
cat(paste0("Collected ", nrow(tweets_df), " tweets.\n"))

# rds chosen over csv to avoid flattening lists in the data
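
The one-line fix in this hunk matters because assigning the captured dots to a literal `'...'` element creates a single list entry named `"..."` that `do.call` never expands into real arguments; appending merges them as individually named parameters. A minimal illustration of the difference, with a hypothetical function `f` and hypothetical argument names:

```r
f <- function(...) {
  search_params <- list(q = "term")
  dots <- substitute(...())  # named pairlist of the additional arguments

  # buggy: one element literally named "..." that do.call cannot expand
  broken <- search_params
  broken[["..."]] <- dots

  # fixed: lang and until become top-level named parameters
  fixed <- append(search_params, dots)

  list(broken = names(broken), fixed = names(fixed))
}

f(lang = "en", until = "2019-03-10")
#> $broken
#> [1] "q"   "..."
#>
#> $fixed
#> [1] "q"     "lang"  "until"
```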
2 changes: 1 addition & 1 deletion R/Collect.youtube.R
@@ -246,7 +246,7 @@ Collect.youtube <- function(credential, videoIDs, verbose = FALSE, writeToFile =
}

  if (writeToFile) { writeOutputFile(dataCombined, "csv", "YoutubeData") }
-
+  cat("Done.\n")
flush.console()

Expand Down