diff --git a/.gitignore b/.gitignore index 1ce9df9..3dec67a 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ vignettes/*.pdf # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 .httr-oauth +.twitter_oauth_token # knitr and R markdown default cache directories /*_cache/ diff --git a/vosonSML/DESCRIPTION b/vosonSML/DESCRIPTION index 62acbc5..8a52c8f 100644 --- a/vosonSML/DESCRIPTION +++ b/vosonSML/DESCRIPTION @@ -1,15 +1,14 @@ Package: vosonSML -Version: 0.24.0 +Version: 0.25.0 Title: Tools for Collecting Social Media Data and Generating Networks for Analysis Description: A suite of tools for collecting and constructing networks from social media data. Provides easy-to-use functions for collecting data across popular platforms (Instagram, Facebook, Twitter, YouTube and Reddit) and generating different types of networks for analysis. Type: Package -Imports: tm, stringr, twitteR, RCurl, bitops, rjson, plyr, igraph (>= 1.2.2), Rfacebook (>= 0.6.15), +Imports: tm, stringr, RCurl, bitops, rjson, plyr, igraph (>= 1.2.2), Rfacebook (>= 0.6.15), Hmisc, data.table, httpuv, instaR, methods, httr, RedditExtractoR (>= 2.1.2), magrittr, - dplyr (>= 0.7.8), rlang (>= 0.3.0.1) + dplyr (>= 0.7.8), rlang (>= 0.3.0.1), rtweet (>= 0.6.8) Depends: R (>= 3.2.0) -Suggests: testthat Encoding: UTF-8 Author: Timothy Graham, Robert Ackland, Chung-hong Chan, Bryan Gertzel Maintainer: Bryan Gertzel diff --git a/vosonSML/NAMESPACE b/vosonSML/NAMESPACE index 831488c..e17059c 100644 --- a/vosonSML/NAMESPACE +++ b/vosonSML/NAMESPACE @@ -1,28 +1,28 @@ # Generated by roxygen2: do not edit by hand +S3method(CreateActorNetwork,default) +S3method(CreateActorNetwork,reddit) S3method(CreateActorNetwork,twitter) +S3method(CreateActorNetwork,youtube) S3method(CreateBimodalNetwork,facebook) S3method(CreateBimodalNetwork,instagram) -S3method(CreateBimodalNetwork,twitter) S3method(CreateDynamicNetwork,facebook) S3method(CreateEgoNetworkFromData,instagram) -S3method(CreateSemanticNetwork,twitter) export(Authenticate) export(AuthenticateWithFacebookAPI) export(AuthenticateWithInstagramAPI) -export(AuthenticateWithTwitterAPI) export(Collect) export(CollectDataFacebook) export(CollectDataInstagram) -export(CollectDataTwitter) export(CollectEgoInstagram) export(Create) +export(CreateActorNetwork) export(CreateEgoNetwork) export(GetYoutubeVideoIDs) +export(GraphUserInfoTwitter) +export(ImportData) export(LoadCredential) -export(PopulateUserInfo) export(SaveCredential) -export(importData) import(RCurl) import(bitops) import(data.table) @@ -38,16 +38,23 @@ importFrom(Rfacebook,fbOAuth) importFrom(Rfacebook,getPage) importFrom(Rfacebook,getPost) importFrom(Rfacebook,getUsers) +importFrom(dplyr,anti_join) importFrom(dplyr,coalesce) +importFrom(dplyr,distinct) +importFrom(dplyr,ends_with) importFrom(dplyr,filter) +importFrom(dplyr,funs) importFrom(dplyr,group_by) importFrom(dplyr,left_join) importFrom(dplyr,mutate) +importFrom(dplyr,mutate_all) +importFrom(dplyr,mutate_at) importFrom(dplyr,rename) importFrom(dplyr,row_number) importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,ungroup) +importFrom(dplyr,vars) importFrom(igraph,'V<-') importFrom(igraph,V) importFrom(igraph,delete.vertices) @@ -57,6 +64,7 @@ importFrom(igraph,graph_from_data_frame) importFrom(igraph,set.graph.attribute) importFrom(igraph,set_graph_attr) importFrom(igraph,simplify) +importFrom(igraph,vcount) importFrom(igraph,write.graph) importFrom(instaR,getComments) importFrom(instaR,getFollowers) @@ -65,17 +73,19 @@ 
importFrom(instaR,getLikes) importFrom(instaR,getUser) importFrom(instaR,instaOAuth) importFrom(instaR,searchInstagram) +importFrom(magrittr,'%<>%') importFrom(magrittr,'%>%') importFrom(plyr,ldply) importFrom(rlang,'.data') +importFrom(rtweet,create_token) +importFrom(rtweet,lookup_users) +importFrom(rtweet,rate_limit) +importFrom(rtweet,search_tweets) +importFrom(rtweet,users_data) importFrom(stats,'na.omit') importFrom(stringr,str_extract) importFrom(stringr,str_match_all) importFrom(stringr,str_replace_all) -importFrom(twitteR,lookupUsers) -importFrom(twitteR,searchTwitter) -importFrom(twitteR,setup_twitter_oauth) -importFrom(twitteR,twListToDF) importFrom(utils,"flush.console") importFrom(utils,"install.packages") importFrom(utils,"read.csv") diff --git a/vosonSML/R/Authenticate.R b/vosonSML/R/Authenticate.R index 6d48a35..9a06267 100644 --- a/vosonSML/R/Authenticate.R +++ b/vosonSML/R/Authenticate.R @@ -1,162 +1,91 @@ -## The AuthenticateWithTwitterAPI is not functional because it relies on a "side effect". It is a twitteR design problem. -## AuthenticateWithFacebookAPI can be fixed to make it functional. - -## TODO: Maybe need to unify the variable names, currently there are: -### facebook: appID, appSecret, extended_permissions, useCachedToken -### twitter: api_key, api_secret, access_token, access_token_secret, createToken <- inconsistent? -### youtube: apiKeyYoutube <- inconsistent? -### instagram: appID, appSecret, useCachedToken - -## Maybe make it consistent with only camel, as the rest of the package uses camel, not underscore. But hadleyverse packages usually use underscores: -## Therefore, unified variable names: -## appID, appSecret, apiKey, apiSecret, accessToken, accessTokenSecret, useCachedToken, extendedPermissions, createToken - -#' Create credential to access social media APIs +#' Create a credential to access social media APIs #' -#' \code{Authenticate} creates a \code{credential} object that enables R to -#' make authenticated calls to social media APIs. A \code{credential} object -#' is a S3 object with the authentication-related information such as access -#' tokens and the information on the social media that grant authentication. -#' \code{Authenticate} is the first step of the \code{Authenticate}, -#' \code{Collect}, \code{Create} workflow. +#' \code{Authenticate} creates a \code{credential} object that enables R to make authenticated calls to social media +#' APIs. A \code{credential} object is a S3 object with the authentication-related information such as access tokens +#' and the information on the social media that grant authentication. \code{Authenticate} is the first step of the +#' \code{Authenticate}, \code{\link{Collect}} and \code{\link{Create}} workflow. #' -#' @param socialmedia character string, social media API to authenticate, -#' currently supports "facebook", "youtube", "twitter", "instagram" and "reddit" -#' @param ... additional parameters for authentication -#' \code{facebook}: appID, appSecret -#' \code{youtube}: apiKey -#' \code{twitter}: apiKey, apiSecret, accessToken, accessTokenSecret -#' \code{instagram}: appID, appSecret -#' \code{reddit}: appName, appKey, appSecret, useTokenCache +#' @param socialmedia Character string. Identifier for social media API to authenticate.\cr +#' Supports: \code{"twitter"}, \code{"youtube"}, \code{"reddit"}, \code{"instagram"} and \code{"facebook"}. +#' @param ... Additional parameters for authentication appropriate to \code{socialmedia} identifier. 
+#' \describe{ +#' \item{twitter:}{\code{[appName], apiKey, apiSecret, accessToken, +#' accessTokenSecret, [useCachedToken]}} +#' \item{youtube:}{\code{apiKey}} +#' \item{reddit:}{\code{[appName], appKey, appSecret, [useCachedToken]}} +#' \item{instagram:}{\code{appID, appSecret, [useCachedToken]}} +#' \item{facebook:}{\code{appID, appSecret, [extendedPermissions, useCachedToken]}} +#' } #' -#' @return credential object with authentication information +#' @return A \code{credential} object with authentication information. #' -#' @note Currently, \code{Authenticate} with socialmedia = "twitter" generates -#' oauth information to be used in the current active session only (i.e. -#' "side-effect") and no authentication-related information will be stored in -#' the returned \code{credential} object. +#' @note Currently, \code{Authenticate} with \code{socialmedia = "twitter"} generates OAuth information to be used in +#' the current active session only (i.e. "side-effect") and no authentication-related information will be stored in the +#' returned \code{credential} object. #' -#' @author Chung-hong Chan -#' @seealso \code{\link{AuthenticateWithFacebookAPI}}, -#' \code{\link{AuthenticateWithInstagramAPI}}, -#' \code{\link{AuthenticateWithTwitterAPI}}, -#' \code{\link{SaveCredential}}, -#' \code{\link{LoadCredential}} -#' @examples +#' For other social network API's it's useful to cache the credential to a file and then re-use it in future sessions. +#' Refer to \code{\link{SaveCredential}} and \code{\link{LoadCredential}} to do this. +#' +#' @seealso \code{\link{SaveCredential}}, \code{\link{Collect}}, \code{\link{Create}} +#' @keywords authenticate credential twitter youtube reddit instagram facebook #' +#' @examples #' \dontrun{ #' require(magrittr) -#' ## Instagram ego network example -#' myAppID <- "123456789098765" -#' myAppSecret <- "abc123abc123abc123abc123abc123ab" -#' myUsernames <- c("senjohnmccain","obama") #' -#' Authenticate("instagram", -#' appID = myAappId, -#' appSecret = myAppSecret) %>% Collect(ego = TRUE, -#' username = myUsernames) %>% Create +#' ## youtube actor network example +#' +#' myYoutubeAPIKey <- "xxxxxxxxxxxxxxxxxxxxxx" +#' listYoutubeVideoIDs <- c("W2GZFeYGU3s", "mL27TAJGlWc") +#' +#' myActorNetwork <- Authenticate("youtube", apiKey = myYoutubeAPIKey) %>% +#' Collect(videoIDs = listYoutubeVideoIDs) %>% Create("actor") #' -#' ## YouTube actor network example -#' my_apiKeyYoutube <- "314159265358979qwerty" -#' videoIDs <- c("W2GZFeYGU3s","mL27TAJGlWc") +#' ## instagram ego network example #' -#' Authenticate("youtube", -#' apiKey = my_apiKeyYoutube) %>% Collect(videoIDs = videoIDs) %>% Create('actor') +#' myInstaAppID <- "xxxxxxxxxxx" +#' myInstaAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +#' listInstaUsernames <- c("senjohnmccain", "obama") +#' +#' myEgoNetwork <- Authenticate("instagram", appID = myInstaAppID, appSecret = myInstaAppSecret) %>% +#' Collect(ego = TRUE, username = listInstaUsernames) %>% Create("ego") #' } +#' #' @export Authenticate <- function(socialmedia, ...) { authenticator <- switch(tolower(socialmedia), - facebook = facebookAuthenticator, - youtube = youtubeAuthenticator, twitter = twitterAuthenticator, - instagram = instagramAuthenticator, + youtube = youtubeAuthenticator, reddit = redditAuthenticator, - stop("Unknown socialmedia") - ) + instagram = instagramAuthenticator, + facebook = facebookAuthenticator, + stop("Unknown socialmedia")) + auth <- authenticator(...) 
+ credential <- list(socialmedia = tolower(socialmedia), auth = auth) class(credential) <- append(class(credential), "credential") + return(credential) } -### For the side effect of saving the credential into a file. -### Useful to cache the Credential to a file and then re-use it in the future session. -### i.e. Authenticate %>% SaveCredential %>% Collect -### and then, LoadCredential %>% Collect - -#' Save and load credential information -#' -#' Functions to save and load credential information. Currently, credential -#' information will be stored as a RDS file. \code{SaveCredential} will return -#' the input \code{credential}, useful for working as a filter between the -#' \code{Authenticate} and \code{Collect}. -#' -#' @aliases LoadCredential SaveCredential -#' @param credential \code{credential} object -#' @param filename character, filename to be saved to or restored from -#' @return \code{credential} object -#' @note \code{credential} created from \code{Authenticate} with socialmedia = -#' 'twitter' will not be saved by SaveCredential -#' @examples -#' -#' \dontrun{ -#' require(magrittr) -#' myAppID <- "123456789098765" -#' myAppSecret <- "abc123abc123abc123abc123abc123ab" -#' myUsernames <- c("senjohnmccain","obama") -#' -#' Authenticate("instagram", -#' appID = myAppId, -#' appSecret = myAppSecret) %>% SaveCredential("instagramCred.RDS") %>% Collect(ego = TRUE, -#' username = myUsernames) %>% Create -#' -#' ## Load the previously saved credential information -#' LoadCredential("instagramCred.RDS") %>% Collect(tag="obama", -#' distance=5000, n=100) %>% Create("bimodal") -#' } -#' @export -SaveCredential <- function(credential, filename = "credential.RDS") { - if (credential$socialmedia == "twitter") { - warning("Credential created for Twitter will not be saved.") - } else { - saveRDS(credential, filename) - } - return(credential) -} - -#' @rdname SaveCredential -#' @export -LoadCredential <- function(filename = "credential.RDS") { - credential <- readRDS(filename) - return(credential) +twitterAuthenticator <- function(appName, apiKey, apiSecret, accessToken, accessTokenSecret, useCachedToken) { + return(AuthenticateWithTwitterAPI(appName, apiKey, apiSecret, accessToken, accessTokenSecret, useCachedToken)) } -### *Authenticator functions should not be exported. It is just a bunch of helper functions to bridge the AuthenticateWith* functions with Authenticate(), but with datasource as the first argument and always return an auth object - -### As a convention, function starts with lower case shouldn't be exported. - youtubeAuthenticator <- function(apiKey) { - return(authenticateWithYoutubeAPI(apiKey)) + return(AuthenticateWithYoutubeAPI(apiKey)) } -### Currently, this Authenticator will return nothing, only for its side effect -### SAD!!!!!!!!!!!!!!!!!! -### i.e. cannot use SaveCredential and LoadCredential! - -twitterAuthenticator <- function(apiKey, apiSecret, accessToken, accessTokenSecret, createToken) { - AuthenticateWithTwitterAPI(api_key = apiKey, api_secret = apiSecret, access_token = accessToken, access_token_secret = accessTokenSecret, createToken = createToken) # ah, only for its side effect, really bad design decision, twitteR! 
+redditAuthenticator <- function(appName, appKey, appSecret, useCachedToken) { + # return(AuthenticateWithRedditAPI(appName, appKey, appSecret, useCachedToken)) return(NULL) } -facebookAuthenticator <- function(appID, appSecret, extendedPermissions = FALSE) { - return(AuthenticateWithFacebookAPI(appID, appSecret, extended_permissions = extendedPermissions, useCachedToken = FALSE)) -} - instagramAuthenticator <- function(appID, appSecret) { return(AuthenticateWithInstagramAPI(appID, appSecret)) } -redditAuthenticator <- function(appName, appKey, appSecret, useTokenCache) { - # return(AuthenticateWithRedditAPI(appName, appKey, appSecret, useTokenCache)) - return(NULL) +facebookAuthenticator <- function(appID, appSecret, extendedPermissions = FALSE) { + return(AuthenticateWithFacebookAPI(appID, appSecret, extendedPermissions, useCachedToken = FALSE)) } diff --git a/vosonSML/R/AuthenticateWithRedditAPI.R b/vosonSML/R/AuthenticateWithRedditAPI.R index cbdfe71..c935d83 100644 --- a/vosonSML/R/AuthenticateWithRedditAPI.R +++ b/vosonSML/R/AuthenticateWithRedditAPI.R @@ -12,14 +12,14 @@ #' @param appName character string containing the reddit app name associated with the API key. #' @param appKey character string containing the app key. #' @param appSecret character string containing the app secret. -#' @param useTokenCache logical. Use cached authentication token if found. +#' @param useCachedToken logical. Use cached authentication token if found. #' #' @return a reddit authentication token #' -AuthenticateWithRedditAPI <- function(appName, appKey, appSecret, useTokenCache) { +AuthenticateWithRedditAPI <- function(appName, appKey, appSecret, useCachedToken) { if (missing(appName)) { - appName <- "reddit" + appName <- "vosonSML-reddit" } if (missing(appKey) | missing(appSecret)) { @@ -27,8 +27,8 @@ AuthenticateWithRedditAPI <- function(appName, appKey, appSecret, useTokenCache) return() } - if (missing(useTokenCache)) { - useTokenCache <- FALSE + if (missing(useCachedToken)) { + useCachedToken <- FALSE } # sets up oauth2 for reddit @@ -44,7 +44,7 @@ AuthenticateWithRedditAPI <- function(appName, appKey, appSecret, useTokenCache) scope = c("read"), use_basic_auth = TRUE, config_init = user_agent("httr oauth"), - cache = useTokenCache) + cache = useCachedToken) return(reddit_token) } diff --git a/vosonSML/R/AuthenticateWithTwitterAPI.R b/vosonSML/R/AuthenticateWithTwitterAPI.R index 2db9c27..622f9a5 100644 --- a/vosonSML/R/AuthenticateWithTwitterAPI.R +++ b/vosonSML/R/AuthenticateWithTwitterAPI.R @@ -1,89 +1,62 @@ -#' Note: this function is DEPRECATED and will be removed in a future release. -#' Please use the \code{Authenticate} function +#' Note: this function is DEPRECATED. Please use the \code{\link{Authenticate}} function. #' -#' Twitter API Authentication +#' Twitter API authentication #' -#' Oauth based authentication with the Twitter API +#' Oauth based authentication using the Twitter API. #' -#' In order to collect data from Twitter, the user must first authenticate with -#' Twitter's Application Programming Interface (API). +#' In order to collect data from Twitter, the user must first authenticate with Twitter's API. This requires setting up +#' an app on Twitter. A useful guide to creating an app can be found in the rtweet documentation: +#' https://rtweet.info/articles/auth.html#creating-a-twitter-app #' -#' This requires setting up an App on Twitter. 
An excellent guide to achieving -#' this can be found at: -#' http://thinktostart.com/twitter-authentification-with-r/ -#' -#' @param api_key character string specifying the 'API key' used for -#' authentication. -#' @param api_secret character string specifying the 'API secret' used for -#' authentication. -#' @param access_token character string specifying the 'access token' used for -#' authentication. -#' @param access_token_secret character string specifying the 'access token -#' secret' used for authentication. -#' @param createToken logical. !! NOT PROPERLY IMPLEMENTED YET. -#' @return This is called for its side effect. -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso \code{AuthenticateWithFacebookAPI} and -#' \code{AuthenticateWithYouTubeAPI} for other ways to collect social media -#' data. -#' @keywords twitter social media SNA -#' @examples -#' -#' \dontrun{ -#' # Firstly specify your API credentials -#' my_api_key <- "1234567890qwerty" -#' my_api_secret <- "1234567890qwerty" -#' my_access_token <- "1234567890qwerty" -#' my_access_token_secret <- "1234567890qwerty" -#' -#' AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, -#' access_token=my_access_token, access_token_secret=my_access_token_secret) -#' } -#' @export -AuthenticateWithTwitterAPI <- -function(api_key, api_secret, access_token, access_token_secret, createToken) { - - # EnsurePackage("tm") # we only load packages as required (i.e. if user authenticate with twitter, then we load packages for twitter data collection/analysis) - # EnsurePackage("stringr") - # EnsurePackage("twitteR") - # EnsurePackage("RCurl") - # EnsurePackage("bitops") - # EnsurePackage("rjson") - # EnsurePackage("plyr") - # EnsurePackage("igraph") - - if (missing(api_key) | missing(api_secret) | missing(access_token) | missing(access_token_secret)) { +#' @param appName Character string. Specifies the twitter registered app name associated with API keys. +#' @param apiKey Character string. Specifies the app 'API key' used for authentication. +#' @param apiSecret Character string. Specifies the app 'API secret'. +#' @param accessToken Character string. Specifies the app 'access token'. +#' @param accessTokenSecret Character string. Specifies the app 'access token secret'. +#' @param useCachedToken Logical. If \code{TRUE} uses cached API token if found otherwise creates one. +#' +#' @return twitter_oauth. Returns a twitter oauth token object. +#' +#' @seealso \code{\link{Authenticate}} +#' @keywords authenticate twitter +#' +AuthenticateWithTwitterAPI <- function(appName, apiKey, apiSecret, accessToken, accessTokenSecret, + useCachedToken) { + + if (missing(apiKey) | missing(apiSecret) | missing(accessToken) | missing(accessTokenSecret)) { cat("Error. One or more API credentials arguments are missing.\nPlease specify these. \n") - return() + return(NULL) } - - # We avoid the popup prompt about cached authentication, - # and instead include a `createToken` argument in the function, - # and directly set the options parameter for the "httr" package. 
- # (And default to no token if the argument is missing) - - origOptions <- options("httr_oauth_cache") # original options setting - - if (missing(createToken)) { - createToken <- FALSE # default to no token + + if (missing(appName)) { + appName <- "vosonSML-twitter" } - - if (createToken=="TRUE" | createToken=="true" | createToken=="T" | createToken==TRUE) { - createToken <- TRUE # handling user input - } - - if (createToken) { - options(httr_oauth_cache=T) + + twitter_oauth <- NULL + token_file_name <- ".twitter-oauth" + + if (useCachedToken) { + if (file.exists(token_file_name)) { + cat("\nCached twitter token was found (using cached token).\n") + twitter_oauth <- LoadCredential(token_file_name) + # todo: check loaded token is valid before returning + return(twitter_oauth) + } else { + cat("\nOAuth token not found. A token will be created and saved to working directory.\n") + } } - else { - options(httr_oauth_cache=F) + + twitter_oauth <- rtweet::create_token( + app = appName, + consumer_key = apiKey, + consumer_secret = apiSecret, + access_token = accessToken, + access_secret = accessTokenSecret, + set_renv = FALSE) + + if (useCachedToken) { + SaveCredential(twitter_oauth, filename = token_file_name) } - - setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret) - - options(httr_oauth_cache=origOptions) # reset options back to the original setting - - return() - + + return(twitter_oauth) } diff --git a/vosonSML/R/AuthenticateWithYoutubeAPI.R b/vosonSML/R/AuthenticateWithYoutubeAPI.R index af818f1..ac4681e 100644 --- a/vosonSML/R/AuthenticateWithYoutubeAPI.R +++ b/vosonSML/R/AuthenticateWithYoutubeAPI.R @@ -5,14 +5,13 @@ #' In order to collect data from YouTube, the user must first authenticate with Google's Application Programming #' Interface (API). Users can obtain a Google Developer API key at: https://console.developers.google.com. #' -#' @param apiKeyYoutube character string specifying your Google Developer API key. +#' @param apiKey character string specifying your Google Developer API key. #' #' @return This is called for its side effect. #' #' @note In the future this function will enable users to save the API key in working directory, and the function will #' automatically look for a locally stored key whenever it is called without apiKeyYoutube argument. #' -#' @noRd -authenticateWithYoutubeAPI <- function(apiKeyYoutube) { - return(apiKeyYoutube) +AuthenticateWithYoutubeAPI <- function(apiKey) { + return(apiKey) } diff --git a/vosonSML/R/Collect.R b/vosonSML/R/Collect.R index ea13fa3..10148d3 100644 --- a/vosonSML/R/Collect.R +++ b/vosonSML/R/Collect.R @@ -1,102 +1,123 @@ #' Collect data from social media for generating networks #' -#' This function collects data from social media APIs, and structures the data -#' into a data frame of class \code{dataSource.*}, ready for creating networks -#' for further analysis. \code{Collect} is the second step of the -#' \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is -#' a convenient UI wrapper to the core CollectDataFrom* family of functions. -#' -#' -#' @param credential \code{credential} object generated from -#' \code{Authenticate} -#' @param ego logical, collecting ego network data. Currently only support -#' Instagram. -#' @param ... 
additional parameters for data collection (refer to -#' CollectDataFrom* and CollectEgo* functions) -#' -#' \code{facebook}: pageName, rangeFrom, rangeTo, verbose, n, writeToFile, dynamic -#' \code{youtube}: videoIDs, verbose, writeToFile, maxComments -#' \code{twitter}: searchTerm, numTweets, verbose, writeToFile, language -#' \code{instagram}: credential, tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, writeToFile, -#' waitForRateLimit -#' \code{reddit}: threadUrls, waitTime, writeToFile -#' -#' \code{instagram} with \code{ego} = TRUE: username, userid, verbose, -#' degreeEgoNet, waitForRateLimit, getFollows -#' @return A data.frame object of class \code{dataSource.*} that can be used -#' with \code{Create}. -#' @author Chung-hong Chan -#' @seealso \code{CollectDataFacebook}, -#' \code{CollectDataInstagram}, -#' \code{CollectDataTwitter}, -#' \code{CollectEgoInstagram}, -#' \code{CollectDataReddit}, -#' @examples +#' This function collects data from social media APIs, and structures the data into a data frame of class +#' \code{dataSource.*}, ready for creating networks for further analysis. \code{Collect} is the second step of the +#' \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is a convenient UI wrapper to the core +#' CollectDataFrom* family of functions. +#' +#' @param credential A \code{credential} object generated from \code{Authenticate}. +#' @param ego Logical. If \code{TRUE}, collect ego network data. Currently only supports Instagram. +#' @param ... Additional parameters for data collection appropriate to the credential \code{socialmedia} type. +#' Refer to CollectDataFrom* and CollectEgo* functions for more details. +#' \describe{ +#' \item{twitter:}{\code{authToken, searchTerm, [searchType, numTweets, includeRetweets, retryOnRateLimit,}\cr +#' \code{writeToFile, verbose, ...]}} +#' \item{youtube:}{\code{videoIDs, apiKeyYoutube, [verbose, writeToFile, maxComments]}} +#' \item{reddit:}{\code{threadUrls, [waitTime, writeToFile]}} +#' \item{instagram:}{\code{tag, n, lat, lng, [distance, folder, mindate, maxdate, verbose, sleep,}\cr +#' \code{writeToFile, waitForRateLimit, credential]}} +#' \item{instagram with \code{ego = TRUE}:}{\code{username, userid, [verbose, degreeEgoNet,}\cr +#' \code{waitForRateLimit, getFollows, credential]}} +#' \item{facebook:}{\code{pageName, [rangeFrom, rangeTo, verbose, n, writeToFile, dynamic]}} +#' } +#' +#' @return A data.frame object of class \code{dataSource.*} that can be used with \code{Create}. 
#' +#' @seealso \code{Authenticate}, \code{Create} +#' @keywords collect twitter youtube reddit instagram facebook +#' +#' @examples #' \dontrun{ #' require(magrittr) -#' ## Instagram ego network example -#' myAppID <- "123456789098765" -#' myAppSecret <- "abc123abc123abc123abc123abc123ab" -#' myUsernames <- c("senjohnmccain","obama") -#' -#' Authenticate("instagram", -#' appID = myAappId, -#' appSecret = myAppSecret) %>% Collect(ego = TRUE, -#' username = myUsernames) %>% Create -#' -#' ## YouTube actor network example -#' my_apiKeyYoutube <- "314159265358979qwerty" -#' videoIDs <- c("W2GZFeYGU3s","mL27TAJGlWc") -#' -#' Authenticate("youtube", -#' apiKey = my_apiKeyYoutube) %>% Collect(videoIDs = videoIDs) %>% Create('actor') +#' +#' ## youtube actor network example +#' +#' myYoutubeAPIKey <- "xxxxxxxxxxxxxxxxxxxxxx" +#' listYoutubeVideoIDs <- c("W2GZFeYGU3s", "mL27TAJGlWc") +#' +#' myActorNetwork <- Authenticate("youtube", apiKey = myYoutubeAPIKey) %>% +#' Collect(videoIDs = listYoutubeVideoIDs) %>% Create("actor") +#' +#' ## instagram ego network example +#' +#' myInstaAppID <- "xxxxxxxxxxx" +#' myInstaAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +#' listInstaUsernames <- c("senjohnmccain", "obama") +#' +#' myEgoNetwork <- Authenticate("instagram", appID = myInstaAppID, appSecret = myInstaAppSecret) %>% +#' Collect(ego = TRUE, username = listInstaUsernames) %>% Create("ego") +#' +#' ## facebook bimodal network example +#' +#' myFacebookAppID <- "xxxxxxxxxxx" +#' myFacebookAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +#' +#' myBimodalNetwork <- Authenticate("Facebook", appID = myFacebookAppID, +#' appSecret = myFacebookAppSecret) %>% +#' SaveCredential("FBCredential.RDS") %>% +#' Collect(pageName = "StarWars", rangeFrom = "2015-03-01", rangeTo = "2015-03-02", +#' writeToFile = FALSE) %>% +#' Create("bimodal") +#' +#' ## facebook dynamic network example +#' +#' myDynamicNetwork <- LoadCredential("FBCredential.RDS") %>% +#' Collect(pageName = "StarWars", rangeFrom = "2015-03-01", rangeTo = "2015-03-02", +#' writeToFile = FALSE) %>% +#' Create("dynamic") #' } -#' +#' #' @export Collect <- function(credential, ego = FALSE, ...) { - if (ego) { - collector <- switch(credential$socialmedia, - instagram = instagramEgo, - stop("Unsupported socialmedia") - ) - } else { - collector <- switch(credential$socialmedia, - facebook = facebookCollector, - youtube = youtubeCollector, - twitter = twitterCollector, - instagram = instagramCollector, - reddit = redditCollector, - stop("Unsupported socialmedia") - ) - } - return(collector(credential, ...)) + if (ego) { + collector <- switch(credential$socialmedia, + instagram = instagramEgo, + stop("Unsupported socialmedia")) + } else { + collector <- switch(credential$socialmedia, + twitter = twitterCollector, + youtube = youtubeCollector, + reddit = redditCollector, + instagram = instagramCollector, + facebook = facebookCollector, + stop("Unsupported socialmedia")) + } + + return(collector(credential, ...)) } -### *collector functions should not be exported. It is just a bunch of helper functions to bridge the CollectDataFrom* functions with Collect(), but with credential obj as the first argument - -youtubeCollector <- - function(credential, videoIDs, verbose, writeToFile, maxComments) { - return(collectDataYoutube(videoIDs, apiKeyYoutube = credential$auth, verbose, writeToFile, maxComments)) +twitterCollector <- function(credential, ...) 
{ + return(CollectDataTwitter(authToken = credential$auth, ...)) } -facebookCollector <- - function(credential,pageName,rangeFrom,rangeTo,verbose,n,writeToFile) { - return(CollectDataFacebook(pageName,rangeFrom,rangeTo,verbose,n,writeToFile, credential)) +# twitterCollector <- function(credential, searchTerm, searchType, numTweets, includeRetweets, retryOnRateLimit, +# writeToFile, verbose, ...) { +# return(CollectDataTwitter(authToken = credential$auth, searchTerm, searchType, numTweets, includeRetweets, +# retryOnRateLimit, writeToFile, verbose, ...)) +# } + +youtubeCollector <- function(credential, ...) { + return(CollectDataYoutube(apiKey = credential$auth, ...)) } -twitterCollector <- function(credential, searchTerm, numTweets, verbose, writeToFile, language, ...) { - return(CollectDataTwitter(searchTerm, numTweets, verbose, writeToFile, language, ...)) # credential means nothing to twitteR +# youtubeCollector <- function(credential, videoIDs, verbose, writeToFile, maxComments) { +# return(CollectDataYoutube(videoIDs, apiKeyYoutube = credential$auth, verbose, writeToFile, maxComments)) +# } + +redditCollector <- function(credential, threadUrls, waitTime, writeToFile) { + return(CollectDataReddit(threadUrls, waitTime, writeToFile)) } -instagramCollector <- function(credential, tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, writeToFile, waitForRateLimit) { - return(CollectDataInstagram(tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, writeToFile, waitForRateLimit, credential)) +instagramCollector <- function(credential, tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, + writeToFile, waitForRateLimit) { + return(CollectDataInstagram(tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, writeToFile, + waitForRateLimit, credential)) } instagramEgo <- function(credential, username, userid, verbose, degreeEgoNet, waitForRateLimit, getFollows) { - return(CollectEgoInstagram(username, userid, verbose, degreeEgoNet, waitForRateLimit, getFollows, credential)) + return(CollectEgoInstagram(username, userid, verbose, degreeEgoNet, waitForRateLimit, getFollows, credential)) } -redditCollector <- function(credential, threadUrls, waitTime, writeToFile) { - return(CollectDataReddit(threadUrls, waitTime, writeToFile)) +facebookCollector <- function(credential, pageName, rangeFrom, rangeTo, verbose, n, writeToFile) { + return(CollectDataFacebook(pageName, rangeFrom, rangeTo, verbose, n, writeToFile, credential)) } diff --git a/vosonSML/R/CollectDataReddit.R b/vosonSML/R/CollectDataReddit.R index 40278e9..4f74a35 100644 --- a/vosonSML/R/CollectDataReddit.R +++ b/vosonSML/R/CollectDataReddit.R @@ -11,23 +11,13 @@ #' @return A data frame object of class dataSource.reddit that can be used for creating unimodal #' networks (CreateActorNetwork). #' -CollectDataReddit <- function(threadUrls, waitTime = 5, writeToFile) { +CollectDataReddit <- function(threadUrls, waitTime = 5, writeToFile = FALSE) { - if (missing(threadUrls)) { - cat("Error. Argument `threadUrls` is missing.\nPlease provide a reddit thread url.\n") - return(NA) + if (missing(threadUrls) || !is.vector(threadUrls) || length(threadUrls) < 1) { + stop("Please provide a vector of one or more reddit thread urls.\n", call. = FALSE) } - if (!is.vector(threadUrls) || length(threadUrls) < 1) { - cat("Error. 
Please provide a vector of one or more reddit thread urls.\n") - return(NA) - } - - if (missing(writeToFile)) { - writeToFile <- FALSE - } - - cat("\nCollecting thread data for reddit urls:\n") + cat("Collecting thread data for reddit urls...\n") # make the get request for the reddit thread url threads_df <- RedditExtractoR::reddit_content(threadUrls, waitTime) @@ -36,13 +26,12 @@ CollectDataReddit <- function(threadUrls, waitTime = 5, writeToFile) { threads_df$thread_id <- gsub("^(.*)?/comments/([0-9A-Za-z]{6})?/.*?(/)?$", "\\2", threads_df$URL, ignore.case = TRUE, perl = TRUE) - if (isTrueValue(writeToFile)) { - writeOutputFile(threads_df, "csv", "RedditData") - } + if (writeToFile) { writeOutputFile(threads_df, "csv", "RedditData") } class(threads_df) <- append(class(threads_df), c("dataSource", "reddit")) - cat("\nDone!\n") + cat("Done.\n") + flush.console() return(threads_df) } \ No newline at end of file diff --git a/vosonSML/R/CollectDataTwitter.R b/vosonSML/R/CollectDataTwitter.R index 1cf1d50..9b3d75a 100644 --- a/vosonSML/R/CollectDataTwitter.R +++ b/vosonSML/R/CollectDataTwitter.R @@ -1,232 +1,99 @@ -#' Note: this function is DEPRECATED and will be removed in a future release. -#' Please use the \code{Collect} function +#' Note: this function is DEPRECATED. Please use the \code{\link{Collect}} function. #' #' Collect data from Twitter for generating different types of networks #' -#' This function collects data from Twitter based on hashtags or search terms, -#' and structures the data into a data frame of class -#' \code{dataSource.twitter}, ready for creating networks for further analysis. +#' This function collects data from Twitter based on hashtags or search terms, and structures the data into a data +#' frame of class \code{dataSource.twitter}, ready for creating networks for further analysis. #' -#' \code{CollectDataTwitter} collects public 'tweets' from Twitter using the -#' Twitter API. +#' \code{CollectDataTwitter} collects public 'tweets' from Twitter using the Twitter API. #' -#' The function then finds and maps the relationships of entities of interest -#' in the data (e.g. users, terms, hashtags), and structures these -#' relationships into a data frame format suitable for creating unimodal -#' networks (\code{CreateActorNetwork}), bimodal networks -#' (\code{CreateBimodalNetwork}), and semantic networks +#' The function then finds and maps the relationships of entities of interest in the data (e.g. users, terms, hashtags) +#' , and structures these relationships into a data frame format suitable for creating unimodal networks +#' (\code{CreateActorNetwork}), bimodal networks (\code{CreateBimodalNetwork}), and semantic networks #' (\code{CreateSemanticNetwork}). #' -#' The maximum number of tweets for a single call of \code{CollectDataTwitter} -#' is 1500. +#' The maximum number of tweets for a single call of \code{CollectDataTwitter} is 1500. #' -#' Language support is available, using the \code{language} argument. The user -#' can restrict tweets returned to a particular language, using the ISO 639-1 -#' code. For example, restricting to English would use \code{language="en"}. -#' The full list of codes is available here: -#' https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. +#' Language support is available, using the \code{language} parameter. The user can restrict tweets returned to a +#' particular language, using the ISO 639-1 code. For example, restricting to English would use \code{language="en"}. 
+#' The full list of codes is available here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. #' -#' A variety of query operators are available through the Twitter API. For -#' example, "love OR hate" returns any tweets containing either term (or both). -#' For more information see the Twitter API documentation (under the heading +#' A variety of query operators are available through the Twitter API. For example, "love OR hate" returns any tweets +#' containing either term (or both). For more information see the Twitter API documentation (under the heading #' 'Query Operators'): https://dev.twitter.com/rest/public/search #' -#' @param searchTerm character string, specifying a search term or phrase (e.g. -#' "Australian politics") or hashtag (e.g. "#auspol"). Many query operators are -#' available - see the Twitter documentation for more information: +#' @param authToken Twitter oauth token created by rtweet. +#' @param searchTerm Character string. Specifies a search term or phrase (e.g. "Australian politics") or hashtag (e.g. +#' "#auspol"). Many query operators are available - see the Twitter documentation for more information: #' https://dev.twitter.com/rest/public/search -#' @param numTweets numeric integer, specifying how many tweets to be -#' collected. Defaults to 1500. Maximum tweets for a single call of this -#' function is 1500. -#' @param verbose logical. If \code{TRUE} then this function will output -#' runtime information to the console as it computes. Useful diagnostic tool -#' for long computations. Default is \code{FALSE}. -#' @param writeToFile logical. If \code{TRUE} then the data is saved to file in -#' current working directory (CSV format), with filename denoting current -#' system time and \code{searchTerm}. Default is \code{FALSE}. -#' @param language character string, restricting tweets to the given language, -#' given by an ISO 639-1 code. For example, "en" restricts to English tweets. -#' Defaults to NULL. -#' @param since If not NULL, restricts tweets to those since the given date. Date is to be formatted -#' as YYYY-MM-DD (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param until If not NULL, restricts tweets to those up until the given date. Date is to be formatted -#' as YYYY-MM-DD (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param locale If not NULL, will set the locale for the search. As of 03/06/11 only ja is effective, -#' as per the Twitter API (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param geocode If not NULL, returns tweets by users located within a given radius of the given -#' latitude/longitude. (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param sinceID If not NULL, returns tweets with IDs greater (ie newer) than the specified ID -#' (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param maxID If not NULL, returns tweets with IDs smaller (ie older) than the specified ID -#' (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param resultType If not NULL, returns filtered tweets as per value. See details for allowed values. -#' (this is a wrapper to the searchTwitter function in the twitteR package). -#' @param retryOnRateLimit If non-zero the search command will block retry up to X times if the rate limit -#' is experienced. 
This might lead to a much longer run time but the task will -#' eventually complete if the retry count is high enough (this is a wrapper to the searchTwitter -#' function in the twitteR package). -#' @return A data frame object of class \code{dataSource.twitter} that can be -#' used for creating unimodal networks (\code{CreateActorNetwork}), bimodal -#' networks (\code{CreateBimodalNetwork}), and semantic networks +#' @param searchType Character string. Returns filtered tweets as per search type \code{recent}, \code{mixed} or +#' \code{popular}. Default type is \code{recent}. +#' @param numTweets Numeric. Specifies how many tweets are to be collected. Default is \code{100}. +#' @param includeRetweets Logical. Specifies if the search results should include retweets. Default is \code{TRUE}. +#' @param retryOnRateLimit Logical. If \code{TRUE} the search will wait and retry when the twitter API rate limit is reached. Default is \code{FALSE}. +#' @param writeToFile Logical. If \code{TRUE} then the data is saved to file in current working directory (RDS format), +#' with filename denoting current system time and \code{searchTerm}. Default is \code{FALSE}. +#' @param verbose Logical. If \code{TRUE} then this function will output runtime information to the console as it +#' computes. Useful diagnostic tool for long computations. Default is \code{FALSE}. +#' @param ... Additional parameters to pass to the rtweet \code{search_tweets} function. +#' +#' @return A data frame object of class \code{dataSource.twitter} that can be used for creating unimodal networks +#' (\code{CreateActorNetwork}), bimodal networks (\code{CreateBimodalNetwork}), and semantic networks #' (\code{CreateSemanticNetwork}). -#' @note Data generated using this function is *not* suitable for dynamic -#' networks. Dynamic Twitter networks are not currently implemented in the -#' vosonSML package. This will be implemented in a future release. -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso \code{AuthenticateWithTwitterAPI} must be run first or no data will 
-#' @keywords twitter data mining SNA -#' @examples -#' -#' \dontrun{ -#' # Firstly specify your API credentials -#' my_api_key <- "1234567890qwerty" -#' my_api_secret <- "1234567890qwerty" -#' my_access_token <- "1234567890qwerty" -#' my_access_token_secret <- "1234567890qwerty" -#' -#' # Authenticate with the Twitter API using \code{AuthenticateWithTwitterAPI} -#' AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, -#' access_token=my_access_token, access_token_secret=my_access_token_secret) -#' -#' # Collect tweets data using \code{myTwitterData} -#' myTwitterData <- CollectDataTwitter(searchTerm="#auspol", -#' numTweets=150,writeToFile=FALSE,verbose=FALSE) -#' -#' # Create an 'actor' network using \code{CreateActorNetwork} -#' g_actor_twitter <- CreateActorNetwork(myTwitterData) -#' -#' # Create a 'bimodal' network using \code{CreateBimodalNetwork} -#' g_bimodal_twitter <- CreateBimodalNetwork(myTwitterData) -#' -#' # Create a 'semantic' network using \code{CreateSemanticNetwork} -#' g_semantic_twitter <- CreateSemanticNetwork(myTwitterData) -#' } -#' @export -CollectDataTwitter <- -function(searchTerm, numTweets, verbose, writeToFile, language, since, until, - locale, geocode, sinceID, maxID, resultType, retryOnRateLimit) { - -# cat(paste("DEBUG - numTweets is set to:", numTweets)) # DEBUG - - # handle the arguments - - if (missing(verbose)) { - verbose <- TRUE # default to verbose - } - - if (missing(language)) { - language <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - - if (verbose=="TRUE" | verbose=="true" | verbose=="T" | verbose==TRUE) { - verbose <- TRUE - } - else {verbose <- FALSE} - - if (missing(numTweets)) { - numTweets <- 1500 # default to 1500 max tweets - } - - # Ensure that argument `pageName` has been specified by user. - - if (missing(searchTerm)) { - cat("Error. Argument `searchTerm` is missing.\nPlease specify a search term or hashtag to collect data from.\n") - return(NA) - } - - if (missing(since)) { - since <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(until)) { - until <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(locale)) { - locale <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(geocode)) { - geocode <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(sinceID)) { - sinceID <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(maxID)) { - maxID <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(resultType)) { - resultType <- NULL # default to NULL (as per 'twitteR' package default) - } - - if (missing(retryOnRateLimit)) { - retryOnRateLimit <- 0 # default to NULL (as per 'twitteR' package default) - } - - # Start data collection -if (verbose) { - cat(paste("Now retrieving data based on search term: ",searchTerm,"\n",sep="")) +#' +#' @note Supported network types: \code{actor}, \code{bimodal}, \code{semantic} +#' +#' Data generated using this function is *not* suitable for dynamic networks. +#' +#' @seealso \code{Collect} +#' @keywords collect twitter +#' +CollectDataTwitter <- function(authToken = NULL, searchTerm = "", searchType = "recent", numTweets = 100, + includeRetweets = TRUE, retryOnRateLimit = FALSE, writeToFile = FALSE, + verbose = FALSE, ...) { + + if (!("Token" %in% class(authToken))) { + stop("OAuth token missing. 
Please use the Authenticate function to create and supply a token.\n", + call. = FALSE) + } + + searchTerm <- trimws(searchTerm) + cat(paste0("Collecting tweets", ifelse(searchTerm == "", "", paste0(" for search term: ", searchTerm)), "...\n")) flush.console() -} - # Collecting tweets based on hashtag / keyword - - tweetsData <- searchTwitter(searchTerm, n=numTweets, lang=language, since=since, until=until, - locale=locale, geocode=geocode, sinceID=sinceID, maxID=maxID, resultType=resultType, retryOnRateLimit=retryOnRateLimit) #1500 is max - - # Convert this data into a dataframe object, for ease of use - if (verbose) { - cat("Done\n") ### DEBUG - flush.console() - cat("Cleaning and sorting the data...\n") - } - df <- twListToDF(tweetsData) # a better way - - # rename metadata - names.twitteR <- c("screenName", "created") # change from - names.api <- c("screen_name", "created_at") # change to - for(name in names.twitteR) { - names(df)[which(names(df)==name)] <- names.api[which(names.twitteR==name)] - } - df$from_user <- df$screen_name - - # removing odd characters - df <- RemoveOddChars(df) - - # extract user info and add to df - df <- ExtractUserInfo(df) - - # extract HASHTAG info and add to df - df <- ExtractHashtagInfo(df) - if (verbose) { - cat("Done\n") ### DEBUG - flush.console() - } - ################################################ - - if (writeToFile=="TRUE" | writeToFile=="true" | writeToFile=="T" | writeToFile==TRUE) { - tweetsDataDF <- twListToDF(tweetsData) # we just want the original tweets data - currTime <- format(Sys.time(), "%b_%d_%X_%Y_%Z") - currTime <- gsub(":","_",currTime) - write.csv(tweetsDataDF,paste0(currTime,"_",searchTerm,"_TwitterData.csv")) - cat("Twitter data was written to current working directory, with filename:\n") - cat(paste0(currTime,"_",searchTerm,"_TwitterData.csv")) - } - - class(df) <- append(class(df),c("dataSource","twitter")) - - cat("\n") - - return(df) - - ################################################ - + + rtlimit <- rtweet::rate_limit(authToken, "search/tweets") + remaining <- rtlimit[["remaining"]] * 100 + if (retryOnRateLimit == TRUE & numTweets < remaining) { + cat(paste0("Requested ", numTweets, " tweets of ", remaining, " in this rate limit.\n")) + cat("Less tweets requested than remaining limit retryOnRateLimit set to FALSE.\n") + retryOnRateLimit <- FALSE + } + + search_params <- list() + search_params[['token']] <- authToken + + search_params['q'] <- searchTerm + search_params['type'] <- searchType + search_params['n'] <- numTweets + search_params['include_rts'] <- includeRetweets + search_params['retryonratelimit'] <- retryOnRateLimit + search_params['verbose'] <- verbose + + # additional twitter api params + dots <- substitute(...()) + search_params[['...']] <- dots + + tweets_df <- do.call(rtweet::search_tweets, search_params) + + cat(paste0("Collected ", nrow(tweets_df), " tweets.\n")) + + # rds chosen over csv to avoid flattening lists in the data + if (writeToFile) { writeOutputFile(tweets_df, "rds", "TwitterData") } + + cat("Done.\n") + flush.console() + + class(tweets_df) <- append(class(tweets_df), c("dataSource", "twitter")) + + return(tweets_df) } diff --git a/vosonSML/R/collectDataYoutube.R b/vosonSML/R/CollectDataYoutube.R similarity index 88% rename from vosonSML/R/collectDataYoutube.R rename to vosonSML/R/CollectDataYoutube.R index 9201cf2..a4d3f15 100644 --- a/vosonSML/R/collectDataYoutube.R +++ b/vosonSML/R/CollectDataYoutube.R @@ -12,11 +12,11 @@ #' For multiple videos, the user may wish to use the function 
GetYoutubeVideoIDs, which creates a character #' vector of video IDs from a plain text file of YouTube video URLs, which can then be used for the videoIDs #' argument of the function CollectDataYoutube. -#' +#' +#' @param apiKey character string, specifying the Google Developer API Key used for authentication. #' @param videoIDs character vector, specifying one or more YouTube video IDs. For example, if the video URL is #' 'https://www.youtube.com/watch?v=W2GZFeYGU3s', then use videoIDs='W2GZFeYGU3s'. For multiple videos, the #' function GetYoutubeVideoIDs can be used to create a vector object suitable as input for videoIDs. -#' @param apiKeyYoutube character string, specifying the Google Developer API Key used for authentication. #' @param verbose logical. If TRUE then this function will output runtime information to the console as it #' computes. Useful diagnostic tool for long computations. Default is FALSE. #' @param writeToFile logical. If TRUE then the data is saved to file in current working directory (CSV format), @@ -25,7 +25,7 @@ #' *does not* take into account 'reply' comments (i.e. replies to top-level comments), therefore the total number of #' comments collected may be higher than maxComments. By default this function attempts to collect all comments. #' -#' @return A data frame object of class dataSource.youtube that can be used for creating unimodal networks +#' @return A dataframe object of class dataSource.youtube that can be used for creating unimodal networks #' (CreateActorNetwork). #' #' @note Currently supported network types: unimodal 'actor' network; CreateActorNetwork. @@ -41,42 +41,19 @@ #' comments, and one of these top-level comments has 5 'child' or reply comments, then the total number of comments #' collected will be equal to 15. Currently, the user must 'guesstimate' the maxResults value, to collect a #' number of comments in the order of what they require. -#' -#' @author Timothy Graham & Robert Ackland -#' @seealso Authenticate must be run first or no data will be collected. #' -#' @noRd -collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeToFile = FALSE, maxComments) { - - if (missing(verbose)) { - verbose <- FALSE # default to not verbose - } - - if (missing(maxComments)) { - maxComments <- 10000000000000 # some arbitrary very large number - } - - if (missing(writeToFile)) { - writeToFile <- FALSE - } - - if (isTrueValue(verbose)) { - verbose <- TRUE - } +CollectDataYoutube <- function(apiKey, videoIDs, verbose = FALSE, writeToFile = FALSE, + maxComments = 10000000000000) { - if (missing(apiKeyYoutube)) { - cat(paste0("Error. Argument `apiKeyYoutube` is missing. Please specify a valid API key to collect data (i.e. your", - " Google Developer API Key).\n")) - return(NA) + # maxComments defaults to an arbitrary very large number + + if (missing(videoIDs) || !is.vector(videoIDs) || length(videoIDs) < 1) { + stop("Please provide a vector of one or more youtube video ids.\n", call. = FALSE) } - - if (missing(videoIDs)) { - cat(paste0("Error. Argument `videoIDs` is missing.\nPlease specify a vector of video IDs to collect data from.\n", - "Hint: to do this you can use the `GetYoutubeVideoIDs` function in this package.")) - return(NA) + + if (missing(apiKey) || nchar(apiKey) < 1) { + stop("Please provide a valid youtube api key.\n", call. 
= FALSE) } - - apiKey <- apiKeyYoutube # to play nice with existing code # Start data collection @@ -106,9 +83,7 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo ## Make a dataframe out of the results - if (verbose) { - cat(paste0("\n** Creating data frame from threads of ", videoIDs[k], ".\n\n", sep = "")) - } + if (verbose) { cat(paste0("** Creating dataframe from threads of ", videoIDs[k], ".\n", sep = "")) } tempData <- lapply(rObj$data, function(x) { data.frame(Comment = x$snippet$topLevelComment$snippet$textDisplay, @@ -197,7 +172,7 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo cat(paste0("\n** Collected replies: ", total_replies, "\n", sep = "")) cat(paste0("** Total video comments: ", length(commentIDs) + total_replies, "\n", sep = "")) - cat("---------------------------------------------------------------\n\n") + cat("---------------------------------------------------------------\n") ############################## Combine comment threads and replies ############################# @@ -222,7 +197,7 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo } if (verbose) { - cat("\nCleaning and structuring data. Please be patient.\n") + cat("Cleaning and structuring data. Please be patient.\n") } ############################## Map relations between users into dataframe ############################# @@ -244,7 +219,7 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo usernamesCleaned <- escapeRegex(usernamesCleaned) # NEW WAY (OPTIMISED - better, faster, stronger...) - dataCombined$ReplyToAnotherUser <- searchCommentsForMentions(commentsTextCleaned, usernamesCleaned) + dataCombined$ReplyToAnotherUser <- SearchCommentsForMentions(commentsTextCleaned, usernamesCleaned) ## Map the comment replies within PARENT COMMENT THREADS into dataframe @@ -260,11 +235,10 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo } } - if (isTrueValue(writeToFile)) { - writeOutputFile(dataCombined, "csv", "YoutubeData") - } + if (writeToFile) { writeOutputFile(dataCombined, "csv", "YoutubeData") } - cat("\nDone!\n") + cat("Done.\n") + flush.console() ############################################################################# # return dataframe to environment @@ -277,7 +251,6 @@ collectDataYoutube <- function(videoIDs, apiKeyYoutube, verbose = FALSE, writeTo } ## Set up a class and methods/functions for scraping - yt_scraper <- setRefClass( "yt_scraper", fields = list( @@ -339,7 +312,7 @@ yt_scraper <- setRefClass( scrape_all = function(maxComments) { cat(paste0("** video Id: ", api_opts$videoId ,"\n", sep = "")) if (verbose) { - cat(paste0(" [results per page: ", api_opts$maxResults, " | max comments per video: ", maxComments, "]\n\n", + cat(paste0(" [results per page: ", api_opts$maxResults, " | max comments per video: ", maxComments, "]\n", sep = "")) } @@ -350,7 +323,7 @@ yt_scraper <- setRefClass( thread_count <- scrape() if (verbose) { - cat(paste0("-- Collected threads from page: ", thread_count, "\n\n", sep = "")) + cat(paste0("-- Collected threads from page: ", thread_count, "\n", sep = "")) } if (thread_count == 0 | length(data) > maxComments) { @@ -364,9 +337,7 @@ yt_scraper <- setRefClass( data <<- data[1:maxComments] } - if (verbose) { - cat(paste0("-- Done collecting threads.\n\n", sep = "")) - } + if (verbose) { cat(paste0("-- Done collecting threads.\n", sep = "")) } break } @@ -418,7 +389,7 @@ yt_scraper <- setRefClass( }) 
core_df <<- do.call("rbind", sub_data) } else { - message("\n`core_df` is already up to date.\n") + message("core_df is already up to date.\n") } } ) diff --git a/vosonSML/R/Create.R b/vosonSML/R/Create.R index 8b00e1e..10c1f22 100644 --- a/vosonSML/R/Create.R +++ b/vosonSML/R/Create.R @@ -1,50 +1,31 @@ #' Create networks from social media data #' -#' This function creates networks from social media data (i.e. from data frames of class \code{dataSource}. -#' \code{Create} is the final step of the \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is -#' a convenient UI wrapper to the core create*Network family of functions. +#' This function creates networks from social media data (i.e. collected from dataframes of class \code{social media}). +#' \code{Create} is the final step of the \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function +#' is a wrapper for the Create*Network S3 methods. #' -#' Note: when creating Twitter networks, the user information can be collected separately using the -#' \code{\link{PopulateUserInfo}} function and stored into the network as vertex attributes (this involves additional -#' calls to the Twitter API). +#' @param dataSource Social media data collected using the \code{Collect} method. +#' @param type Character string. Type of network to be created, can be \code{actor}, \code{bimodal}, +#' \code{dynamic}, \code{semantic} or \code{ego}. +#' @param ... Additional parameters for network creation for appropriate \code{social media} and network \code{type}. +#' Refer to S3 methods \code{social media} type for default parameters. #' -#' @param dataSource a data frame of class \code{dataSource} -#' @param type character, type of network to be created, currently supports "actor", "bimodal", "dynamic", "semantic" -#' and "ego" -#' @param ... additional parameters for create*Network functions -#' @return an igraph graph object -#' -#' @author Chung-hong Chan -#' -#' @examples -#' \dontrun{ -#' require(magrittr) -#' -#' ## instagram ego network example -#' -#' my_app_id <- "123456789098765" -#' my_app_secret <- "abc123abc123abc123abc123abc123ab" -#' my_usernames <- c("senjohnmccain", "obama") -#' -#' my_ego_network <- Authenticate("instagram", appID = my_app_id, appSecret = my_app_secret) %>% -#' Collect(ego = TRUE, username = my_usernames) %>% Create -#' -#' ## youtube actor network example -#' -#' my_api_key <- "314159265358979qwerty" -#' my_video_ids <- c("W2GZFeYGU3s","mL27TAJGlWc") +#' @return Network data containing an igraph object. #' -#' my_actor_network <- Authenticate("youtube", apiKey = my_api_key) %>% -#' Collect(videoIDs = my_video_ids) %>% Create('actor') +#' @note When creating twitter networks, a network with additional user information can be generated using the +#' \code{\link{GraphUserInfoTwitter}} function. Additional calls can be made to the twitter API to get information +#' about users that were identified as nodes during network creation. +#' +#' @seealso \code{\link{CreateActorNetwork}}, \code{\link{CreateBimodalNetwork}}, \code{\link{CreateSemanticNetwork}} +#' @keywords create actor bimodal semantic network #' -#' } #' @export Create <- function(dataSource, type = "actor", ...) 
{ - + # if ego is in the class list if (inherits(dataSource, "ego")) { - return(CreateEgoNetworkFromData(dataSource)) ## you cannot create actor out of ego data + return(CreateEgoNetworkFromData(dataSource)) # you cannot create actor out of ego data } - + creator <- switch(tolower(type), actor = CreateActorNetwork, bimodal = CreateBimodalNetwork, @@ -52,9 +33,12 @@ Create <- function(dataSource, type = "actor", ...) { semantic = CreateSemanticNetwork, ego = CreateEgoNetworkFromData, stop("Unknown Type")) - - network_to_return <- creator(dataSource, ...) - class(network_to_return) <- append(class(network_to_return), c("vosonSML")) - - return(network_to_return) + + # call the method mapped to the network type, passing through any additional parameters + networkToReturn <- creator(dataSource, ...) + + # append the vosonSML class to the class attribute of the returned network + class(networkToReturn) <- append(class(networkToReturn), c("vosonSML")) + + return(networkToReturn) } diff --git a/vosonSML/R/CreateActorNetwork.R b/vosonSML/R/CreateActorNetwork.R index daf6e7b..971594e 100644 --- a/vosonSML/R/CreateActorNetwork.R +++ b/vosonSML/R/CreateActorNetwork.R @@ -1,46 +1,30 @@ -#' Create actor networks from social media data +#' Create an actor network from social media data #' -#' This function creates a unimodal 'actor' network from social media data (i.e. from data frames of class dataSource, -#' or for Twitter data it is also possible to provide a list of data frames). In this actor network, edges represent -#' relationships between actors of the same type (e.g. interactions between Twitter users). For example, with Twitter -#' data an interaction is defined as a 'mention' or 'reply' or 'retweet' from user i to user j, given 'tweet' m. With -#' YouTube comments, an interaction is defined as a 'reply' or 'mention' from user i to user j, given 'comment' m. +#' This function creates an actor network from social media data collected using the \code{Collect} method. Edges in +#' the network represent interactions or relationships between the actors. For example, with twitter data an +#' interaction is defined as a 'mention', 'reply' or 'retweet' from user i to user j, given 'tweet' m. With youtube +#' comments, an interaction is defined as a 'reply' from user i to user j, given 'comment' m. The resulting network is +#' returned as an igraph object. #' -#' This function creates a (weighted and directed) unimodal 'actor' network from a data frame of class dataSource -#' (which are created using the CollectData family of functions in the vosonSML package), or a list of Twitter data -#' frames collected using CollectDataTwitter function. +#' @param x Collected social media data with a \code{social media} class attribute. +#' @param ... Additional parameters to pass to the network creation method. +#' @param writeToFile Logical. Save network data to a file in the current working directory. Default is \code{FALSE}. #' -#' The resulting network is an igraph graph object. This graph object is unimodal because edges represent relationships -#' between vertices of the same type (read: actors), such as replies/retweets/mentions between Twitter users. Edges are -#' directed and weighted (e.g. if user i has replied n times to user j, then the weight of this directed edge equals n). +#' @seealso \code{\link{Create}} +#' @keywords create actor twitter youtube reddit #' -#' @param x a data frame of class dataSource. For Twitter data, it is also possible to provide a list of data frames -#' (i.e.
data frames that inherit class dataSource and twitter). Only lists of Twitter data frames are supported at -#' this time. If a list of data frames is provided, then the function binds these row-wise and computes over the entire -#' data set. -#' @param writeToFile logical. If TRUE then the network is saved to file in current working directory (GRAPHML format), -#' with filename denoting the current date/time and the type of network -#' @param ... additional parameters to pass to the network creation method -#' @return an igraph graph object, with directed and weighted edges -#' -#' @note Not all data sources in vosonSML can be used for creating actor networks. -#' Currently supported data sources are: YouTube, Twitter -#' -#' Other data sources (e.g. Facebook) will be implemented in the future. The user is notified if they try to create -#' actor networks for incompatible data sources. -#' -#' For Twitter data, actor networks can be created from multiple data frames (i.e. datasets collected individually -#' using CollectDataTwitter). Simply create a list of the data frames that you wish to create a network from. For -#' example: my_list <- list(my_twitter_data_1, my_twitter_data_2, my_twitter_data_3) -#' -#' @author Timothy Graham , Robert Ackland -#' -#' @noRd -CreateActorNetwork <- function(x, writeToFile, ...) { +#' @export +CreateActorNetwork <- function(x, ...) { + # searches the class list of x for matching method + UseMethod("CreateActorNetwork", x) +} - if (missing(writeToFile)) { - writeToFile <- FALSE +#' @rdname CreateActorNetwork +#' @export +CreateActorNetwork.default <- function(x, ...) { + cat("Cannot create actor network using this type of data.\n") + + if (inherits(x, "temporal")) { + cat("The data you supplied is temporal. Please use the CreateDynamicNetwork function for temporal data.\n") } - - UseMethod("CreateActorNetwork", x) } diff --git a/vosonSML/R/CreateActorNetwork.default.R b/vosonSML/R/CreateActorNetwork.default.R deleted file mode 100644 index ff04e88..0000000 --- a/vosonSML/R/CreateActorNetwork.default.R +++ /dev/null @@ -1,11 +0,0 @@ -CreateActorNetwork.default <- -function(x,writeToFile) - { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - cat("Error. Cannot create actor network using this type of data (see help file for data types and sources).\n") - if (inherits(x,"temporal")) { - cat("(The data you supplied is temporal. Please use the `CreateDynamicNetwork` function for temporal data.)\n") - } - } diff --git a/vosonSML/R/CreateActorNetwork.reddit.R b/vosonSML/R/CreateActorNetwork.reddit.R index aac8d77..e8d3d1f 100644 --- a/vosonSML/R/CreateActorNetwork.reddit.R +++ b/vosonSML/R/CreateActorNetwork.reddit.R @@ -1,49 +1,30 @@ -#' Creates a reddit actor network from collected threads -#' -#' Uses RedditExtractoR::user_network to create an igraph directed actor network with comment ids as edge attribute. -#' -#' @param x a dataframe as vosonSML class object containing collected social network data -#' @param weightEdges logical. Combines and weights directed edges. Can't be used with includeTextData. -#' @param includeTextData logical. If the igraph network edges should include the comment text as attribute. -#' @param cleanText logical. If non-alphanumeric, non-punctuation, and non-space characters should be removed from the -#' included text attribute data. Default is TRUE -#' @param writeToFile logical. If the igraph network graph should be written to file. 
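# A self-contained illustration of the S3 dispatch used by CreateActorNetwork()
# above: UseMethod() picks the method matching the object's class vector and
# falls back to the .default method. The names below are examples, not package code.
NetworkExample <- function(x, ...) UseMethod("NetworkExample", x)
NetworkExample.default <- function(x, ...) cat("Cannot create a network from this type of data.\n")
NetworkExample.reddit <- function(x, ...) cat("Creating a reddit actor network...\n")

mockData <- data.frame(comment = "hello", stringsAsFactors = FALSE)
class(mockData) <- append(class(mockData), "reddit")
NetworkExample(mockData)           # dispatches to NetworkExample.reddit
NetworkExample(data.frame(x = 1))  # falls back to NetworkExample.default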
-#' -#' @note Can create three types of network graphs: -#' * Directed graph with subreddit, thread_ids and comment ids as edge attributes - default option -#' * Directed graph with weighted edges (without comment ids) - weightEdges = TRUE -#' * Directed graph with comment text included as edge attribute - includeTextData = TRUE +# Creates a reddit actor network +# +# Uses RedditExtractoR::user_network to create an igraph directed actor network with comment ids as +# edge attributes. +# +#' @param weightEdges Logical. Combines and weights directed network edges. Default is \code{FALSE}. +#' @param textData Logical. If the igraph network should include the comment text as an edge attribute. +#' Cannot be used with the \code{weightEdges} parameter. Default is \code{FALSE}. +#' @param cleanText Logical. If non-alphanumeric, non-punctuation, and non-space characters should be removed from the +#' included text attribute data. Only applies if \code{textData = TRUE}. Default is \code{TRUE}. #' -#' Comment ids as edge attributes in graphs refer to the Collect dataframe comment id not reddits comment id -#' If "Forbidden control character 0x19 found in igraph_i_xml_escape, Invalid value" then set cleanText = TRUE +#' @return A reddit actor network as igraph object. #' -#' @return an igraph object of the actor network -#' -CreateActorNetwork.reddit <- function(x, weightEdges, includeTextData, cleanText, writeToFile) { +#' @rdname CreateActorNetwork +#' @export +CreateActorNetwork.reddit <- function(x, weightEdges = FALSE, textData = FALSE, cleanText = TRUE, + writeToFile = FALSE, ...) { - if (missing(writeToFile) || writeToFile != TRUE) { - writeToFile <- FALSE - } - - if (missing(weightEdges) || weightEdges != TRUE) { - weightEdges <- FALSE - } + # default cleanText = TRUE as reddit comments often contain forbidden XML control characters - # if weightEdges then includeTextData set FALSE - if (missing(includeTextData) || includeTextData != TRUE || weightEdges == TRUE) { - includeTextData <- FALSE - } + # if weightEdges then textData set FALSE + if (weightEdges) { textData <- FALSE } - # default cleanText = TRUE as reddit comments often contain forbidden XML control characters - if (missing(cleanText) || cleanText != FALSE) { - cleanText <- TRUE - } else { - cleanText <- FALSE - } + if (textData == FALSE) { cleanText <- FALSE } - if (includeTextData == FALSE) { - cleanText <- FALSE - } + cat("Generating reddit actor network...\n") + flush.console() # append string to file name to indicate different graph types, only used if writeToFile = TRUE appendToName <- "" @@ -55,17 +36,30 @@ CreateActorNetwork.reddit <- function(x, weightEdges, includeTextData, cleanText # modified from RedditExtractoR::user_network to include the df comment id, subreddit and thread id as edge # attributes to support post-processing. author of sender_receiver_df, node_df, and edge_df @ivan-rivera. 
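# A stand-alone illustration of the reddit 'structure' handling in the pipeline
# below: a comment with structure id "1_1_2" responds to "1_1", while top-level
# comments (no underscore) respond to nothing. The ids here are made up.
structure_ids <- c("1", "1_1", "1_1_2", "4_2")
response_to <- ifelse(!grepl("_", structure_ids), "",
                      gsub("_\\d+$", "", structure_ids))
response_to  # ""  "1"  "1_1"  "4"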
include_author <- TRUE - sender_receiver_df <- - thread_df %>% + + # select cols and rename id and user + sender_receiver_df <- thread_df %>% dplyr::select(.data$id, .data$subreddit, .data$thread_id, .data$structure, .data$user, .data$author, .data$comment) %>% - dplyr::rename("comment_id" = .data$id, "sender" = .data$user) %>% + dplyr::rename("comment_id" = .data$id, "sender" = .data$user) + + sender_receiver_df %<>% + # response_to = "" if structure doesnt have underscore in it + # else structure minus last digit '1_1_2' response_to = '1_1' dplyr::mutate(response_to = ifelse(!grepl("_", .data$structure), "", gsub("_\\d+$", "", .data$structure))) %>% + + # select structure and user from original df + # rename structure to response_to and user to receiver + # left join sender_receiver_df to response_to, receiver by response_to dplyr::left_join(thread_df %>% dplyr::select(.data$structure, .data$user) %>% dplyr::rename("response_to" = .data$structure, "receiver" = .data$user), - by = "response_to") %>% + by = "response_to") + + sender_receiver_df %<>% + # inserts author into missing receiver values dplyr::mutate(receiver = dplyr::coalesce(.data$receiver, ifelse(include_author, .data$author, ""))) %>% + # filter out when sender and receiver same, or if either deleted or empty string dplyr::filter(.data$sender != .data$receiver, !(.data$sender %in% c("[deleted]", "")), !(.data$receiver %in% c("[deleted]", ""))) %>% @@ -93,22 +87,24 @@ CreateActorNetwork.reddit <- function(x, weightEdges, includeTextData, cleanText # weight edges network graph if (weightEdges) { + # drop comment id and text edge_df$comment_id <- edge_df$title <- NULL - edge_df <- edge_df %>% dplyr::group_by(.data$from, .data$to) %>% - dplyr::summarise(weight = sum(.data$weight)) %>% dplyr::ungroup() + edge_df %<>% dplyr::group_by(.data$from, .data$to) %>% + dplyr::summarise(weight = sum(.data$weight)) %>% dplyr::ungroup() appendToName <- "Weighted" # include comment text as edge attribute network graph - } else if (includeTextData) { + } else if (textData) { edge_df$weight <- NULL # rename the edge attribute containing the thread comment - edge_df <- edge_df %>% dplyr::rename("vosonTxt_comment" = .data$title) + edge_df %<>% dplyr::rename("vosonTxt_comment" = .data$title) # problem control characters encountered in reddit text # edge_df$vosonTxt_comment <- gsub("[\x01\x05\x18\x19\x1C]", "", edge_df$vosonTxt_comment, perl = TRUE) appendToName <- "Txt" + # remove any characters that are not in punctuation, alphanumeric classes or spaces if (cleanText) { edge_df$vosonTxt_comment <- gsub("[^[:punct:]^[:alnum:]^\\s]", "", edge_df$vosonTxt_comment, perl = TRUE) appendToName <- "CleanTxt" @@ -129,7 +125,7 @@ CreateActorNetwork.reddit <- function(x, weightEdges, includeTextData, cleanText writeOutputFile(g, "graphml", name) } - cat("\nDone!\n") + cat("Done.\n") flush.console() return(g) diff --git a/vosonSML/R/CreateActorNetwork.twitter.R b/vosonSML/R/CreateActorNetwork.twitter.R index 200dd59..7e2afe7 100644 --- a/vosonSML/R/CreateActorNetwork.twitter.R +++ b/vosonSML/R/CreateActorNetwork.twitter.R @@ -1,230 +1,191 @@ +# Create twitter actor network +# +# Creates an actor network from collected tweets. +# +#' @param verbose Logical. Output additional information about the network creation. Default is \code{FALSE}. +#' +#' @note For twitter data, actor networks can be created from multiple data frames (i.e. datasets collected individually +#' using \code{Collect} method. 
Simply create a list of the data frames that you wish to create a network from. +#' For example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)} +#' +#' @return A twitter actor network as list containing a relations dataframe, users dataframe and igraph object. +#' +#' @rdname CreateActorNetwork #' @export -CreateActorNetwork.twitter <- -function(x,writeToFile) -{ - - from=retweet_from=to=edgeType=timeStamp=tweet_id=users_mentioned=reply_to=NULL # to please the gods of R CMD CHECK - - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - - df <- x # match the variable names (this must be used to avoid warnings in package compilation?) +CreateActorNetwork.twitter <- function(x, writeToFile = FALSE, verbose = FALSE, ...) { - # if `df` is a list of dataframes, then need to convert these into one dataframe - # CURRENTLY NOT IMPLEMENTED - there is no method for lists yet. - # suppressWarnings( - # if (class(df)=="list") { - # df <- do.call("rbind", df) - # } - # ) - - # The `hashtags_used` column in `df` causes problems for creating actor network, so delete it: - df <- df[,-21] - - # clear any odd characters - # df <- removeOddChars(df) - - # convert df to data.table + from <- to <- edge_type <- timestamp <- status_id <- NULL + is_retweet <- is_quote <- mentions_user_id <- reply_to_user_id <- NULL + + df <- x df <- data.table(df) - - # Now create the dfActorNetwork1, a dataframe of relations between users - cat("Generating the network...\n") ### DEBUG - flush.console() - - # for speed we will pre-allocate `dataCombined` to a very large size (more rows than needed) - # and after everything is finished we will delete the unused rows - - dataCombined <- data.table( - from = as.character(c(rep("NA_f00",20000000))), - to = as.character(c(rep("NA_f00",20000000))), - edgeType = as.character(c(rep("NA_f00",20000000))), - timeStamp = as.character(c(rep("NA_f00",20000000))), - tweet_id = as.character(c(rep("NA_f00",20000000))) - ) - - setkey(dataCombined,from) # set the key value of the data table - - nextEmptyRow <- 1 # so we can update rows in `dataCombined` in a relatively efficient way - - # We firstly do the retweet data - for (i in 1:nrow(df)) { - - if (is.na(df[i,retweet_from][[1]])) {next} # we check if there are retweets, if not skip to next row - - # nextEmptyRow <- dataCombined[ , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER - - dataCombined[nextEmptyRow, from:= as.character(df$from_user[i][[1]])] - dataCombined[nextEmptyRow, to := as.character(df$retweet_from[i][[1]])] - dataCombined[nextEmptyRow, edgeType := as.character("Retweet")] - dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])] - dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])] - - nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined` - - } - - # Next we do the mentions - for (i in 1:nrow(df)) { - - if (length(df[i,users_mentioned][[1]]) < 1) {next} # we check if there are likes, if not skip to next row - - for (j in 1:length(df$users_mentioned[i][[1]])){ # for each row of the likes data for post i - - # nextEmptyRow <- dataCombined[ , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER - - dataCombined[nextEmptyRow, from := as.character(df$from_user[i][[1]])] - dataCombined[nextEmptyRow, to := as.character(df$users_mentioned[i][[1]][j])] - 
dataCombined[nextEmptyRow, edgeType := as.character("Mention")] - dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])] - dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])] - - nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined` - + + df_stats <- networkStats(NULL, "collected tweets", nrow(df)) + + cat("Generating twitter actor network...\n") + flush.console() + + df_users <- data.frame("user_id" = character(0), "screen_name" = character(0)) + df_users <- rbind(df_users, subset(df, select = c("user_id", "screen_name"), stringsAsFactors = FALSE)) + + # for speed we will pre-allocate dataCombined to a very large size (more rows than needed) + # and after everything is finished we will delete the unused rows + dataCombined <- data.table( + from = as.character(c(rep("NA_f00", 20000000))), + to = as.character(c(rep("NA_f00", 20000000))), + edge_type = as.character(c(rep("NA_f00", 20000000))), #edgeType + timestamp = as.character(c(rep("NA_f00", 20000000))), # timeStamp + status_id = as.character(c(rep("NA_f00", 20000000))) # tweet_id + ) + + setkey(dataCombined, from) # set the key value of the data table + + nextEmptyRow <- 1 # so we can update rows in dataCombined in a relatively efficient way + + ## retweets + # this creates a retweet edge between: + # from (user retweeting) -- retweet --> to (user that tweeted) + count <- 0 + for (i in 1:nrow(df)) { + if ((df[i, is_retweet][[1]] == FALSE) || (is.na(df[i, is_retweet][[1]]))) { next } + + count <- count + 1 + + dataCombined[nextEmptyRow, from := as.character(df$user_id[i][[1]])] + dataCombined[nextEmptyRow, to := as.character(df$retweet_user_id[i][[1]])] + dataCombined[nextEmptyRow, edge_type := as.character("retweet")] + dataCombined[nextEmptyRow, timestamp := as.character(df$created_at[i][[1]])] + dataCombined[nextEmptyRow, status_id := as.character(df$status_id[i][[1]])] + + df_users <- rbind(df_users, list(df$retweet_user_id[i][[1]], df$retweet_screen_name[i][[1]])) + + nextEmptyRow <- nextEmptyRow + 1 + } + df_stats <- networkStats(df_stats, "retweets", count, TRUE) + + ## quotes + # this creates a quote edge between: + # from (user quoting) -- quote --> to (user being quoted) + count <- 0 + for (i in 1:nrow(df)) { + if ((df[i, is_quote][[1]] == FALSE) || (is.na(df[i, is_quote][[1]]))) { next } + + count <- count + 1 + + dataCombined[nextEmptyRow, from := as.character(df$user_id[i][[1]])] + dataCombined[nextEmptyRow, to := as.character(df$quoted_user_id[i][[1]])] + dataCombined[nextEmptyRow, edge_type := as.character("quote")] + dataCombined[nextEmptyRow, timestamp := as.character(df$created_at[i][[1]])] + dataCombined[nextEmptyRow, status_id := as.character(df$status_id[i][[1]])] + + df_users <- rbind(df_users, list(df$quoted_user_id[i][[1]], df$quoted_screen_name[i][[1]])) + + nextEmptyRow <- nextEmptyRow + 1 + } + df_stats <- networkStats(df_stats, "quoting others", count, TRUE) + + # dont create edges for mentions in retweets + # if user retweets and types own text with mentions it becomes a quote tweet + # and these are then counted + if_retweet_inlude_mentions <- FALSE + + ## mentions + # this creates a mention edge between: + # from (user tweeting) -- mention / reply mention --> to (user mentioned) + count <- 0 + mcount <- 0 + rmcount <- 0 + for (i in 1:nrow(df)) { + if ((length(df[i, mentions_user_id][[1]]) < 1) | + (length(df[i, mentions_user_id][[1]]) == 1 & is.na(df[i, mentions_user_id][[1]][[1]])) | + (if_retweet_inlude_mentions == FALSE & df[i, 
is_retweet][[1]] == TRUE)) { + next + } + + count <- count + 1 + + for (j in 1:length(df$mentions_user_id[i][[1]])) { # for each row of the likes data for post i + + etype <- "mention" + if (!is.na(df[i, reply_to_user_id][[1]])) { + # skip reply to actor as have this edge in replies + if (df[i, reply_to_user_id][[1]] == df$mentions_user_id[i][[1]][j]) { + next } - + etype <- "reply mention" + rmcount <- rmcount + 1 + } else { + mcount <- mcount + 1 } - - # Finally, we do the replies data - for (i in 1:nrow(df)) { - - if (is.na(df[i,reply_to][[1]])) {next} # we check if there are retweets, if not skip to next row - - # nextEmptyRow <- dataCombined[ , .I[from_userID=="NA_f00"] ][1] # we get index of the next 'empty' row to put data into # NOT NEEDED NOW, BUT USEFUL FOR LATER - - dataCombined[nextEmptyRow, from:= as.character(df$from_user[i][[1]])] - dataCombined[nextEmptyRow, to := as.character(df$reply_to[i][[1]])] - dataCombined[nextEmptyRow, edgeType := as.character("Reply")] - dataCombined[nextEmptyRow, timeStamp := as.character(df$created_at[i][[1]])] - dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])] - - nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined` - - } - - # we now delete all the rows at the end of `dataCombined` that are unused - dataCombined <- dataCombined[edgeType != "NA_f00"] # we just keep the rows that are unchanged from the original dummy data values - - ## -------------------------------- - - # make a vector of all the unique actors in the network1 - # actorsNames <- unique(c(as.character(dataCombined$from),as.character(dataCombined$to))) - actorsNames <- unique(factor(c(as.character(unique(dataCombined$from)),as.character(unique(dataCombined$to))))) - -# -# # cat(actorsNames) # DEBUG -# -# # Retrieve all the user details (e.g. follower count, number of tweets, etc) and include as node attributes. -# # NOTE: Given API rate limits, the below implementation supports up to 7500 users overall in dataset (150 requests * 50 users per request). -# # NOTE: Future work needs to address the Twitter API rate limit for looking up user information (150 requests per 15 minutes). -# # NOTE: Requesting 50 users at a time seems to avoid rate limit errors (it's a safe bet...). -# -# # This function is supposed to perform the lookups in batches -# # and mind the rate limit: -# getUserObjects <- function(users) { -# groups <- split(users, ceiling(seq_along(users)/50)) -# userObjects <- ldply(groups, function(group) { # ldply is a very cool function, found in plyr package. -# objects <- lookupUsers(group) -# out <- twListToDF(objects) # twListToDF is also a handy function, found in twitteR package. Converts weird class object to data frame. -# # print("Waiting for 15 minutes (to 'refresh' the rate limit)...") # Don't need to use this yet. Implement later for number of users > 7500 (have to do chunked batches... chunks of chunks... 
urrghh) -# # Sys.sleep(900) -# return(out) -# }) -# return(userObjects) -# } -# -# # Putting it into action: -# usersInformationAttributes <- getUserObjects(actorsNames) -# actorsInfoDF <- usersInformationAttributes -# -# # Need to clean the user text collected here (get rid of odd characters): -# # actorsInfoDF <- RemoveOddCharsUserInfo(actorsInfoDF) # uses the new function in v2_munge_tweets.R -# -# # We sometimes have a PROBLEM of missing actors (no info could be retrieved for them - might be misspellings/errors/pun or joke, etc) -# # So, identify which users are missing from original set to retrieved set, -# # then ensure these users/connections are removed before proceeding onwards: -# -# missingActors <- setdiff(actorsNames,usersInformationAttributes$screenName) -# # NOTE: This is a horrible approach, need to optimise. -# missingTemp <- NULL # store the indexes of "offending" edge connections (i.e. bad/missing actors) -# # NOTE: Obviously the 'offending' users can only be found in the 2nd column -# # NOTE: Ipso facto, if they are not real/actual users, then they can't be the source of a directed edge -# -# for (i in 1:length(missingActors)) { -# missingTemp <- c(missingTemp, which(missingActors[i] == dataCombined$to)) -# } -# -# # REMOVE the offendors: -# if(length(missingTemp) > 0) { -# dataCombined <- dataCombined[-missingTemp,] -# } -# -# # REMOVE any duplicated usernames in the retrieved user information (NOT SURE HOW/WHY THIS WOULD OCCUR **NEED TO CHECK**): -# # duplicatedUsers <- which(duplicated(actorsInfoDF$screenName)) -# -# # if(length(duplicatedUsers) > 0) { -# # actorsInfoDF <- actorsInfoDF[-duplicatedUsers,] -# # } -# -# actors <- data.frame( -# name=actorsInfoDF$screenName, -# userDescription=actorsInfoDF$description, -# statusesCount=actorsInfoDF$statusesCount, -# followersCount=actorsInfoDF$followersCount, -# favoritesCount=actorsInfoDF$favoritesCount, -# friendsCount=actorsInfoDF$friendsCount, -# url=actorsInfoDF$url, -# realName=actorsInfoDF$name, -# dateAccountCreated=actorsInfoDF$created, -# userLocation=actorsInfoDF$location, -# userLanguage=actorsInfoDF$lang, -# numberOfListsUserIsFeaturedOn=actorsInfoDF$listedCount, -# profileImageUrl=actorsInfoDF$profileImageUrl -# ) -# -# # actors <- actors[-which(duplicated(actors$name)),] -# # actors <- unique(actors) -# -# # make a dataframe of the relations between actors -# # NOTE - FUTURE WORK: include edge attributes to specify the specific type of "mentions" (see previous comments on temporal network problem (see: approx. LINES 113-116)). 
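# A compact stand-alone sketch of the pre-allocate / fill-by-reference / trim
# pattern used for dataCombined in this function; the placeholder value "NA_f00",
# the ten rows and the two edges below are toy values only.
library(data.table)
edges <- data.table(from = rep("NA_f00", 10), to = rep("NA_f00", 10))
nextRow <- 1
for (pair in list(c("userA", "userB"), c("userA", "userC"))) {
  edges[nextRow, from := pair[1]]
  edges[nextRow, to := pair[2]]
  nextRow <- nextRow + 1
}
edges <- edges[from != "NA_f00"]  # keep only the rows that were filled in
edges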
-# # NOTE - For example, "RETWEET" versus "TWEET TO" (@username specified beginning of tweet) versus "MENTION" (@username specified somewhere else in tweet text) -# -# # return(df) # DEBUG - - relations <- data.frame( - from=dataCombined$from, - to=dataCombined$to, - edgeType=dataCombined$edgeType, - timeStamp=dataCombined$timeStamp, - tweet_id=dataCombined$tweet_id) - - ##### STEP FOUR ##### -# cat("\n I got to the final step before network generation") - - # convert into a graph - # note: suppressing warnings is used to avoid this error: - # In if (class(newval) == "factor") { : - # the condition has length > 1 and only the first element will be used - - suppressWarnings( - g <- graph.data.frame(relations, directed=TRUE, vertices=actorsNames) # used to be vertices=actors (when it collected user data) - ) - - # Make the node labels play nice with Gephi - V(g)$label <- V(g)$name - - if (writeToFile=="TRUE" | writeToFile=="true" | writeToFile=="T" | writeToFile==TRUE) { - # Output the final network to a graphml file, to import directly into Gephi - currTime <- format(Sys.time(), "%b_%d_%X_%Y_%Z") - currTime <- gsub(":","_",currTime) - write.graph(g,paste0(currTime,"_TwitterActorNetwork.graphml"),format="graphml") - cat("Twitter actor network was written to current working directory, with filename:\n") - cat(paste0(currTime,"_TwitterActorNetwork.graphml")) + + dataCombined[nextEmptyRow, from := as.character(df$user_id[i][[1]])] + dataCombined[nextEmptyRow, to := as.character(df$mentions_user_id[i][[1]][j])] + dataCombined[nextEmptyRow, edge_type := as.character(etype)] + dataCombined[nextEmptyRow, timestamp := as.character(df$created_at[i][[1]])] + dataCombined[nextEmptyRow, status_id := as.character(df$status_id[i][[1]])] + + df_users <- rbind(df_users, list(df$mentions_user_id[i][[1]][j], df$mentions_screen_name[i][[1]][j])) + + nextEmptyRow <- nextEmptyRow + 1 } - - cat("\nDone.\n") ### DEBUG - flush.console() - - return(g) - + } + df_stats <- networkStats(df_stats, "mentions", mcount, TRUE) + df_stats <- networkStats(df_stats, "reply mentions", rmcount, TRUE) + + ## replies + # this creates a reply edge between: + # from (user replying) -- reply --> to (user being replied to) + count <- 0 + for (i in 1:nrow(df)) { + if (is.na(df[i, reply_to_user_id][[1]])) { next } # we check if there are retweets, if not skip to next row - reply_to + + count <- count + 1 + + dataCombined[nextEmptyRow, from:= as.character(df$user_id[i][[1]])] + dataCombined[nextEmptyRow, to := as.character(df$reply_to_user_id[i][[1]])] + dataCombined[nextEmptyRow, edge_type := as.character("reply")] + dataCombined[nextEmptyRow, timestamp := as.character(df$created_at[i][[1]])] + dataCombined[nextEmptyRow, status_id := as.character(df$status_id[i][[1]])] + + df_users <- rbind(df_users, list(df$reply_to_user_id[i][[1]], df$reply_to_screen_name[i][[1]])) + + nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in dataCombined + } + df_stats <- networkStats(df_stats, "replies", count, TRUE) + + dataCombined <- dataCombined[edge_type != "NA_f00"] + + # make a vector of all the unique actors in the network + df_users <- unique(df_users) + + df_stats <- networkStats(df_stats, "nodes", nrow(df_users)) + df_stats <- networkStats(df_stats, "edges", sum(df_stats$count[df_stats$edge_count == TRUE])) + + # print stats + if (verbose) { networkStats(df_stats, print = TRUE) } + + df_relations <- data.frame( + from = dataCombined$from, + to = dataCombined$to, + edge_type = dataCombined$edge_type, + timestamp = 
dataCombined$timestamp, + status_id = dataCombined$status_id) + + g <- graph.data.frame(df_relations, directed = TRUE, vertices = df_users) + + V(g)$screen_name <- ifelse(is.na(V(g)$screen_name), paste0("ID:", V(g)$name), V(g)$screen_name) + V(g)$label <- V(g)$screen_name + + if (writeToFile) { writeOutputFile(g, "graphml", "TwitterActorNetwork") } + + cat("Done.\n") + flush.console() + + function_output <- list( + "relations" = df_relations, + "users" = df_users, + "graph" = g + ) + + return(function_output) } diff --git a/vosonSML/R/CreateActorNetwork.youtube.R b/vosonSML/R/CreateActorNetwork.youtube.R index faed427..395ee8b 100644 --- a/vosonSML/R/CreateActorNetwork.youtube.R +++ b/vosonSML/R/CreateActorNetwork.youtube.R @@ -1,19 +1,12 @@ -#' Create YouTube Actor Network -#' -#' Creates a unimodal actor network based on comments and replies to one or more youtube videos. -#' -#' @param x dataframe containing comments data collected and structured by CollectDataYoutube. -#' @param writeToFile boolean, if TRUE then igraph data is saved to a file in the current working directory in -#' graphml format. The file name will contain the current system time. Default is FALSE. +# Create youtube actor network +# +# Creates a unimodal actor network based on comments and replies to one or more youtube videos. +# +#' @return A youtube actor network as igraph object. #' -#' @return igraph object containing the actor network with edge attribute comment id -#' -#' @noRd -CreateActorNetwork.youtube <- function(x, writeToFile) { - - if (missing(writeToFile)) { - writeToFile <- FALSE - } +#' @rdname CreateActorNetwork +#' @export +CreateActorNetwork.youtube <- function(x, writeToFile = FALSE, ...) { df_comments <- x # match the variable names to avoid warnings in package compilation @@ -22,12 +15,14 @@ CreateActorNetwork.youtube <- function(x, writeToFile) { # 2 User 5 PublishTime 8 ReplyToAnotherUser # 3 ReplyCount 6 CommentId 9 VideoID + cat("Generating youtube actor network...\n") + flush.console() + if (nrow(df_comments) == 0) { - cat(paste0("\nOops! There are no user comments to make a network from.\nPlease find video(s) where users have", - " commented on a video or to each other.\nReturning...\n")) - return() + stop(paste0("There are no user comments to make a network from, please check that the videos selected ", + "for collection have comments.\n"), call. = FALSE) } - + # direct comments which are not replies to others to a video id node # in the graph the video nodes will appear as VIDEO:AbCxYz where AbCxYz is the id not_replies <- which(df_comments$ReplyToAnotherUser == "FALSE" & df_comments$ParentID == "None") @@ -54,11 +49,9 @@ CreateActorNetwork.youtube <- function(x, writeToFile) { V(g)$label <- V(g)$name # output the final network to a graphml file - if (isTrueValue(writeToFile)) { - writeOutputFile(g, "graphml", "YoutubeActorNetwork") - } + if (writeToFile) { writeOutputFile(g, "graphml", "YoutubeActorNetwork") } - cat("\nDone!\n") + cat("Done.\n") flush.console() return(g) diff --git a/vosonSML/R/CreateBimodalNetwork.R b/vosonSML/R/CreateBimodalNetwork.R index 03b22e7..116f9c5 100644 --- a/vosonSML/R/CreateBimodalNetwork.R +++ b/vosonSML/R/CreateBimodalNetwork.R @@ -1,102 +1,56 @@ -#' Note: this function is DEPRECATED and will be removed in a future release. -#' Please use the \code{Create} function -#' #' Create bimodal networks from social media data #' -#' This function creates a bimodal network from social media data (i.e. 
from -#' data frames of class \code{dataSource}, or for Twitter data it is also -#' possible to provide a *list* of data frames), with edges representing -#' relationships between actors of two different types (e.g. Facebook users and -#' Facebook posts, with edges representing whether a user has commented or -#' 'liked' a post). -#' -#' This function creates a (directed and weighted) bimodal network from a data -#' frame of class \code{dataSource} (which are created using the `CollectData` -#' family of functions in the vosonSML package), or a *list* of Twitter -#' data frames collected using \code{CollectDataTwitter} function. -#' -#' The resulting network is an igraph graph object. This graph object is -#' bimodal because edges represent relationships between vertices of two -#' different types. For example, in a bimodal Facebook network, vertices -#' represent Facebook users or Facebook posts, and edges represent whether a -#' user has commented or 'liked' a post. Edges are directed and weighted (e.g. -#' if user i has commented n times on post j, then the weight of this directed -#' edge equals n). -#' -#' @param x a data frame of class \code{dataSource}. For Twitter data, it is -#' also possible to provide a *list* of data frames (i.e. data frames that -#' inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter -#' data frames are supported at this time. If a list of data frames is -#' provided, then the function binds these row-wise and computes over the -#' entire data set. -#' @param writeToFile logical. If \code{TRUE} then the network is saved to file -#' in current working directory (GRAPHML format), with filename denoting the -#' current date/time and the type of network. -#' @param removeTermsOrHashtags character vector. Default is none. Otherwise -#' this argument specifies which terms or hashtags (i.e. vertices with matching -#' `name`) should be removed from the bimodal network. This is useful to remove -#' the search term or hashtag that was used to collect the data (i.e. remove -#' the corresponding vertex in the graph). For example, a value of "#auspol" -#' means that if there is a vertex with the exact name "#auspol" then this -#' vertex will be removed. +#' This function creates a bimodal network from social media data (i.e. from data frames of class \code{dataSource}, or +#' for Twitter data it is also possible to provide a *list* of data frames), with edges representing relationships +#' between actors of two different types (e.g. Facebook users and Facebook posts, with edges representing whether a +#' user has commented or 'liked' a post). +#' +#' This function creates a (directed and weighted) bimodal network from a data frame of class \code{dataSource} (which +#' are created using the 'CollectData' family of functions in the vosonSML package), or a *list* of Twitter data +#' frames collected using \code{CollectDataTwitter} function. +#' +#' The resulting network is an igraph graph object. This graph object is bimodal because edges represent relationships +#' between vertices of two different types. For example, in a bimodal Facebook network, vertices represent Facebook +#' users or Facebook posts, and edges represent whether a user has commented or 'liked' a post. Edges are directed and +#' weighted (e.g. if user i has commented n times on post j, then the weight of this directed edge equals n). +#' +#' @param x A data frame of class \code{dataSource}. For Twitter data, it is also possible to provide a *list* of data +#' frames (i.e. 
data frames that inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter data +#' frames are supported at this time. If a list of data frames is provided, then the function binds these row-wise and +#' computes over the entire data set. +#' @param writeToFile Logical. If \code{TRUE} then the network is saved to file in current working directory (GRAPHML +#' format), with filename denoting the current date/time and the type of network. +#' @param removeTermsOrHashtags Character string. Default is none. Otherwise this argument specifies which terms or +#' hashtags (i.e. vertices with matching 'name') should be removed from the bimodal network. This is useful to remove +#' the search term or hashtag that was used to collect the data (i.e. remove the corresponding vertex in the graph). +#' For example, a value of "#auspol" means that if there is a vertex with the exact name "#auspol" then this vertex +#' will be removed. +#' @param ... Additional parameters to pass to the network creation method. +#' #' @return An igraph graph object, with weighted and directed edges. -#' @note Not all data sources in vosonSML can be used for creating -#' bimodal networks. -#' -#' Currently supported data sources are: -#' -#' - Facebook - Twitter -#' -#' Other data sources (e.g. YouTube) will be implemented in the future. -#' Additionally, the user is notified if they try to create bimodal networks -#' for incompatible data sources. -#' -#' For Twitter data, bimodal networks can be created from multiple data frames -#' (i.e. datasets collected individually using CollectDataTwitter). Simply -#' create a list of the data frames that you wish to create a network from. For -#' example, \code{myList <- list(myTwitterData1, myTwitterData2, -#' myTwitterData3)}. -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso See \code{CollectDataFacebook} and \code{CollectDataTwitter} to -#' collect data for creating bimodal networks in vosonSML. +#' +#' @note Supported data sources: \code{facebook}, \code{twitter} +#' +#' For Twitter data, bimodal networks can be created from multiple data frames (i.e. datasets collected individually +#' using CollectDataTwitter). Simply create a list of the data frames that you wish to create a network from. For +#' example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)}. 
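# A self-contained sketch of the weighted, directed bimodal structure described
# above: users and hashtags as the two vertex types, with edge weight equal to
# the number of times a user tweeted the hashtag. Toy data, not collected tweets.
library(igraph)
relations <- data.frame(from = c("user1", "user1", "user2"),
                        to = c("#auspol", "#auspol", "#metoo"),
                        stringsAsFactors = FALSE)
g <- graph_from_data_frame(relations, directed = TRUE)
E(g)$weight <- 1
g <- simplify(g, edge.attr.comb = list(weight = "sum"))
E(g)$weight  # the two user1 -> #auspol edges collapse to one edge of weight 2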
+#' +#' @seealso \code{CollectDataFacebook}, \code{CollectDataTwitter} #' @keywords SNA bimodal network igraph social media -#' @examples -#' -#' \dontrun{ -#' ## This example shows how to collect Facebook page data and create a bimodal network -#' -#' # Use your own values for myAppID and myAppSecret -#' myAppID <- "123456789098765" -#' myAppSecret <- "abc123abc123abc123abc123abc123ab" -#' -#' # Authenticate with the Facebook API using `AuthenticateWithFacebookAPI` -#' fb_oauth <- AuthenticateWithFacebookAPI(appID=myAppID, appSecret=myAppSecret, -#' extended_permissions=FALSE, useCachedToken=TRUE) -#' -#' # Run the `CollectDataFacebook` function and store the results in variable `myFacebookData` -#' myFacebookData <- CollectDataFacebook(pageName="StarWars", rangeFrom="2014-05-15", -#' rangeTo="2014-06-03",writeToFile=FALSE,verbose=TRUE) -#' -#' # Create a 'bimodal' network using \code{CreateBimodalNetwork} -#' g_bimodal_facebook <- CreateBimodalNetwork(myFacebookData) -#' -#' # View descriptive information about the bimodal network -#' g_bimodal_facebook -#' } -#' -CreateBimodalNetwork <- -function(x,writeToFile,removeTermsOrHashtags) - { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - if (!missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) #coerce to vector... to be sure - } - - if (missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- "foobar" - } - UseMethod("CreateBimodalNetwork",x) +#' +CreateBimodalNetwork <- function(x, writeToFile, removeTermsOrHashtags, ...) { + + if (missing(writeToFile)) { + writeToFile <- FALSE + } + + if (!missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) # coerce to vector to be sure + } + + if (missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- "foobar" } + + UseMethod("CreateBimodalNetwork", x) +} diff --git a/vosonSML/R/CreateBimodalNetwork.default.R b/vosonSML/R/CreateBimodalNetwork.default.R index 7dbc3ed..7582f62 100644 --- a/vosonSML/R/CreateBimodalNetwork.default.R +++ b/vosonSML/R/CreateBimodalNetwork.default.R @@ -1,18 +1,20 @@ -CreateBimodalNetwork.default <- -function(x,writeToFile,removeTermsOrHashtags, ...) - { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - if (!missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) #coerce to vector... to be sure - } - - if (missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- "foobar" - } - cat("Error. Cannot create bimodal network using this type of data (see help file for data types and sources).\n") - # if (inherits(x,"temporal")) { - # cat("(The data you supplied is temporal. Please use the `CreateDynamicNetwork` function for temporal data.)\n") - # } +CreateBimodalNetwork.default <- function(x, writeToFile, removeTermsOrHashtags, ...) { + + if (missing(writeToFile)) { + writeToFile <- FALSE } + + if (!missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) # coerce to vector to be sure + } + + if (missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- "foobar" + } + + cat("Error. Cannot create bimodal network using this type of data (see help file for data types and sources).\n") + + # if (inherits(x, "temporal")) { + # cat("(The data you supplied is temporal. 
Please use the 'CreateDynamicNetwork' function for temporal data.)\n") + # } +} diff --git a/vosonSML/R/CreateBimodalNetwork.twitter.R b/vosonSML/R/CreateBimodalNetwork.twitter.R index dc380ad..7f15993 100644 --- a/vosonSML/R/CreateBimodalNetwork.twitter.R +++ b/vosonSML/R/CreateBimodalNetwork.twitter.R @@ -1,106 +1,108 @@ -#' @export -CreateBimodalNetwork.twitter <- -function(x,writeToFile,removeTermsOrHashtags) -{ - from=to=edgeType=timeStamp=tweet_id=NULL # to please the gods of R CMD CHECK - - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - +CreateBimodalNetwork.twitter <- function(x, writeToFile = FALSE, removeTermsOrHashtags, verbose = FALSE) { + + from <- to <- edge_type <- timestamp <- status_id <- NULL + if (!missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) #coerce to vector... to be sure + removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) # coerce to vector to be sure } - + if (missing(removeTermsOrHashtags)) { removeTermsOrHashtags <- "#fake_hashtag_foobar42_1234567890" } - - df <- x # match the variable names (this must be used to avoid warnings in package compilation) - - # convert df to data.table + + df <- x df <- data.table(df) - # Now create the dfBimodalNetwork2, a dataframe of relations between users and hashtags (i.e. user i "tweeted" hashtag j) + df_stats <- networkStats(NULL, "collected tweets", nrow(df)) - print("Generating Twitter bimodal network...") ### DEBUG + # create dfBimodalNetwork2, a dataframe of relations between users and hashtags (i.e. user i "tweeted" hashtag j) + cat("Generating twitter bimodal network...\n") flush.console() - #### ----- NEW WAY --------- + df_entities <- data.table("entity_id" = character(0), "display_name" = character(0)) - # for speed we will pre-allocate `dataCombined` to a very large size (more rows than needed) + # for speed we will pre-allocate dataCombined to a very large size (more rows than needed) # and after everything is finished we will delete the unused rows - dataCombined <- data.table( - from = as.character(c(rep("NA_f00",20000000))), - to = as.character(c(rep("NA_f00",20000000))), - edgeType = as.character(c(rep("NA_f00",20000000))), - timeStamp = as.character(c(rep("NA_f00",20000000))), - tweet_id = as.character(c(rep("NA_f00",20000000))) + from = as.character(c(rep("NA_f00", 20000000))), + to = as.character(c(rep("NA_f00", 20000000))), + edge_type = as.character(c(rep("NA_f00", 20000000))), + timestamp = as.character(c(rep("NA_f00", 20000000))), + status_id = as.character(c(rep("NA_f00", 20000000))) ) - - setkey(dataCombined,from) # set the key value of the data table - - nextEmptyRow <- 1 # so we can update rows in `dataCombined` in a relatively efficient way - - # We only need to do the 'hashtag' data (currently) + + setkey(dataCombined, from) # set the key value of the data table + + nextEmptyRow <- 1 # so we can update rows in 'dataCombined' in a relatively efficient way + + # we only need to do the 'hashtag' data (currently) + count <- 0 + hashtag_count <- 0 for (i in 1:nrow(df)) { - - if (length(df$hashtags_used[[i]]) > 0) { # skip any rows where no hashtags were used - - for (j in 1:length(df$hashtags_used[[i]])) { # for each hashtag in list - - dataCombined[nextEmptyRow, from:= as.character(df$from_user[i][[1]])] - dataCombined[nextEmptyRow, to := as.character(df$hashtags_used[[i]][j])] - dataCombined[nextEmptyRow, edgeType := as.character("Used_hashtag")] - dataCombined[nextEmptyRow, timeStamp := 
as.character(df$created_at[i][[1]])] - dataCombined[nextEmptyRow, tweet_id := as.character(df$id[i][[1]])] - + if (length(df$hashtags[[i]]) > 0) { # skip any rows where no hashtags were used # hashtags_used + if (length(df$hashtags[[i]]) == 1 & is.na(df$hashtags[[i]][1])) { + next + } + + count <- count + 1 + df_entities <- rbind(df_entities, list(df$user_id[i][[1]], df$screen_name[i][[1]]), stringsAsFactors = FALSE) + + for (j in 1:length(df$hashtags[[i]])) { # for each hashtag in list + + tag <- paste0("#", df$hashtags[[i]][j]) + + dataCombined[nextEmptyRow, from:= as.character(df$user_id[i][[1]])] + dataCombined[nextEmptyRow, to := as.character(tag)] + dataCombined[nextEmptyRow, edge_type := as.character("hashtag")] + dataCombined[nextEmptyRow, timestamp := as.character(df$created_at[i][[1]])] + dataCombined[nextEmptyRow, status_id := as.character(df$status_id[i][[1]])] + + df_entities <- rbind(df_entities, list(tag, tag), stringsAsFactors = FALSE) + + hashtag_count = hashtag_count + 1 nextEmptyRow <- nextEmptyRow + 1 # increment the row to update in `dataCombined` - } } } - - # we now delete all the rows at the end of `dataCombined` that are unused - dataCombined <- dataCombined[edgeType != "NA_f00"] # we just keep the rows that are unchanged from the original dummy data values - - # make a vector of all the unique actors in the network1 - actorsNames <- unique(factor(c(as.character(unique(dataCombined$from)),as.character(unique(dataCombined$to))))) - + df_stats <- networkStats(df_stats, "tweets with hashtags", count, TRUE) + df_stats <- networkStats(df_stats, "hashtags", hashtag_count, TRUE) + + dataCombined <- dataCombined[edge_type != "NA_f00"] + + df_entities <- unique(df_entities) + + df_stats <- networkStats(df_stats, "nodes", nrow(df_entities)) + df_stats <- networkStats(df_stats, "edges", sum(df_stats$count[df_stats$edge_count == TRUE])) + if (verbose) { + networkStats(df_stats, print = TRUE) + } + relations <- data.frame( - from=dataCombined$from, - to=dataCombined$to, - edgeType=dataCombined$edgeType, - timeStamp=dataCombined$timeStamp, - tweet_id=dataCombined$tweet_id) - - suppressWarnings( - g <- graph.data.frame(relations, directed=TRUE, vertices=actorsNames) # used to be vertices=actors (when it collected user data) - ) - - # Make the node labels play nice with Gephi - V(g)$label <- V(g)$name - + from = dataCombined$from, + to = dataCombined$to, + edge_type = dataCombined$edge_type, + timestamp = dataCombined$timestamp, + status_id = dataCombined$status_id) + + g <- graph.data.frame(relations, directed = TRUE, vertices = df_entities) + + V(g)$display_name <- ifelse(is.na(V(g)$display_name), paste0("ID:", V(g)$name), V(g)$display_name) + # remove the search term / hashtags, if user specified it: - if (removeTermsOrHashtags[1]!="#fake_hashtag_foobar42_1234567890") { - toDel <- match(tolower(removeTermsOrHashtags),V(g)$name) # we force to lowercase because all terms/hashtags are already converted to lowercase - toDel <- toDel[!is.na(toDel)] # in case of user error (i.e. 
trying to delete terms/hashtags that don't exist in the data) - g <- delete.vertices(g, toDel) - } - - if (writeToFile=="TRUE" | writeToFile=="true" | writeToFile=="T" | writeToFile==TRUE) { - # Output the final network to a graphml file, to import directly into Gephi - currTime <- format(Sys.time(), "%b_%d_%X_%Y_%Z") - currTime <- gsub(":","_",currTime) - write.graph(g,paste0(currTime,"_TwitterBimodalNetwork.graphml"),format="graphml") - cat("Twitter bimodal network was written to current working directory, with filename:\n") - cat(paste0(currTime,"_TwitterBimodalNetwork.graphml")) + if (removeTermsOrHashtags[1] != "#fake_hashtag_foobar42_1234567890") { + # we force to lowercase because all terms/hashtags are already converted to lowercase + toDel <- match(tolower(removeTermsOrHashtags), V(g)$name) + # in case of user error (i.e. trying to delete terms/hashtags that don't exist in the data) + toDel <- toDel[!is.na(toDel)] + g <- delete.vertices(g, toDel) } - - cat("\nDone\n") ### DEBUG + + V(g)$label <- V(g)$display_name + + if (writeToFile) { writeOutputFile(g, "graphml", "TwitterBimodalNetwork") } + + cat("Done.\n") flush.console() - + return(g) - } diff --git a/vosonSML/R/CreateSemanticNetwork.R b/vosonSML/R/CreateSemanticNetwork.R index 4d6a7fa..1f46ed6 100644 --- a/vosonSML/R/CreateSemanticNetwork.R +++ b/vosonSML/R/CreateSemanticNetwork.R @@ -1,122 +1,80 @@ -#' Note: this function is DEPRECATED and will be removed in a future release. -#' Please use the \code{Create} function +#' Creates a semantic network from social media data (semantic relationships between concepts) #' -#' Create semantic networks from social media data (semantic relationships -#' between concepts) +#' This function creates a semantic network from social media data (i.e. from data frames of class \code{dataSource}, +#' or for Twitter data it is also possible to provide a list of data frames). In such semantic networks, concepts are +#' words/terms extracted from the text corpus of social media data (e.g. tweets on Twitter). #' -#' This function creates a semantic network from social media data (i.e. from -#' data frames of class \code{dataSource}, or for Twitter data it is also -#' possible to provide a list of data frames). In such semantic networks, -#' concepts are words/terms extracted from the text corpus of social media data -#' (e.g. tweets on Twitter). +#' This function creates a weighted network from a data frame of class \code{dataSource} (which are created using the +#' 'CollectData' family of functions in the vosonSML package), or a list of Twitter data frames collected using +#' \code{CollectDataTwitter} function. #' -#' This function creates a weighted network from a data frame of class -#' \code{dataSource} (which are created using the `CollectData` family of -#' functions in the vosonSML package), or a list of Twitter data frames -#' collected using \code{CollectDataTwitter} function. +#' The resulting semantic network is an igraph graph object. This graph object is semantic because vertices represent +#' unique concepts (in this case unique terms/words extracted from a social media text corpus), and edges represent +#' the co-occurrence of terms for all observations in the data set. For example, for a Twitter semantic network, +#' vertices represent either hashtags (e.g. "#auspol") or single terms ("politics"). If there are 1500 tweets in the +#' data set (i.e. 
1500 observations), and the term "#auspol" and the term "politics" appear together in every tweet, +#' then this will be represented by an edge with weight equal to 1500. #' -#' The resulting semantic network is an igraph graph object. This graph object -#' is semantic because vertices represent unique concepts (in this case unique -#' terms/words extracted from a social media text corpus), and edges represent -#' the co-occurrence of terms for all observations in the data set. For -#' example, for a Twitter semantic network, vertices represent either hashtags -#' (e.g. "#auspol") or single terms ("politics"). If there are 1500 tweets in -#' the data set (i.e. 1500 observations), and the term "#auspol" and the term -#' "politics" appear together in every tweet, then this will be represented by -#' an edge with weight equal to 1500. -#' -#' @param x a data frame of class \code{dataSource}. For Twitter data, it is -#' also possible to provide a *list* of data frames (i.e. data frames that -#' inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter -#' data frames are supported at this time. If a list of data frames is -#' provided, then the function binds these row-wise and computes over the -#' entire data set. -#' @param writeToFile logical. If \code{TRUE} then the network is saved to file -#' in current working directory (GRAPHML format), with filename denoting the -#' current date/time and the type of network. -#' @param termFreq numeric integer, specifying the percentage of most frequent -#' TERMS to include. For example, a value of 20 means that the 20 percent most -#' frequently occurring terms will be included in the semantic network. The +#' @param x A data frame of class \code{dataSource}. For Twitter data, it is also possible to provide a *list* of data +#' frames (i.e. data frames that inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter data +#' frames are supported at this time. If a list of data frames is provided, then the function binds these row-wise and +#' computes over the entire data set. +#' @param writeToFile Logical. If \code{TRUE} then the network is saved to file in current working directory (GRAPHML +#' format), with filename denoting the current date/time and the type of network. +#' @param termFreq Numeric integer. Specifies the percentage of most frequent TERMS to include. For example, a value +#' of 20 means that the 20 percent most frequently occurring terms will be included in the semantic network. The #' default value is 5, meaning the 5 percent most frequent terms are used. #' @param hashtagFreq ** NOT IMPLEMENTED YET - DEFAULTS TO ALL HASHTAGS **. -#' numeric integer, specifying the percentage of most frequent HASHTAGS to -#' include. For example, a value of 80 means that the 80 percent most frequently -#' occurring hashtags will be included in the semantic network. The default -#' value is 50, meaning the 50 percent most frequent hashtags are used. -#' @param removeTermsOrHashtags character vector. Default is none. Otherwise -#' this argument specifies which terms or hashtags (i.e. vertices with matching -#' `name`) should be removed from the semantic network. This is useful to -#' remove the search term or hashtag that was used to collect the data (i.e. -#' remove the corresponding vertex in the graph). For example, a value of -#' "#auspol" means that if there is a vertex with the name "#auspol" then this -#' vertex will be removed. -#' @param stopwordsEnglish logical. 
If \code{TRUE} then English stopwords are -#' removed from the tweets (e.g. words such as 'the' or 'and'). Using -#' \code{FALSE} may be helpful non-English data sets. The default is -#' \code{TRUE} (i.e. stopwords will be removed). +#' Numeric integer. Specifies the percentage of most frequent HASHTAGS to include. For example, a value of 80 means +#' that the 80 percent most frequently occurring hashtags will be included in the semantic network. The default value +#' is 50, meaning the 50 percent most frequent hashtags are used. +#' @param removeTermsOrHashtags Character string vector. Default is none. Otherwise this argument specifies which terms +#' or hashtags (i.e. vertices with matching 'name') should be removed from the semantic network. This is useful to +#' remove the search term or hashtag that was used to collect the data (i.e. remove the corresponding vertex in the +#' graph). For example, a value of "#auspol" means that if there is a vertex with the name "#auspol" then this vertex +#' will be removed. +#' @param stopwordsEnglish Logical. If \code{TRUE} then English stopwords are removed from the tweets (e.g. words such +#' as 'the' or 'and'). Using \code{FALSE} may be helpful non-English data sets. The default is \code{TRUE} (i.e. +#' stopwords will be removed). +#' @param ... Additional parameters to pass to the network creation method. +#' #' @return An igraph graph object, with weighted edges. -#' @note Not all data sources in vosonSML can be used for creating -#' semantic networks. -#' -#' Currently supported data sources are: -#' -#' - Twitter -#' -#' Other data sources (e.g. YouTube and Facebook) will be implemented in the -#' future. Additionally, the user is notified if they try to create semantic -#' networks for incompatible data sources. -#' -#' For Twitter data, semantic networks can be created from multiple data frames -#' (i.e. datasets collected individually using CollectDataTwitter). Simply -#' create a list of the data frames that you wish to create a network from. For -#' example, \code{myList <- list(myTwitterData1, myTwitterData2, -#' myTwitterData3)}. -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso See \code{CollectDataTwitter} to collect data for creating semantic -#' networks in vosonSML. 
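# A self-contained sketch of the term co-occurrence weighting described above:
# every pair of terms appearing in the same tweet adds one to that edge's weight.
# The three term vectors below stand in for tweets that have already been tokenised.
library(igraph)
tweet_terms <- list(c("#auspol", "politics"),
                    c("#auspol", "politics"),
                    c("#auspol", "budget", "politics"))
pairs <- do.call(rbind, lapply(tweet_terms, function(terms) t(combn(terms, 2))))
g <- graph_from_data_frame(as.data.frame(pairs, stringsAsFactors = FALSE), directed = FALSE)
E(g)$weight <- 1
g <- simplify(g, edge.attr.comb = list(weight = "sum"))
as_data_frame(g)  # the "#auspol" -- "politics" edge has weight 3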
-#' @keywords SNA semantic network igraph social media -#' @examples -#' -#' \dontrun{ -#' ## This example shows how to collect Twitter data and create a semantic network -#' -#' # Firstly specify your API credentials -#' my_api_key <- "1234567890qwerty" -#' my_api_secret <- "1234567890qwerty" -#' my_access_token <- "1234567890qwerty" -#' my_access_token_secret <- "1234567890qwerty" -#' -#' # Authenticate with the Twitter API using \code{AuthenticateWithTwitterAPI} -#' AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, -#' access_token=my_access_token, access_token_secret=my_access_token_secret) -#' -#' # Collect tweets data using \code{myTwitterData} -#' myTwitterData <- CollectDataTwitter(searchTerm="#auspol", -#' numTweets=200,writeToFile=FALSE,verbose=FALSE) -#' -#' # Create a 'semantic' network using \code{CreateSemanticNetwork} -#' g_semantic_twitter <- CreateSemanticNetwork(myTwitterData,writeToFile=FALSE, -#' termFreq=20,hashtagFreq=80) +#' +#' @note Currently supported data sources: +#' \itemize{ +#' \item \code{twitter} #' } #' -CreateSemanticNetwork <- -function(x,writeToFile,termFreq,hashtagFreq,removeTermsOrHashtags,stopwordsEnglish) - { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - if (missing(termFreq)) { - termFreq <- 5 # default to the top 5% most frequent terms. reduces size of graph. - } - if (missing(hashtagFreq)) { - hashtagFreq <- 50 # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less frequent than terms. - } - if (missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- NA - } - if (missing(stopwordsEnglish)) { - stopwordsEnglish <- TRUE # default to true, because most English users will probably want this - } - UseMethod("CreateSemanticNetwork",x) - } +#' For Twitter data, semantic networks can be created from multiple data frames (i.e. datasets collected individually +#' using CollectDataTwitter). Simply create a list of the data frames that you wish to create a network from. For +#' example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)}. +#' +#' @seealso \code{CollectDataTwitter} +#' @keywords SNA semantic network igraph social media +#' +CreateSemanticNetwork <- function(x, writeToFile, termFreq, hashtagFreq, removeTermsOrHashtags, stopwordsEnglish, ...) { + + if (missing(writeToFile)) { + writeToFile <- FALSE + } + + if (missing(termFreq)) { + termFreq <- 5 # default to the top 5% most frequent terms. reduces size of graph. + } + + if (missing(hashtagFreq)) { + # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less frequent + # than terms. + hashtagFreq <- 50 + } + + if (missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- NA + } + + if (missing(stopwordsEnglish)) { + stopwordsEnglish <- TRUE # default to true, because most English users will probably want this + } + + UseMethod("CreateSemanticNetwork", x) +} diff --git a/vosonSML/R/CreateSemanticNetwork.default.R b/vosonSML/R/CreateSemanticNetwork.default.R index de8a5f3..156560d 100644 --- a/vosonSML/R/CreateSemanticNetwork.default.R +++ b/vosonSML/R/CreateSemanticNetwork.default.R @@ -1,20 +1,26 @@ -CreateSemanticNetwork.default <- -function(x,writeToFile,termFreq,hashtagFreq,removeTermsOrHashtags) - { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - if (missing(termFreq)) { - termFreq <- 5 # default to the top 5% most frequent terms. reduces size of graph. 
- } - if (missing(hashtagFreq)) { - hashtagFreq <- 50 # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less frequent than terms. - } - if (missing(removeTermsOrHashtags)) { - removeTermsOrHashtags <- NA - } - cat("Error. Cannot create semantic network using this type of data (see help file for data types and sources).\n") - if (inherits(x,"temporal")) { - cat("(The data you supplied is temporal. Please use the `CreateDynamicNetwork` function for temporal data.)\n") - } +CreateSemanticNetwork.default <- function(x, writeToFile, termFreq, hashtagFreq, removeTermsOrHashtags) { + + if (missing(writeToFile)) { + writeToFile <- FALSE # default = not write to file } + + if (missing(termFreq)) { + termFreq <- 5 # default to the top 5% most frequent terms. reduces size of graph. + } + + if (missing(hashtagFreq)) { + # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less frequent + # than terms. + hashtagFreq <- 50 + } + + if (missing(removeTermsOrHashtags)) { + removeTermsOrHashtags <- NA + } + + cat("Error. Cannot create semantic network using this type of data (see help file for data types and sources).\n") + + if (inherits(x, "temporal")) { + cat("(The data you supplied is temporal. Please use the `CreateDynamicNetwork` function for temporal data.)\n") + } +} diff --git a/vosonSML/R/CreateSemanticNetwork.twitter.R b/vosonSML/R/CreateSemanticNetwork.twitter.R index 0e26f45..3e7c767 100644 --- a/vosonSML/R/CreateSemanticNetwork.twitter.R +++ b/vosonSML/R/CreateSemanticNetwork.twitter.R @@ -1,259 +1,240 @@ -#' @export -CreateSemanticNetwork.twitter <- -function(x,writeToFile,termFreq,hashtagFreq,removeTermsOrHashtags,stopwordsEnglish) -{ +CreateSemanticNetwork.twitter <- function(x, writeToFile = FALSE, termFreq = 5, hashtagFreq = 50, + removeTermsOrHashtags, stopwordsEnglish = TRUE, verbose = FALSE) { - if (missing(writeToFile)) { - writeToFile <- FALSE # default = not write to file - } - - if (missing(stopwordsEnglish)) { - stopwordsEnglish <- TRUE # default to true, because most English users will probably want this - } - - if (missing(termFreq)) { - termFreq <- 5 # default to the top 5% most frequent terms. reduces size of graph. - } - - if (missing(hashtagFreq)) { - hashtagFreq <- 50 # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less frequent than terms. - } + # default to the top 5% most frequent terms. reduces size of graph + # default to the top 50% hashtags. reduces size of graph. hashtags are 50% because they are much less + # frequent than terms. if (!missing(removeTermsOrHashtags)) { removeTermsOrHashtags <- as.vector(removeTermsOrHashtags) #coerce to vector... to be sure - } - - if (missing(removeTermsOrHashtags)) { + } else { removeTermsOrHashtags <- "foobar" } - - + df <- x # match the variable names (this must be used to avoid warnings in package compilation) - + # if `df` is a list of dataframes, then need to convert these into one dataframe suppressWarnings( - if (class(df)=="list") { - df <- do.call("rbind", df) - } - ) + if (class(df) == "list") { + df <- do.call("rbind", df) + }) - EnsurePackage("igraph") - - # Now create the dfSemanticNetwork3, - # a dataframe of relations between hashtags and terms - # (i.e. 
hashtag i and term j both occurred in same tweet - # (weight = n occurrences)) - - print("Generating Twitter semantic network...") ### DEBUG - flush.console() - - # convert the hashtags to lowercase here (before using tm_map later) - # but first deal with character encoding: - macMatch <- grep("darwin",R.Version()$os) - if (length(macMatch)!=0) { - # df$hashtags_used <- iconv(df$hashtags_used,to="utf-8-mac") - df$hashtags_used <- lapply(df$hashtags_used, function(x) TrimOddCharMac(x)) - } - if (length(macMatch)==0) { - df$hashtags_used <- lapply(df$hashtags_used, function(x) TrimOddChar(x)) - } - # ... and then convert to lowercase: - df$hashtags_used <- lapply(df$hashtags_used,tolower) - - # do the same for the comment text, but first deal with character encoding! - # we need to change value of `to` argument in `iconv` depending on OS, or else errors can occur - macMatch <- grep("darwin",R.Version()$os) - if (length(macMatch)!=0) { - df$text <- iconv(df$text,to="utf-8-mac") - } - if (length(macMatch)==0) { - df$text <- iconv(df$text,to="utf-8") - } - # ... and then convert to lowercase: - df$text <- tolower(df$text) - - hashtagsUsedTemp <- c() # temp var to store output - - # The 'hashtags_used' column in the 'df' dataframe - # is slightly problematic (i.e. not straightforward) - # because each cell in this column contains a - # LIST, itself containing 1 or more char vectors - # (which are unique hashtags found in the tweet text; empty if no hashtags used). - # So, need to extract each list item out, - # and put it into its own row in a new dataframe: - - for (i in 1:nrow(df)) { - if (length(df$hashtags_used[[i]]) > 0) { # skip any rows where NO HASHTAGS were used - for (j in 1:length(df$hashtags_used[[i]])) { - #commonTermsTemp <- c(commonTermsTemp, df$from_user[i]) - hashtagsUsedTemp <- c(hashtagsUsedTemp,df$hashtags_used[[i]][j]) - } - } - } # NOTE: try and vectorise this in future work to improve speed. - - hashtagsUsedTemp <- unique(hashtagsUsedTemp) - -### delete hashtags that contain 'horizontal ellipses' - - # delEllipses <- grep("\u2026",hashtagsUsedTemp) - # cat(paste("\nNumber of hashtags with ellipses: ",length(delEllipses),"\n")) - # cat(paste("\nThe offending hashtags:\n",hashtagsUsedTemp[delEllipses],"\n")) - # cat("Original:\n") - # cat(hashtagsUsedTemp) - # hashtagsUsedTemp <- hashtagsUsedTemp[-delEllipses] - # cat("Fixed:\n") - # cat(hashtagsUsedTemp) - -######## - - hashtagsUsedTempFrequency <- c() - # potentially do not want EVERY hashtag - just the top N% (most common): - for (i in 1: length(hashtagsUsedTemp)) { - hashtagsUsedTempFrequency[i] <- length(grep(hashtagsUsedTemp[i],df$text)) - } - mTemp <- cbind(hashtagsUsedTemp, hashtagsUsedTempFrequency) - mTemp2 <- as.matrix(as.numeric(mTemp[,2])) - names(mTemp2) <- mTemp[,1] - vTemp <- sort(mTemp2, decreasing=TRUE) - hashtagsUsedTemp <- names(head(vTemp, (length(vTemp) / 100) * hashtagFreq)) - ################################ ^^^^ this defaults to top 50% hashtags - - # we need to remove all punctuation EXCEPT HASHES (!) - # (e.g. 
both #auspol and auspol will appear in data) - df$text <- gsub("[^[:alnum:][:space:]#]", "", df$text) - - ## Find the most frequent terms across the tweet text corpus - commonTermsTemp <- df$text - - corpusTweetText <- Corpus(VectorSource(commonTermsTemp)) - - ## add usernames to stopwords - - mach_usernames <- sapply(df$screen_name, function(x) TrimOddChar(x)) - mach_usernames <- unique(mach_usernames) - if (length(macMatch)!=0) { - mach_usernames <- iconv(mach_usernames,to="utf-8-mac") - } - if (length(macMatch)==0) { - mach_usernames <- iconv(mach_usernames,to="utf-8") - } - - # we remove the usernames from the text (so they don't appear in data/network) - my_stopwords <- mach_usernames - corpusTweetText <- tm_map(corpusTweetText, removeWords, my_stopwords) - - # convert to all lowercase (WE WILL DO THIS AGAIN BELOW, SO REMOVE THIS DUPLICATE) - # corpusTweetText <- tm_map(corpusTweetText, content_transformer(tolower)) - - # remove English stop words (IF THE USER HAS SPECIFIED!) - if (stopwordsEnglish) { - corpusTweetText <- tm_map(corpusTweetText, removeWords, stopwords("english")) - } - - # eliminate extra whitespace - corpusTweetText <- tm_map(corpusTweetText, stripWhitespace) - - # create document term matrix applying some transformations - # note: applying too many transformations here (duplicating...) - need to fix - tdm = TermDocumentMatrix(corpusTweetText, - control = list(removeNumbers = TRUE, tolower = TRUE)) - - # create a vector of the common terms, finding the top N% terms - # N will need to be adjusted according to network / user requirements. - - mTemp <- as.matrix(tdm) - vTemp <- sort(rowSums(mTemp), decreasing=TRUE) - commonTerms <- names(head(vTemp, (length(vTemp) / 100) * termFreq)) - ################################ ^^^^ the default finds top 5% terms - - toDel <- grep("http",commonTerms) # !! still picking up junk terms (FIX) - if(length(toDel) > 0) { - commonTerms <- commonTerms[-toDel] # delete these junk terms + # now create the dfSemanticNetwork3, a dataframe of relations between hashtags and terms (i.e. hashtag i and term j + # both occurred in same tweet (weight = n occurrences)) + + df_stats <- networkStats(NULL, "collected tweets", nrow(df)) + + cat("Generating twitter semantic network...\n") + flush.console() + + # convert the hashtags to lowercase here (before using tm_map later) but first deal with character encoding + macMatch <- grep("darwin", R.Version()$os) + if (length(macMatch) != 0) { + # df$hashtags_used <- iconv(df$hashtags_used, to = "utf-8-mac") + df$hashtags <- lapply(df$hashtags, function(x) TrimOddCharMac(x)) + } + + if (length(macMatch) == 0) { + df$hashtags <- lapply(df$hashtags, function(x) TrimOddChar(x)) + } + + # and then convert to lowercase + df$hashtags <- lapply(df$hashtags, tolower) + + # do the same for the comment text, but first deal with character encoding! + # we need to change value of `to` argument in 'iconv' depending on OS, or else errors can occur + macMatch <- grep("darwin", R.Version()$os) + if (length(macMatch) != 0) { + df$text <- iconv(df$text, to = "utf-8-mac") + } + + if (length(macMatch) == 0) { + df$text <- iconv(df$text, to = "utf-8") + } + + # and then convert to lowercase + df$text <- tolower(df$text) + + hashtagsUsedTemp <- c() # temp var to store output + + # the 'hashtags_used' column in the 'df' dataframe is slightly problematic (i.e. 
not straightforward) + # because each cell in this column contains a LIST, itself containing 1 or more char vectors (which are unique + # hashtags found in the tweet text; empty if no hashtags used). + # so, need to extract each list item out, and put it into its own row in a new dataframe + count <- 0 + for (i in 1:nrow(df)) { + if (length(df$hashtags[[i]]) > 0) { # skip any rows where NO HASHTAGS were used + for (j in 1:length(df$hashtags[[i]])) { + count <- count + 1 + #commonTermsTemp <- c(commonTermsTemp, df$from_user[i]) + hashtagsUsedTemp <- c(hashtagsUsedTemp, df$hashtags[[i]][j]) } - - # create the "semantic hashtag-term network" dataframe - # (i.e. pairs of hashtags / terms) - - termAssociatedWithHashtag <- c() # temp var to store output - hashtagAssociatedWithTerm <- c() # temp var to store output - - for (i in 1:nrow(df)) { - if (length(df$hashtags_used[[i]]) > 0) { # skip any rows where NO HASHTAGS were used - for (j in 1:length(df$hashtags_used[[i]])) { - for (k in 1:length(commonTerms)) { - match <- grep(commonTerms[k],df$text[i]) - if (length(match) > 0) { - - termAssociatedWithHashtag <- c(termAssociatedWithHashtag,commonTerms[k]) - hashtagAssociatedWithTerm <- c(hashtagAssociatedWithTerm,df$hashtags_used[[i]][j]) - - } - } + } + } # try and vectorise this in future work to improve speed + df_stats <- networkStats(df_stats, "raw hashtags", count, FALSE) + + hashtagsUsedTemp <- unique(hashtagsUsedTemp) + df_stats <- networkStats(df_stats, "unique hashtags", length(hashtagsUsedTemp), FALSE) + + hashtagsUsedTempFrequency <- c() + + # potentially do not want EVERY hashtag - just the top N% (most common) + for (i in 1: length(hashtagsUsedTemp)) { + hashtagsUsedTempFrequency[i] <- length(grep(hashtagsUsedTemp[i], df$text)) + } + + mTemp <- cbind(hashtagsUsedTemp, hashtagsUsedTempFrequency) + mTemp2 <- as.matrix(as.numeric(mTemp[, 2])) + names(mTemp2) <- mTemp[, 1] + vTemp <- sort(mTemp2, decreasing = TRUE) + + # this defaults to top 50% hashtags + hashtagsUsedTemp <- names(head(vTemp, (length(vTemp) / 100) * hashtagFreq)) + df_stats <- networkStats(df_stats, paste0("top ", hashtagFreq , "% hashtags"), length(hashtagsUsedTemp), FALSE) + + # we need to remove all punctuation EXCEPT HASHES (!) (e.g. both #auspol and auspol will appear in data) + df$text <- gsub("[^[:alnum:][:space:]#]", "", df$text) + + # find the most frequent terms across the tweet text corpus + commonTermsTemp <- df$text + + corpusTweetText <- Corpus(VectorSource(commonTermsTemp)) + + # add usernames to stopwords + mach_usernames <- sapply(df$screen_name, function(x) TrimOddChar(x)) + mach_usernames <- unique(mach_usernames) + + if (length(macMatch) != 0) { + mach_usernames <- iconv(mach_usernames, to = "utf-8-mac") + } + + if (length(macMatch) == 0) { + mach_usernames <- iconv(mach_usernames, to = "utf-8") + } + + # we remove the usernames from the text (so they don't appear in data/network) + my_stopwords <- mach_usernames + corpusTweetText <- tm_map(corpusTweetText, removeWords, my_stopwords) + + # convert to all lowercase (WE WILL DO THIS AGAIN BELOW, SO REMOVE THIS DUPLICATE) + # corpusTweetText <- tm_map(corpusTweetText, content_transformer(tolower)) + + # remove English stop words (IF THE USER HAS SPECIFIED!) 
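The corpus handling around this point (stopword removal, whitespace stripping, a term-document matrix, then the top termFreq percent of terms) can be seen end to end in a minimal editorial sketch. It assumes only the tm package; the input strings are invented.

library(tm)

txt  <- c("politics and the budget #auspol",
          "question time and the budget #auspol",
          "politics #ausvotes")
corp <- Corpus(VectorSource(txt))
corp <- tm_map(corp, removeWords, stopwords("english"))   # drops 'and', 'the', ...
corp <- tm_map(corp, stripWhitespace)
tdm  <- TermDocumentMatrix(corp, control = list(removeNumbers = TRUE, tolower = TRUE))

freqs    <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
termFreq <- 50   # keep the top 50% of terms in this toy run (the package default is 5)
names(head(freqs, (length(freqs) / 100) * termFreq))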
+ if (stopwordsEnglish) { + corpusTweetText <- tm_map(corpusTweetText, removeWords, stopwords("english")) + } + + # eliminate extra whitespace + corpusTweetText <- tm_map(corpusTweetText, stripWhitespace) + + # create document term matrix applying some transformations + # ** applying too many transformations here (duplicating...) - need to fix + tdm = TermDocumentMatrix(corpusTweetText, control = list(removeNumbers = TRUE, tolower = TRUE)) + + # create a vector of the common terms, finding the top N% terms + # N will need to be adjusted according to network / user requirements + mTemp <- as.matrix(tdm) + vTemp <- sort(rowSums(mTemp), decreasing = TRUE) + df_stats <- networkStats(df_stats, paste0("common terms"), length(vTemp), FALSE) + + ## the default finds top 5% terms + commonTerms <- names(head(vTemp, (length(vTemp) / 100) * termFreq)) + + toDel <- grep("http", commonTerms) # !! still picking up junk terms (FIX) + if (length(toDel) > 0) { + commonTerms <- commonTerms[-toDel] # delete these junk terms + } + df_stats <- networkStats(df_stats, paste0("top ", termFreq , "% terms"), length(commonTerms), FALSE) + + # create the "semantic hashtag-term network" dataframe (i.e. pairs of hashtags / terms) + + termAssociatedWithHashtag <- c() # temp var to store output + hashtagAssociatedWithTerm <- c() # temp var to store output + + for (i in 1:nrow(df)) { + if (length(df$hashtags[[i]]) > 0) { # skip any rows where NO HASHTAGS were used + for (j in 1:length(df$hashtags[[i]])) { + for (k in 1:length(commonTerms)) { + + match <- grep(commonTerms[k], df$text[i]) + + if (length(match) > 0) { + termAssociatedWithHashtag <- c(termAssociatedWithHashtag, commonTerms[k]) + hashtagAssociatedWithTerm <- c(hashtagAssociatedWithTerm, df$hashtags[[i]][j]) } } - } # THIS IS A *HORRIBLE* LOOPED APPROACH. NEED TO VECTORISE!!! - - # this needs to be changed to termAssociatedWithHashtag and hashtagAssociatedWithTerm - dfSemanticNetwork3 <- data.frame(hashtagAssociatedWithTerm, termAssociatedWithHashtag) - - # OK, now extract only the UNIQUE pairs (i.e. rows) - # But, also create a WEIGHT value for usages of the same hashtag. - # NOTE: This edge weights approach might be problematic for TEMPORAL networks, because each edge (with weight > 1) may represent usage of hashtags at DIFFERENT TIMES. - # NOTE: A possible workaround could be to include an edge attribute that is a set of timestamp elements, showing the date/time of each instance of usage of a hashtag. - # NOTE: For example, in a temporal visualisation, the first timestamp might 'pop in' the edge to the graph, which then might start to 'fade out' over time (or just 'pop out' of graph after N seconds) if there are no more timestamps indicating activity (i.e. a user using a hashtag). - # NOTE: So, a 'timestamps' edge attribute could factor into a kind of 'entropy' based approach to evolving the network visually over time. - - # unique pairs: - unique_dfSemanticNetwork3 <- unique(dfSemanticNetwork3) # hmm, need this still? - - # number of times hashtag was used per user/hashtag pair (i.e. 
edge weight): - for (i in 1:nrow(unique_dfSemanticNetwork3)) { - unique_dfSemanticNetwork3$numHashtagTermOccurrences[i] <- sum( - hashtagAssociatedWithTerm==unique_dfSemanticNetwork3[i,1] & - termAssociatedWithHashtag==unique_dfSemanticNetwork3[i,2]) - } - - # make a dataframe of the relations between actors - relations <- data.frame(from=as.character(unique_dfSemanticNetwork3[,1]),to=as.character(unique_dfSemanticNetwork3[,2]),weight=unique_dfSemanticNetwork3$numHashtagTermOccurrences) - relations$from <- as.factor(relations$from) - relations$to <- as.factor(relations$to) - - actorsFixed <- rbind(as.character(unique_dfSemanticNetwork3[,1]),as.character(unique_dfSemanticNetwork3[,2])) - actorsFixed <- as.factor(actorsFixed) - actorsFixed <- unique(actorsFixed) - - ##### STEP FOUR ##### - - # convert into a graph - suppressWarnings( - g <- graph.data.frame(relations, directed=FALSE, vertices=actorsFixed) - ) - # we need to simplify the graph because multiple use of same term - # in one tweet will cause self-loops, etc - # g <- simplify(g) - - # Make the node labels play nice with Gephi - V(g)$label <- V(g)$name - - # remove the search term / hashtags, if user specified it: - if (removeTermsOrHashtags[1]!="foobar") { - toDel <- match(tolower(removeTermsOrHashtags),V(g)$name) # we force to lowercase because all terms/hashtags are already converted to lowercase - toDel <- toDel[!is.na(toDel)] # in case of user error (i.e. trying to delete terms/hashtags that don't exist in the data) - g <- delete.vertices(g, toDel) - } - - if (writeToFile=="TRUE" | writeToFile=="true" | writeToFile=="T" | writeToFile==TRUE) { - # Output the final network to a graphml file, to import directly into Gephi - currTime <- format(Sys.time(), "%b_%d_%X_%Y_%Z") - currTime <- gsub(":","_",currTime) - write.graph(g,paste0(currTime,"_TwitterSemanticNetwork.graphml"),format="graphml") - cat("Twitter semantic network was written to current working directory, with filename:\n") - cat(paste0(currTime,"_TwitterSemanticNetwork.graphml")) } - - cat("\nDone.") ### DEBUG - flush.console() - - return(g) - + } + } # THIS IS A *HORRIBLE* LOOPED APPROACH. NEED TO VECTORISE!!! + + # this needs to be changed to termAssociatedWithHashtag and hashtagAssociatedWithTerm + dfSemanticNetwork3 <- data.frame(hashtagAssociatedWithTerm, termAssociatedWithHashtag) + + # OK, now extract only the UNIQUE pairs (i.e. rows) + # But, also create a WEIGHT value for usages of the same hashtag. + # NOTE: This edge weights approach might be problematic for TEMPORAL networks, because each edge (with weight > 1) + # may represent usage of hashtags at DIFFERENT TIMES. + # NOTE: A possible workaround could be to include an edge attribute that is a set of timestamp elements, showing the + # date/time of each instance of usage of a hashtag. + # NOTE: For example, in a temporal visualisation, the first timestamp might 'pop in' the edge to the graph, which + # then might start to 'fade out' over time (or just 'pop out' of graph after N seconds) if there are no more + # timestamps indicating activity (i.e. a user using a hashtag). + # NOTE: So, a 'timestamps' edge attribute could factor into a kind of 'entropy' based approach to evolving the + # network visually over time. + + # unique pairs + unique_dfSemanticNetwork3 <- unique(dfSemanticNetwork3) # hmm, need this still? + + # number of times hashtag was used per user/hashtag pair (i.e. 
edge weight): + for (i in 1:nrow(unique_dfSemanticNetwork3)) { + unique_dfSemanticNetwork3$numHashtagTermOccurrences[i] <- sum( + hashtagAssociatedWithTerm == unique_dfSemanticNetwork3[i, 1] & + termAssociatedWithHashtag == unique_dfSemanticNetwork3[i, 2]) + } + + # make a dataframe of the relations between actors + relations <- data.frame(from = as.character(unique_dfSemanticNetwork3[, 1]), + to = as.character(unique_dfSemanticNetwork3[,2]), + weight = unique_dfSemanticNetwork3$numHashtagTermOccurrences) + + relations$from <- as.factor(relations$from) + relations$to <- as.factor(relations$to) + + actorsFixed <- rbind(as.character(unique_dfSemanticNetwork3[, 1]), as.character(unique_dfSemanticNetwork3[, 2])) + actorsFixed <- as.factor(actorsFixed) + actorsFixed <- unique(actorsFixed) + df_stats <- networkStats(df_stats, "unique entities (nodes)", length(actorsFixed)) + df_stats <- networkStats(df_stats, "relations (edges)", nrow(relations)) + + # convert into a graph + suppressWarnings(g <- graph.data.frame(relations, directed = FALSE, vertices = actorsFixed)) + + # we need to simplify the graph because multiple use of same term in one tweet will cause self-loops, etc + # g <- simplify(g) + + # make the node labels play nice with Gephi + V(g)$label <- V(g)$name + + # remove the search term / hashtags, if user specified it + if (removeTermsOrHashtags[1] != "foobar") { + # we force to lowercase because all terms/hashtags are already converted to lowercase + toDel <- match(tolower(removeTermsOrHashtags), V(g)$name) + + # in case of user error (i.e. trying to delete terms/hashtags that don't exist in the data) + toDel <- toDel[!is.na(toDel)] + + g <- delete.vertices(g, toDel) + + df_stats <- networkStats(df_stats, "entities after terms/hashtags removed", vcount(g)) + } + + # print stats + if (verbose) { networkStats(df_stats, print = TRUE) } + + if (writeToFile) { writeOutputFile(g, "graphml", "TwitterSemanticNetwork") } + + cat("Done.\n") + flush.console() + + return(g) } diff --git a/vosonSML/R/EnsurePackage.R b/vosonSML/R/EnsurePackage.R deleted file mode 100644 index 8e24ffe..0000000 --- a/vosonSML/R/EnsurePackage.R +++ /dev/null @@ -1,12 +0,0 @@ -EnsurePackage <- -function(x) { - # EnsurePackage(x) - Installs and loads a package if necessary - # Args: - # x: name of package - - x <- as.character(x) - if (!require(x, character.only=TRUE)) { - install.packages(pkgs=x, repos="http://cran.r-project.org") - require(x, character.only=TRUE) - } -} diff --git a/vosonSML/R/ExtractHashtagInfo.R b/vosonSML/R/ExtractHashtagInfo.R deleted file mode 100644 index 784bdda..0000000 --- a/vosonSML/R/ExtractHashtagInfo.R +++ /dev/null @@ -1,27 +0,0 @@ -ExtractHashtagInfo <- -function(df) { - # For each tweet, extract ANY hashtags that a user has used: - - df$hashtags_used <- sapply(df$text, function(tweet) - - # OLD WAY: - # TrimHead(str_match_all(tweet,"#[[:alnum:]_+]*")[[1]]) - - # NEW WAY: - # This matches hashtags, but not if the hashtag is "cut off" at the end - # of the tweet text, denoted by a 'trailing ellipsis' character. - # This avoids the problem of picking up erroneous hashtags that are cut off, - # e.g. "#ausp..." 
when it should be "#auspol" - - # horizontalEllipsis <- "\u2026" - # horizontalEllipsisFixed <- stri_unescape_unicode(horizontalEllipsis) - - # patternRegex <- paste0("#[^#\\s]+(?!\\\u2026)\\b") - # TrimHead(str_match_all(tweet,paste0("#[[:alnum:]_+^",horizontalEllipsis,"$]*"))[[1]]) - - regmatches(tweet, gregexpr("#[^#\\s]+(?!\u2026)\\b", tweet, perl=T)) - - ) - - return(df) -} diff --git a/vosonSML/R/ExtractUrls.R b/vosonSML/R/ExtractUrls.R deleted file mode 100644 index 73bd0dc..0000000 --- a/vosonSML/R/ExtractUrls.R +++ /dev/null @@ -1,16 +0,0 @@ -ExtractUrls <- -function(df) { - # For each tweet, extract url, remove it from the tweet, - # and put them separately in a new column - # TODO: cannot deal with multiple urls in one tweet right now - - # EnsurePackage("stringr") - # EnsurePackage("grid") - - # extracts links (quick and dirty) - # wish to have something like http://daringfireball.net/2009/11/liberal_regex_for_matching_urls - df$links <- sapply(df$text,function(tweet) str_extract(tweet,("http[^[:blank:]]+"))) - df$text <- sapply(df$text, function(x) TrimUrls(x)) - - return(df) -} diff --git a/vosonSML/R/ExtractUserInfo.R b/vosonSML/R/ExtractUserInfo.R deleted file mode 100644 index fcb919c..0000000 --- a/vosonSML/R/ExtractUserInfo.R +++ /dev/null @@ -1,19 +0,0 @@ -ExtractUserInfo <- -function(df) { - # For each tweet, extract information related to users - # such as to_user, rt_user... - - # extract to_user - df$reply_to <- sapply(df$text, function(tweet) - TrimHead(str_extract(tweet,"^((\\.)?(@[[:alnum:]_+]*))"))) - - # extract any MENTIONS at all (inc. replies, mentions, etc) ### This is a completely new approach - it 'vacuums' up ANY mentions - df$users_mentioned <- sapply(df$text, function(tweet) - TrimHead(str_match_all(tweet,"@[[:alnum:]_+]*")[[1]])) - - # extract rt_user - df$retweet_from <- sapply(df$text, function(tweet) - TrimHead(str_extract(tweet,"^[RM]T (@[[:alnum:]_+]*)"))) - - return(df) -} diff --git a/vosonSML/R/GetYoutubeVideoIDs.R b/vosonSML/R/GetYoutubeVideoIDs.R index ab366e6..0bc0537 100644 --- a/vosonSML/R/GetYoutubeVideoIDs.R +++ b/vosonSML/R/GetYoutubeVideoIDs.R @@ -1,60 +1,51 @@ -#' Extract/scrape the IDs from a set of YouTube video URLs -#' -#' This function reads a list of YouTube video URLs from a text file and -#' converts them to a vector object. For example, -#' "https://www.youtube.com/watch?v=73I5dRucCds" has the ID "73I5dRucCds". This -#' function can be used to create an object for the argument \code{videoIDs} in -#' the function \code{CollectDataYoutube}, that is, by extracting the IDs for a -#' set of YouTube videos and compiling them into a vector, ready for collecting -#' data with \code{CollectDataYoutube}. -#' -#' -#' @param file The connection to read from. This can be a local file, or a http -#' or ftp connection. It can also be a character string with the file name or -#' URI. The file must be plain text format with the URL of each YouTube video -#' specified on a new line (separated by character return). For example, the -#' first line might contain https://www.youtube.com/watch?v=73I5dRucCds, and -#' the second line might contain https://www.youtube.com/watch?v=6S9r_YbqHy8. -#' @return a character vector representing a set of YouTube video IDs, each -#' with number of characters equal to 11 (e.g. "73I5dRucCds"). -#' @note This function is useful for lots of videos. However, many videos may -#' take a *long* time to collect data from. 
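For reference, the extraction this helper performs reduces to taking characters 33 to 43 of a standard watch URL. A quick editorial illustration using the URL already quoted in the documentation:

url <- "https://www.youtube.com/watch?v=73I5dRucCds"
substr(url, 33, 43)   # "73I5dRucCds" - the 11-character video ID, matching the substr() call in the function body below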
In such cases it is recommended to -#' use the \code{verbose=TRUE} argument for the function -#' \code{CollectDataYoutube}, in order to keep track of progress during -#' computation. -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso Use \code{CollectDataYoutube} for collecting YouTube comments data. +#' Extract the IDs from a set of YouTube video URLs +#' +#' This function reads a list of YouTube video URLs from a text file and converts them to a vector object. For example, +#' "https://www.youtube.com/watch?v=73I5dRucCds" has the ID "73I5dRucCds". This function can be used to create an +#' object for the argument \code{videoIDs} in the function \code{CollectDataYoutube}, that is, by extracting the IDs +#' for a set of YouTube videos and compiling them into a vector, ready for collecting data with +#' \code{CollectDataYoutube}. +#' +#' @param file The connection to read from. This can be a local file, or a http or ftp connection. It can also be a +#' character string with the file name or URI. The file must be plain text format with the URL of each YouTube video +#' specified on a new line (separated by character return). For example, the first line might contain +#' https://www.youtube.com/watch?v=73I5dRucCds, and the second line might contain +#' https://www.youtube.com/watch?v=6S9r_YbqHy8. +#' +#' @return a character vector representing a set of YouTube video IDs, each with number of characters equal to 11 +#' (e.g. "73I5dRucCds"). +#' +#' @note This function is useful for lots of videos. However, many videos may take a *long* time to collect data from. +#' In such cases it is recommended to use the \code{verbose = TRUE} argument for the function \code{CollectDataYoutube} +#' , in order to keep track of progress during computation. +#' +#' @seealso \code{CollectDataYoutube} #' @keywords youtube scraping vosonSML -#' @examples #' +#' @examples #' \dontrun{ -#' ## This example shows how to use `GetYoutubeVideoIDs` to extract video IDs from YouTube -#' ## video URLs, and then collect data using the function `CollectDataYoutube` +#' # this example shows how to use 'GetYoutubeVideoIDs' to extract video IDs from YouTube video +#' # URLs, and then collect data using the function 'CollectDataYoutube' #' -#' # Use your own Google Developer API Key here: -#' myApiKey <- "1234567890" +#' # set your Google Developer API key +#' myYtApiKey <- "xxxxxxxxxx" #' -#' # Authenticate with the Google API -#' apiKeyYoutube <- AuthenticateWithYoutubeAPI(apiKeyYoutube=myApiKey) +#' # authenticate with the Google API +#' apiKeyYoutube <- AuthenticateWithYoutubeAPI(apiKeyYoutube = myYtApiKey) #' -#' # Use the function `GetYoutubeVideoIDs` to automatically generate vector of IDs from -#' # a plain text file of video URLs -#' videoIDs <- GetYoutubeVideoIDs(file="youtube_to_scrape.txt") +#' # use the function 'GetYoutubeVideoIDs' to automatically generate vector of IDs from a plain +#' # text file of video URLs +#' videoIDs <- GetYoutubeVideoIDs(file = "youtube_urls_to_scrape.txt") #' -#' # Collect the data using function `CollectDataYoutube` -#' myYoutubeData <- CollectDataYoutube(videoIDs,apiKeyYoutube,writeToFile=FALSE) +#' # collect the data using function 'CollectDataYoutube' +#' myYoutubeData <- CollectDataYoutube(videoIDs, apiKeyYoutube, writeToFile = FALSE) #' } +#' #' @export -GetYoutubeVideoIDs <- -function(file){ - - videoIDsTemp <- read.table(file, - sep="\n", - strip.white=TRUE) # in case of user input error - +GetYoutubeVideoIDs <- function(file) { + + videoIDsTemp <- read.table(file, sep = 
"\n", strip.white = TRUE) # in case of user input error videoIDsTemp <- as.vector(videoIDsTemp$V1) - - videoIDsOut <- substr(videoIDsTemp,33,43) - + + videoIDsOut <- substr(videoIDsTemp, 33, 43) } diff --git a/vosonSML/R/GraphUserInfoTwitter.R b/vosonSML/R/GraphUserInfoTwitter.R new file mode 100644 index 0000000..2f5607e --- /dev/null +++ b/vosonSML/R/GraphUserInfoTwitter.R @@ -0,0 +1,91 @@ +#' Create twitter network graph with user information attributes +#' +#' Creates a network from the relations and users dataframes generated by Create. Network is supplemented with +#' additional downloaded user information applied as node attributes. +#' +#' @param df_collect A dataframe containing the collected tweet data from \code{Collect}. +#' @param df_relations A dataframe containing the network relations data from \code{Create}. +#' @param df_users A dataframe containing the network users data from \code{Create}. +#' @param lookup_missing_users Logical. Request user information for any users missing from df_collect. Default +#' is \code{TRUE}. +#' @param twitter_token An twitter authentication token from \code{Authenticate}. +#' @param writeToFile Logical. If \code{TRUE} a data frame of user information and the resulting network graph will +#' be saved to file. Default is \code{FALSE}. +#' +#' @note Only supports twitter actor network at this time. Bimodal network support will require the filtering +#' of twitter user ids from nodes of other types. +#' +#' @return A list containing a dataframe with user information and an igraph object of the twitter network with +#' user node attributes. +#' +#' @export +GraphUserInfoTwitter <- function(df_collect, df_relations, df_users, lookup_missing_users = TRUE, + twitter_token = NULL, writeToFile = FALSE) { + + cat("Creating twitter network graph with user information as node attributes...\n") + flush.console() + + df_users %<>% dplyr::mutate_all(as.character) # changes all col types to character + + df_users_info <- rtweet::users_data(df_collect) %>% dplyr::distinct(.data$user_id, .keep_all = TRUE) + df_users_info %<>% dplyr::mutate_all(as.character) # changes all col types to character + df_missing_users <- dplyr::anti_join(df_users, df_users_info, by = "user_id") %>% + dplyr::distinct(.data$user_id, .keep_all = TRUE) + + df_missing_users_info <- NULL + if (lookup_missing_users) { + if (is.null(twitter_token)) { + cat("Please supply rtweet twitter authentication token to look up missing users info.\n") + } else { + cat(paste0("Fetching user information for ", nrow(df_missing_users), " users.\n")) + + # 90000 users per 15 mins with unused rate limit + df_lookup_data <- rtweet::lookup_users(df_missing_users$user_id, parse = TRUE, + token = twitter_token$auth) + df_missing_users_info <- rtweet::users_data(df_lookup_data) + cat(paste0("User information collected for ", nrow(df_missing_users_info), " users.\n")) + + if (nrow(df_missing_users) != nrow(df_missing_users_info)) { + cat("Collected user records does not match the number requested. 
Adding incomplete records back in.\n") + df_not_collected <- dplyr::anti_join(df_missing_users, df_missing_users_info, by = "user_id") + df_missing_users_info <- dplyr::bind_rows(df_missing_users_info, df_not_collected) + } + } + } else { + cat("No additional users information fetched.\n") + } + + if (!is.null(df_missing_users_info)) { + df_users_info_all <- rbind(df_users_info, df_missing_users_info) + } else { + df_users_info_all <- dplyr::bind_rows(df_users_info, df_missing_users) + } + + df_users_info_all %<>% dplyr::rename("display_name" = .data$name, "name" = .data$user_id) + + # fix numeric cols type and replacing na's for convenience + # col names ending in "count" + df_users_info_all %<>% dplyr::mutate_at(vars(ends_with("count")), funs(ifelse(is.na(.data$.), as.integer(0), + as.integer(.data$.)))) + + if (!is.null(df_missing_users_info) & writeToFile) { + writeOutputFile(df_users_info_all, "rds", "TwitterUserInfo") + } + + g <- graph_from_data_frame(df_relations, directed = TRUE, vertices = df_users_info_all) + + V(g)$screen_name <- ifelse(is.na(V(g)$screen_name), paste0("ID:", V(g)$name), V(g)$screen_name) + V(g)$label <- V(g)$screen_name + + if (writeToFile) { writeOutputFile(g, "graphml", "TwitterUserNetwork") } + + cat("Done.\n") + flush.console() + + function_output <- list( + "users" = df_users_info_all, + "graph" = g + ) + + return(function_output) +} diff --git a/vosonSML/R/importData.R b/vosonSML/R/ImportData.R similarity index 96% rename from vosonSML/R/importData.R rename to vosonSML/R/ImportData.R index b262d5f..3ce8400 100644 --- a/vosonSML/R/importData.R +++ b/vosonSML/R/ImportData.R @@ -28,13 +28,13 @@ #' rangeTo="2015-03-02", writeToFile=TRUE) #' #' # Import the data (that was saved to disk in the previous step) -#' myStarWarsData <- importData("2015-03-01_to_2015-03-02_StarWars_FacebookData.csv","facebook") +#' myStarWarsData <- ImportData("2015-03-01_to_2015-03-02_StarWars_FacebookData.csv","facebook") #' #' # Create a network using the imported dataframe object #' myNetwork <- myStarWarsData %>% Create("Bimodal") #' } #' @export -importData <- function(file,dataSource) { +ImportData <- function(file, dataSource) { df <- read.csv(file) #if(all(colnames(df)==c("X","from","to","edgeType","postType","postLink","postTimestamp","commentText","commentTimestamp"))) { # class(df) <- c("data.table","data.frame","dataSource","facebook") diff --git a/vosonSML/R/PopulateUserInfo.R b/vosonSML/R/PopulateUserInfo.R deleted file mode 100644 index f8e0d6a..0000000 --- a/vosonSML/R/PopulateUserInfo.R +++ /dev/null @@ -1,187 +0,0 @@ -#' Populate Twitter networks with user information -#' -#' This function is used to 'populate' Twitter networks (generated -#' with the \code{\link{Create}} function) with information about -#' the users in the network. This involves calls to the Twitter API -#' to collect this information, which is then applied to the network -#' as vertex attributes. 
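The role described here appears to be taken over by the new GraphUserInfoTwitter() added earlier in this diff. A hedged usage sketch of that replacement follows: 'tweets', 'actorNetwork' and 'twitterAuth' are hypothetical objects standing in for the outputs of Collect, Create and Authenticate, and the assumption that the actor network object exposes relations and users data frames is the editor's, not something stated in the diff.

result <- GraphUserInfoTwitter(df_collect   = tweets,
                               df_relations = actorNetwork$relations,
                               df_users     = actorNetwork$users,
                               lookup_missing_users = TRUE,
                               twitter_token = twitterAuth,
                               writeToFile  = FALSE)

g_actor    <- result$graph   # igraph network with user information as node attributes
users_info <- result$users   # data frame of the user information applied to the graph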
-#' -#' @param networkObject an igraph graph object created with \code{\link{Create}} -#' @return An igraph graph object -#' @author Timothy Graham & Robert Ackland -#' -#' @seealso \code{\link{Collect}}, \code{\link{Create}} -#' @examples -#' -#' \dontrun{ -#' require(magrittr) -#' ## Get Twitter user information and apply to network -#' myTwitterNetwork_userInfo <- PopulateUserInfo(myTwitterNetwork) -#' -#' } -#' @export -PopulateUserInfo <- function(networkObject) { - name=NULL # appease the gods of R CMD CHECK - # This function is supposed to perform the lookups in batches - # and mind the rate limit: - getUserObjects <- function(users) { - groups <- split(users, ceiling(seq_along(users)/50)) - userObjects <- ldply(groups, function(group) { # ldply is a very cool function, found in plyr package. - objects <- lookupUsers(group, includeNA=TRUE) - out <- twListToDF(objects) # twListToDF is also a handy function, found in twitteR package. Converts weird class object to data frame. - # print("Waiting for 15 minutes (to 'refresh' the rate limit)...") # Don't need to use this yet. Implement later for number of users > 7500 (have to do chunked batches... chunks of chunks... urrghh) - # Sys.sleep(900) - return(out) - }) - return(userObjects) - } - - # get the list of users - listOfUsers <- unique(V(networkObject)$name) - listOfUsers_actual <- listOfUsers[-grep("^#",listOfUsers)] - - ## Test if it is a Twitter actor network - if (length(listOfUsers_actual)<1) { - - # predefine a data table to store the results (later delete unneeded rows) - actors <- data.table( - name=as.character(c(rep("NA_f00",length(listOfUsers)))), - userDescription=as.character(c(rep("NA_f00",length(listOfUsers)))), - statusesCount=as.character(c(rep("NA_f00",length(listOfUsers)))), - followersCount=as.character(c(rep("NA_f00",length(listOfUsers)))), - favoritesCount=as.character(c(rep("NA_f00",length(listOfUsers)))), - friendsCount=as.character(c(rep("NA_f00",length(listOfUsers)))), - url=as.character(c(rep("NA_f00",length(listOfUsers)))), - realName=as.character(c(rep("NA_f00",length(listOfUsers)))), - dateAccountCreated=as.character(c(rep("NA_f00",length(listOfUsers)))), - userLocation=as.character(c(rep("NA_f00",length(listOfUsers)))), - userLanguage=as.character(c(rep("NA_f00",length(listOfUsers)))), - numberOfListsUserIsFeaturedOn=as.character(c(rep("NA_f00",length(listOfUsers)))), - profileImageUrl=as.character(c(rep("NA_f00",length(listOfUsers)))) - ) - - setkey(actors,name) # set the key value of the data table - - nextEmptyRow <- 1 # so we can update rows in `dataCombined` in a relatively efficient way - - # This function is supposed to perform the lookups in batches - # and mind the rate limit: - getUserObjects <- function(users) { - groups <- split(users, ceiling(seq_along(users)/50)) - userObjects <- ldply(groups, function(group) { # ldply is a very cool function, found in plyr package. - objects <- lookupUsers(group, includeNA=TRUE) - out <- twListToDF(objects) # twListToDF is also a handy function, found in twitteR package. Converts weird class object to data frame. - # print("Waiting for 15 minutes (to 'refresh' the rate limit)...") # Don't need to use this yet. Implement later for number of users > 7500 (have to do chunked batches... chunks of chunks... 
urrghh) - # Sys.sleep(900) - return(out) - }) - return(userObjects) - } - - # Collect user data (will return NA for users who don't exist) - - # query the user data - cat("\n Fetching the user data...\n") # DEBUG - usersInformationAttributes <- getUserObjects(listOfUsers) - actorsInfoDF <- usersInformationAttributes - - actors$name <- actorsInfoDF$screenName - actors$userDescription <- actorsInfoDF$description - actors$statusesCount <- actorsInfoDF$statusesCount - actors$followersCount <- actorsInfoDF$followersCount - actors$favoritesCount <- actorsInfoDF$favoritesCount - actors$friendsCount <- actorsInfoDF$friendsCount - actors$url <- actorsInfoDF$url - actors$realName <- actorsInfoDF$name - actors$dateAccountCreated <- actorsInfoDF$created - actors$userLocation <- actorsInfoDF$location - actors$userLanguage <- actorsInfoDF$lang - actors$numberOfListsUserIsFeaturedOn <- actorsInfoDF$listedCount - actors$profileImageUrl <- actorsInfoDF$profileImageUrl - - # the final thing to do is apply the values in `actors` to the networkObject - - V(networkObject)$screenName <- actors$name - V(networkObject)$userDescription <- actors$userDescription - V(networkObject)$statusesCount <- actors$statusesCount - V(networkObject)$followersCount <- actors$followersCount - V(networkObject)$favoritesCount <- actors$favoritesCount - V(networkObject)$friendsCount <- actors$friendsCount - V(networkObject)$url <- actors$url - V(networkObject)$realName <- actors$realName - V(networkObject)$dateAccountCreated <- actors$dateAccountCreated - V(networkObject)$userLocation <- actors$userLocation - V(networkObject)$userLanguage <- actors$userLanguage - V(networkObject)$numberOfListsUserIsFeaturedOn <- actors$numberOfListsUserIsFeaturedOn - V(networkObject)$profileImageUrl <- actors$profileImageUrl - - return(networkObject) - - } - - ## Test if it is a Twitter bimodal network - if (length(listOfUsers_actual)>=1) { - - # predefine a data table to store the results (later delete unneeded rows) - actors <- data.table( - name=as.character(c(rep(NA,length(listOfUsers)))), - userDescription=as.character(c(rep(NA,length(listOfUsers)))), - statusesCount=as.character(c(rep(NA,length(listOfUsers)))), - followersCount=as.character(c(rep(NA,length(listOfUsers)))), - favoritesCount=as.character(c(rep(NA,length(listOfUsers)))), - friendsCount=as.character(c(rep(NA,length(listOfUsers)))), - url=as.character(c(rep(NA,length(listOfUsers)))), - realName=as.character(c(rep(NA,length(listOfUsers)))), - dateAccountCreated=as.character(c(rep(NA,length(listOfUsers)))), - userLocation=as.character(c(rep(NA,length(listOfUsers)))), - userLanguage=as.character(c(rep(NA,length(listOfUsers)))), - numberOfListsUserIsFeaturedOn=as.character(c(rep(NA,length(listOfUsers)))), - profileImageUrl=as.character(c(rep(NA,length(listOfUsers)))) - ) - - setkey(actors,name) # set the key value of the data table - - # query the user data - cat("\n Fetching the user data...\n") # DEBUG - usersInformationAttributes <- getUserObjects(listOfUsers_actual) # exclude hashtag vertices - actorsInfoDF <- usersInformationAttributes - actorsInfoDF_hashtag_NA <- - - # but this does not give us data for 'hashtag' type vertices (none exists of course) - # so, need to fill in this information manually - - actors$name[1:length(listOfUsers_actual)] <- actorsInfoDF$screenName - actors$userDescription[1:length(listOfUsers_actual)] <- actorsInfoDF$description - actors$statusesCount[1:length(listOfUsers_actual)] <- actorsInfoDF$statusesCount - 
actors$followersCount[1:length(listOfUsers_actual)] <- actorsInfoDF$followersCount - actors$favoritesCount[1:length(listOfUsers_actual)] <- actorsInfoDF$favoritesCount - actors$friendsCount[1:length(listOfUsers_actual)] <- actorsInfoDF$friendsCount - actors$url[1:length(listOfUsers_actual)] <- actorsInfoDF$url - actors$realName[1:length(listOfUsers_actual)] <- actorsInfoDF$name - actors$dateAccountCreated[1:length(listOfUsers_actual)] <- actorsInfoDF$created - actors$userLocation[1:length(listOfUsers_actual)] <- actorsInfoDF$location - actors$userLanguage[1:length(listOfUsers_actual)] <- actorsInfoDF$lang - actors$numberOfListsUserIsFeaturedOn[1:length(listOfUsers_actual)] <- actorsInfoDF$listedCount - actors$profileImageUrl[1:length(listOfUsers_actual)] <- actorsInfoDF$profileImageUrl - - # the final thing to do is apply the values in `actors` to the networkObject - - V(networkObject)$screenName <- actors$name - V(networkObject)$userDescription <- actors$userDescription - V(networkObject)$statusesCount <- actors$statusesCount - V(networkObject)$followersCount <- actors$followersCount - V(networkObject)$favoritesCount <- actors$favoritesCount - V(networkObject)$friendsCount <- actors$friendsCount - V(networkObject)$url <- actors$url - V(networkObject)$realName <- actors$realName - V(networkObject)$dateAccountCreated <- actors$dateAccountCreated - V(networkObject)$userLocation <- actors$userLocation - V(networkObject)$userLanguage <- actors$userLanguage - V(networkObject)$numberOfListsUserIsFeaturedOn <- actors$numberOfListsUserIsFeaturedOn - V(networkObject)$profileImageUrl <- actors$profileImageUrl - - return(networkObject) - - } - -} diff --git a/vosonSML/R/PreprocessTweets.R b/vosonSML/R/PreprocessTweets.R deleted file mode 100644 index 356d542..0000000 --- a/vosonSML/R/PreprocessTweets.R +++ /dev/null @@ -1,13 +0,0 @@ -PreprocessTweets <- -function(df) { - # Perform a few preprocessing tasks - - # removing odd characters - df.new <- RemoveOddChars(df) - # extract user info and add to df - df.new <- ExtractUserInfo(df.new) - # extract urls and add to df - df.new <- ExtractUrls(df.new) - - return(df.new) -} diff --git a/vosonSML/R/RemoveOddChars.R b/vosonSML/R/RemoveOddChars.R deleted file mode 100644 index 56aa21b..0000000 --- a/vosonSML/R/RemoveOddChars.R +++ /dev/null @@ -1,6 +0,0 @@ -RemoveOddChars <- -function(df) { - # Remove odd characters in tweets - df$text <- sapply(df$text, function(x) TrimOddChar(x)) - return(df) -} diff --git a/vosonSML/R/RemoveOddCharsUserInfo.R b/vosonSML/R/RemoveOddCharsUserInfo.R deleted file mode 100644 index cb70c27..0000000 --- a/vosonSML/R/RemoveOddCharsUserInfo.R +++ /dev/null @@ -1,13 +0,0 @@ -RemoveOddCharsUserInfo <- -function(actorsInfoDF) { - # Remove odd characters in the user information attributes - # Odd characters is especially problematic for search queries that trawl non-English speaking users/collectives. 
- actorsInfoDF$screenName <- sapply(actorsInfoDF$screenName, function(x) TrimOddChar(x)) - actorsInfoDF$description <- sapply(actorsInfoDF$description, function(x) TrimOddChar(x)) - actorsInfoDF$url <- sapply(actorsInfoDF$url, function(x) TrimOddChar(x)) - actorsInfoDF$name <- sapply(actorsInfoDF$name, function(x) TrimOddChar(x)) - actorsInfoDF$location <- sapply(actorsInfoDF$location, function(x) TrimOddChar(x)) - actorsInfoDF$lang <- sapply(actorsInfoDF$lang, function(x) TrimOddChar(x)) - actorsInfoDF$profileImageUrl <- sapply(actorsInfoDF$profileImageUrl, function(x) TrimOddChar(x)) - return(actorsInfoDF) -} diff --git a/vosonSML/R/SaveCredential.R b/vosonSML/R/SaveCredential.R new file mode 100644 index 0000000..7e836db --- /dev/null +++ b/vosonSML/R/SaveCredential.R @@ -0,0 +1,51 @@ +#' Save and load credential information +#' +#' Functions to save and load credential information. Currently, credential information will be stored as a RDS file. +#' \code{SaveCredential} will return the input \code{credential}, useful for working as a filter between +#' \code{Authenticate} and \code{Collect}. +#' +#' @aliases SaveCredential LoadCredential +#' +#' @param credential A \code{credential} object. +#' @param filename Character string. Filename to be saved to or restored from. Default value is \code{credential.RDS}. +#' +#' @return A \code{credential} object. +#' +#' @examples +#' \dontrun{ +#' require(magrittr) +#' +#' ## save credential example +#' +#' myIgAppID <- "xxxxxxxxxxx" +#' myIgAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +#' listIgUsernames <- c("senjohnmccain", "obama") +#' +#' Authenticate("instagram", appID = myIgAppID, appSecret = myIgAppSecret) %>% +#' SaveCredential("instagramCred.RDS") %>% +#' Collect(ego = TRUE, username = listIgUsernames) %>% Create() +#' +#' ## load previously saved credential example +#' +#' LoadCredential("instagramCred.RDS") %>% +#' Collect(tag = "obama", distance = 5000, n = 100) %>% Create("bimodal") +#' } +#' +#' @export +SaveCredential <- function(credential, filename) { + if (missing(credential) || missing(filename)) { + stop("please supply a credential object and credential file name to save.") + } + saveRDS(credential, filename) + return(credential) +} + +#' @rdname SaveCredential +#' @export +LoadCredential <- function(filename) { + if (missing(filename)) { + stop("please supply a credential file name to load.") + } + credential <- readRDS(filename) + return(credential) +} diff --git a/vosonSML/R/TrimAt.R b/vosonSML/R/TrimAt.R deleted file mode 100644 index 76dd144..0000000 --- a/vosonSML/R/TrimAt.R +++ /dev/null @@ -1,6 +0,0 @@ -TrimAt <- -function(x) { - # remove @ from text - - sub('@', '', x) -} diff --git a/vosonSML/R/TrimHashtags.R b/vosonSML/R/TrimHashtags.R deleted file mode 100644 index 469f57c..0000000 --- a/vosonSML/R/TrimHashtags.R +++ /dev/null @@ -1,6 +0,0 @@ -TrimHashtags <- -function(x) { - # remove hashtags, i.e. "#tag", in a tweet - - str_replace_all(x, '(#[[:alnum:]_]*)', '') -} diff --git a/vosonSML/R/TrimHead.R b/vosonSML/R/TrimHead.R deleted file mode 100644 index f4d0477..0000000 --- a/vosonSML/R/TrimHead.R +++ /dev/null @@ -1,6 +0,0 @@ -TrimHead <- -function(x) { - # remove starting @, .@, RT @, MT @, etc. 
- - sub('^(.*)?@', '', x) -} diff --git a/vosonSML/R/TrimOddChar.R b/vosonSML/R/TrimOddChar.R deleted file mode 100644 index d7ef9ef..0000000 --- a/vosonSML/R/TrimOddChar.R +++ /dev/null @@ -1,11 +0,0 @@ -TrimOddChar <- -function(x) { - # remove odd charactors - iconv(x, to = 'utf-8') -} - -TrimOddCharMac <- -function(x) { - # remove odd charactors - iconv(x, to = 'utf-8-mac') -} diff --git a/vosonSML/R/TrimUrls.R b/vosonSML/R/TrimUrls.R deleted file mode 100644 index d84eea0..0000000 --- a/vosonSML/R/TrimUrls.R +++ /dev/null @@ -1,6 +0,0 @@ -TrimUrls <- -function(x) { - # remove urls in a tweet - - str_replace_all(x, 'http[^[:blank:]]+', '') -} diff --git a/vosonSML/R/TrimUsers.R b/vosonSML/R/TrimUsers.R deleted file mode 100644 index 801203f..0000000 --- a/vosonSML/R/TrimUsers.R +++ /dev/null @@ -1,6 +0,0 @@ -TrimUsers <- -function(x) { - # remove users, i.e. "@user", in a tweet - - str_replace_all(x, '(@[[:alnum:]_]*)', '') -} diff --git a/vosonSML/R/utils.R b/vosonSML/R/Utils.R similarity index 84% rename from vosonSML/R/utils.R rename to vosonSML/R/Utils.R index 7e8e46b..1cd862a 100644 --- a/vosonSML/R/utils.R +++ b/vosonSML/R/Utils.R @@ -28,8 +28,7 @@ systemTimeFilename <- function(name_suffix, name_ext, current_time = NULL, clean # write data to file as type writeOutputFile <- function(data, type, name, msg = TRUE) { - # package <- environmentName(environment(writeOutputFile)) - + if (missing(type)) { type <- "rds" } @@ -70,3 +69,18 @@ writeOutputFile <- function(data, type, name, msg = TRUE) { return(NULL) }) } + +# installs and loads a package if necessary +EnsurePackage <- function(x) { + x <- as.character(x) + if (!require(x, character.only = TRUE)) { + install.packages(pkgs = x, repos = "http://cran.r-project.org") + require(x, character.only = TRUE) + } +} + +quiet <-function(x) { + sink(tempfile()) + on.exit(sink()) + invisible(force(x)) +} diff --git a/vosonSML/R/UtilsTwitter.R b/vosonSML/R/UtilsTwitter.R new file mode 100644 index 0000000..3f91395 --- /dev/null +++ b/vosonSML/R/UtilsTwitter.R @@ -0,0 +1,180 @@ +# remove odd characters in tweets +RemoveOddChars <- function(df) { + df$text <- sapply(df$text, function(x) TrimOddChar(x)) + + return(df) +} + +# for each tweet, extract information related to users +# such as to_user, rt_user etc. +ExtractUserInfo <- function(df) { + + # extract to_user + df$reply_to <- sapply(df$text, function(tweet) TrimHead(str_extract(tweet,"^((\\.)?(@[[:alnum:]_+]*))"))) + + # extract any mentions at all (inc. replies, mentions, etc) + # this is a completely new approach - it 'vacuums' up ANY mentions + df$users_mentioned <- sapply(df$text, function(tweet) TrimHead(str_match_all(tweet,"@[[:alnum:]_+]*")[[1]])) + + # extract rt_user + df$retweet_from <- sapply(df$text, function(tweet) TrimHead(str_extract(tweet,"^[RM]T (@[[:alnum:]_+]*)"))) + + return(df) +} + +# for each tweet, extract any hashtags that a user has used +ExtractHashtagInfo <- function(df) { + + df$hashtags_used <- sapply(df$text, function(tweet) regmatches(tweet, gregexpr("#[^#\\s]+(?!\u2026)\\b", + tweet, perl = T))) + + # old way: + # TrimHead(str_match_all(tweet,"#[[:alnum:]_+]*")[[1]]) + + # new way: + # this matches hashtags, but not if the hashtag is "cut off" at the end of the tweet text, denoted by a 'trailing + # ellipsis' character. this avoids the problem of picking up erroneous hashtags that are cut off, e.g. "#ausp..." 
+ # when it should be "#auspol" + + # horizontalEllipsis <- "\u2026" + # horizontalEllipsisFixed <- stri_unescape_unicode(horizontalEllipsis) + + # patternRegex <- paste0("#[^#\\s]+(?!\\\u2026)\\b") + # TrimHead(str_match_all(tweet,paste0("#[[:alnum:]_+^",horizontalEllipsis,"$]*"))[[1]]) + + return(df) +} + +# for each tweet, extract url, remove it from the tweet, and put them separately in a new column +# todo: cannot deal with multiple urls in one tweet right now +ExtractUrls <- function(df) { + # EnsurePackage("stringr") + # EnsurePackage("grid") + + # extracts links (quick and dirty) + # wish to have something like http://daringfireball.net/2009/11/liberal_regex_for_matching_urls + df$links <- sapply(df$text,function(tweet) str_extract(tweet,("http[^[:blank:]]+"))) + df$text <- sapply(df$text, function(x) TrimUrls(x)) + + return(df) +} + +# remove odd characters in the user information attributes +# odd characters are especially problematic for search queries that trawl non-english speaking users/collectives +RemoveOddCharsUserInfo <- function(actorsInfoDF) { + + actorsInfoDF$screenName <- sapply(actorsInfoDF$screenName, function(x) TrimOddChar(x)) + actorsInfoDF$description <- sapply(actorsInfoDF$description, function(x) TrimOddChar(x)) + actorsInfoDF$url <- sapply(actorsInfoDF$url, function(x) TrimOddChar(x)) + actorsInfoDF$name <- sapply(actorsInfoDF$name, function(x) TrimOddChar(x)) + actorsInfoDF$location <- sapply(actorsInfoDF$location, function(x) TrimOddChar(x)) + actorsInfoDF$lang <- sapply(actorsInfoDF$lang, function(x) TrimOddChar(x)) + actorsInfoDF$profileImageUrl <- sapply(actorsInfoDF$profileImageUrl, function(x) TrimOddChar(x)) + + return(actorsInfoDF) +} + +# trim functions + +# remove users, i.e. "@user", in a tweet +TrimUsers <- function(x) { + str_replace_all(x, '(@[[:alnum:]_]*)', '') +} + +# remove urls in a tweet +TrimUrls <- function(x) { + str_replace_all(x, 'http[^[:blank:]]+', '') +} + +# remove odd charactors +TrimOddChar <- function(x) { + iconv(x, to = 'utf-8') +} + +# remove odd charactors +TrimOddCharMac <- function(x) { + iconv(x, to = 'utf-8-mac') +} + +# remove starting @, .@, RT @, MT @, etc. +TrimHead <- function(x) { + sub('^(.*)?@', '', x) +} + +# remove hashtags, i.e. 
"#tag", in a tweet +TrimHashtags <- function(x) { + str_replace_all(x, '(#[[:alnum:]_]*)', '') +} + +# remove @ from text +TrimAt <- function(x) { + sub('@', '', x) +} + +PreprocessTweets <- function(df) { + # Perform a few preprocessing tasks + + # removing odd characters + df.new <- RemoveOddChars(df) + # extract user info and add to df + df.new <- ExtractUserInfo(df.new) + # extract urls and add to df + df.new <- ExtractUrls(df.new) + + return(df.new) +} + +# accepts a df to add or increment a field value with count +networkStats <- function(df, field, count, edge, print) { + if (missing(print)) { + print <- FALSE + } + + if (missing(edge)) { + edge <- FALSE + } + + if (print == TRUE) { + if (!is.null(df) & nrow(df) > 0) { + lf <- lc <- 0 + for (i in 1:nrow(df)) { + lf <- ifelse(nchar(df$field[i]) > lf, nchar(df$field[i]), lf) + lc <- ifelse(nchar(df$count[i]) > lc, nchar(df$count[i]), lc) + } + + for (i in 1:nrow(df)) { + lfm <- lf + if (nchar(df$field[i]) != lf) { + lfm <-lf + 1 + } + line <- paste0(df$field[i], paste0(replicate(lfm - nchar(df$field[i]), ""), collapse = " "), " | ") + line <- paste0(line, df$count[i], paste0(replicate(lc - nchar(df$count[i]), ""), collapse = " "), "\n") + cat(line) + } + } + + return(TRUE) + } + + if (is.null(df)) { + df <- data.frame("field" = character(0), "count" = character(0), "edge_count" = character(0), + stringsAsFactors = FALSE) + } + df <- rbind(df, list(field = field, count = count, edge_count = edge), stringsAsFactors = FALSE) + + return(df) +} + +printTwitterRateLimit <- function(token) { + rtlimit <- rtweet::rate_limit(token, "search/tweets") + remaining <- rtlimit[["remaining"]] * 100 + reset <- rtlimit[["reset"]] + reset <- as.numeric(reset, "secs") + cat(paste0("remaining search num / 15 mins: ", remaining, "\n")) + cat(paste0("reset: ", reset, " secs\n")) +} + +getRemainingSearchNum <- function(token) { + rtlimit <- rtweet::rate_limit(token, "search/tweets") + remaining <- rtlimit[["remaining"]] * 100 +} \ No newline at end of file diff --git a/vosonSML/R/UtilsYoutube.R b/vosonSML/R/UtilsYoutube.R new file mode 100644 index 0000000..faf16b2 --- /dev/null +++ b/vosonSML/R/UtilsYoutube.R @@ -0,0 +1,84 @@ +SearchCommentsForMentions <- function(commentsTextCleaned, usernamesCleaned) { + + ptm <- proc.time() + + matchTemp <- lapply(commentsTextCleaned, function(x) { + tempResult <- lapply(usernamesCleaned, function(y) { + foo <- grep(paste("(\\+|\\@)", y, sep=""), x) + + if(length(foo) > 0) { + return(y) + } else { + return("FALSE") + } + }) + }) + + matchTemp <- unlist(matchTemp) + # matchTemp <- as.vector(matchTemp) + # matchTemp <- iconv(matchTemp, to = 'UTF-8') + + # have to split `matchTemp` into as many groups as there are rows (i.e. comment texts) + matchTemp2 <- split(matchTemp, ceiling(seq_along(matchTemp) / length(commentsTextCleaned))) + + # Now we want to retrieve the username with MAX CHARACTERS that was mentioned, or if all values were "FALSE" then + # just return a single "FALSE" value. + # THE REASON IS: + # If we have the following comment text: "+Timothy some text", and there are two users in the data, namely "Tim" + # and "Timothy", the `grep` will have matched both of these in the comment text. + # So, we want to ensure it takes the username with more characters (i.e. "Timothy"), rather than the subset + # match (i.e. "Tim"). 
+ matchTemp3 <- tryCatch({ + lapply(matchTemp2, function(x) { + + # if length of element is 0 then return FALSE + # if (length(x)==0) { + # return("FALSE") + # } + + # if all elements == "FALSE" then just return "FALSE" + if (length(x[which(x=="FALSE")]) == length(x)) { + # cat("\nAll elements of list slice are FALSE\n") + return("FALSE") + } + + # if all elements except one == "FALSE" then return the 'non false' element + # e.g. c("FALSE", "FALSE", "Timothy", "FALSE") --> returns "Timothy" + if (length(x[which(x != "FALSE")]) == 1) { + # cat("\nFound 1 non-false ELEMENT:\n") + # cat(paste0(x[which(x!="FALSE")],"\n")) + return(x[which(x != "FALSE")]) + + } else { + tempResult <- x[which(x != "FALSE")] + # if two duplicate results (e.g. "Timothy" and "Timothy"), then just return the 1st + tempResult <- x[which(nchar(x) == max(nchar(x)))][1] + + # cat("\nTwo or more results found:\n") + # cat(x[which(x != "FALSE")]) + # cat("\n") + return(tempResult) + # return(max(nchar(x))) + } + }) + }, error = function(err) { + # error handler picks up where error was generated + print(paste("\nI caught an error (are there mentions/replies between users in the comments for your video(s)? :\n", + err)) + return(matchTemp2) # if it catches an error, we just return the original object + }) # end tryCatch + + # debugResultDF <- data.frame(commentsTextCleaned, usernamesCleaned, unlist(matchTemp3)) # DEBUG + finalMatchesTemp <- as.vector(unlist(matchTemp3)) + + # convert back (or 'de-regex') the username characters + finalMatches <- gsub("\\\\","",finalMatchesTemp) + + # functionRunTime <- proc.time() - ptm # DEBUG + # print("Runtime of FindMentions function was:") # DEBUG + # flush.console() # DEBUG + # print(functionRunTime) # DEBUG + # flush.console() # DEBUG + + return (finalMatches) +} diff --git a/vosonSML/R/authenticateWithYoutubeAPI.R b/vosonSML/R/authenticateWithYoutubeAPI.R deleted file mode 100644 index af818f1..0000000 --- a/vosonSML/R/authenticateWithYoutubeAPI.R +++ /dev/null @@ -1,18 +0,0 @@ -#' YouTube API Authentication -#' -#' OAuth based authentication with the Google API. -#' -#' In order to collect data from YouTube, the user must first authenticate with Google's Application Programming -#' Interface (API). Users can obtain a Google Developer API key at: https://console.developers.google.com. -#' -#' @param apiKeyYoutube character string specifying your Google Developer API key. -#' -#' @return This is called for its side effect. -#' -#' @note In the future this function will enable users to save the API key in working directory, and the function will -#' automatically look for a locally stored key whenever it is called without apiKeyYoutube argument. 
-#' -#' @noRd -authenticateWithYoutubeAPI <- function(apiKeyYoutube) { - return(apiKeyYoutube) -} diff --git a/vosonSML/R/quiet.R b/vosonSML/R/quiet.R deleted file mode 100644 index 5c7ddec..0000000 --- a/vosonSML/R/quiet.R +++ /dev/null @@ -1,6 +0,0 @@ -quiet <- -function(x) { - sink(tempfile()) - on.exit(sink()) - invisible(force(x)) -} diff --git a/vosonSML/R/searchCommentsForMentions.R b/vosonSML/R/searchCommentsForMentions.R deleted file mode 100644 index 782bace..0000000 --- a/vosonSML/R/searchCommentsForMentions.R +++ /dev/null @@ -1,97 +0,0 @@ -searchCommentsForMentions <- -function (commentsTextCleaned,usernamesCleaned) { - - ptm <- proc.time() # Start the timer # DEBUG - - matchTemp <- lapply(commentsTextCleaned, function(x) { - - tempResult <- lapply(usernamesCleaned, function(y) { - - foo <- grep(paste("(\\+|\\@)", y, sep=""),x) - - if(length(foo)>0){ - return(y) - } - else { - return("FALSE") - } - - } - ) - } - ) - - matchTemp <- unlist(matchTemp) - # matchTemp <- as.vector(matchTemp) - # matchTemp <- iconv(matchTemp, to = 'UTF-8') - - # have to split `matchTemp` into as many groups as there are rows (i.e. comment texts) - matchTemp2 <- split(matchTemp, ceiling(seq_along(matchTemp)/length(commentsTextCleaned))) - - # Now we want to retrieve the username with MAX CHARACTERS that was mentioned, - # or if all values were "FALSE" then just return a single "FALSE" value. - # THE REASON IS: - # If we have the following comment text: "+Timothy some text", - # and there are two users in the data, namely "Tim" and "Timothy", - # the `grep` will have matched both of these in the comment text. - # So, we want to ensure it takes the username with more characters (i.e. "Timothy"), - # rather than the subset match (i.e. "Tim"). - - matchTemp3 <- tryCatch({ - - lapply(matchTemp2, function(x) { - - # if length of element is 0 then return FALSE - # if (length(x)==0) { - # return("FALSE") - # } - - # if all elements == "FALSE" then just return "FALSE" - if (length(x[which(x=="FALSE")])==length(x)) { - # cat("\nAll elements of list slice are FALSE\n") # DEBUG - return("FALSE") - } - - # if all elements except one == "FALSE" then return the 'non false' element - # e.g. c("FALSE", "FALSE", "Timothy", "FALSE") ---> returns "Timothy" - if (length(x[which(x!="FALSE")])==1){ - # cat("\nFound 1 non-false ELEMENT:\n") # DEBUG - # cat(paste0(x[which(x!="FALSE")],"\n")) # DEBUG - return(x[which(x!="FALSE")]) - } - - else { - tempResult <- x[which(x!="FALSE")] - tempResult <- x[which(nchar(x)==max(nchar(x)))][1] # if two duplicate results (e.g. "Timothy" and "Timothy"), then just return the 1st - # cat("\nTwo or more results found:\n") # DEBUG - # cat("\nTwo or more results found:\n") # DEBUG - # cat(x[which(x!="FALSE")]) - # cat("\n") - return(tempResult) - # return(max(nchar(x))) #DEBUG - } - }) - - }, error = function(err) { - - # error handler picks up where error was generated - print(paste("\nI caught an error (are there mentions/replies between users in the comments for your video(s)? 
:\n",err)) - return(matchTemp2) # if it catches an error, we just return the original object - - }) # END tryCatch - - # debugResultDF <- data.frame(commentsTextCleaned,usernamesCleaned,unlist(matchTemp3)) #DEBUG - finalMatchesTemp <- as.vector(unlist(matchTemp3)) - - # convert back (or 'de-regex') the username characters - finalMatches <- gsub("\\\\","",finalMatchesTemp) - - #functionRunTime <- proc.time() - ptm # DEBUG - #print("Runtime of FindMentions function was:") # DEBUG - #flush.console() # DEBUG - #print(functionRunTime) # DEBUG - #flush.console() # DEBUG - - return (finalMatches) - -} diff --git a/vosonSML/R/vosonSML-package.R b/vosonSML/R/vosonSML-package.R index 0fd95d3..12f175d 100644 --- a/vosonSML/R/vosonSML-package.R +++ b/vosonSML/R/vosonSML-package.R @@ -1,20 +1,20 @@ #' Collection and network analysis of social media data #' -#' The goal of the vosonSML package is to provide a suite of easy-to-use tools for collecting data from social media -#' sources (Instagram, Facebook, Twitter, Youtube, and Reddit) and generating different types of networks suited to -#' Social Network Analysis (SNA) and text analytics. It offers tools to create unimodal, multimodal, semantic, and -#' dynamic networks. It draws on excellent packages such as \pkg{twitteR}, \pkg{instaR}, \pkg{Rfacebook}, -#' \pkg{RedditExtractoR} and \pkg{igraph} in order to provide an integrated 'work flow' for collecting different types -#' of social media data and creating different types of networks out of these data. Creating networks from social media +#' The goal of the vosonSML package is to provide a suite of easy-to-use tools for collecting data from social media +#' sources (Instagram, Facebook, Twitter, Youtube, and Reddit) and generating different types of networks suited to +#' Social Network Analysis (SNA) and text analytics. It offers tools to create unimodal, multimodal, semantic, and +#' dynamic networks. It draws on excellent packages such as \pkg{rtweet}, \pkg{instaR}, \pkg{Rfacebook}, +#' \pkg{RedditExtractoR} and \pkg{igraph} in order to provide an integrated 'work flow' for collecting different types +#' of social media data and creating different types of networks out of these data. Creating networks from social media #' data is often non-trivial and time consuming. This package simplifies such tasks so users can focus on analysis. #' -#' vosonSML uses a straightforward S3 class system. Data collected with this package produces \code{data.table} objects -#' (extension of class \code{data.frame}), which are assigned the class \code{dataSource}. Additionally, -#' \code{dataSource} objects are assigned a class identifying the source of data, e.g. \code{facebook} or \code{youtube} -#' . In this way, \code{dataSource} objects are fast, easy to work with, and can be used as input to easily construct -#' different types of networks. For example, the function \code{\link{Collect}} can be used to collect Twitter data, -#' which is then 'piped' to the \code{\link{Create}} function, resulting in a network (an igraph object) that is ready -#' for analysis. +#' vosonSML uses a straightforward S3 class system. Data collected with this package produces \code{data.table} objects +#' (extension of class \code{data.frame}), which are assigned the class \code{dataSource}. Additionally, +#' \code{dataSource} objects are assigned a class identifying the source of data, e.g. \code{facebook} or +#' \code{youtube}. 
In this way, \code{dataSource} objects are fast, easy to work with, and can be used as input to +#' easily construct different types of networks. For example, the function \code{Collect} can be used to collect +#' Twitter data, which is then 'piped' to the \code{Create} function, resulting in a network (an igraph object) +#' that is ready for analysis. #' #' @name vosonSML-package #' @aliases vosonSML-package vosonSML @@ -29,17 +29,18 @@ #' @import methods #' @import httr #' @importFrom Hmisc escapeRegex -#' @importFrom igraph delete.vertices graph.data.frame simplify write.graph V 'V<-' set.graph.attribute +#' @importFrom igraph delete.vertices graph.data.frame simplify write.graph V 'V<-' set.graph.attribute vcount #' graph_from_data_frame delete_vertex_attr set_graph_attr #' @importFrom Rfacebook fbOAuth getPost getPage getUsers #' @importFrom instaR getComments getLikes instaOAuth searchInstagram getUser getFollowers getFollows #' @importFrom plyr ldply -#' @importFrom twitteR lookupUsers searchTwitter setup_twitter_oauth twListToDF +#' @importFrom rtweet create_token rate_limit search_tweets users_data lookup_users #' @importFrom stringr str_extract str_replace_all str_match_all #' @importFrom stats 'na.omit' #' @importFrom utils "flush.console" head "install.packages" "read.table" "write.csv" "read.csv" #' @importFrom RedditExtractoR reddit_content user_network -#' @importFrom magrittr '%>%' +#' @importFrom magrittr '%>%' '%<>%' #' @importFrom dplyr rename group_by summarise ungroup left_join select mutate filter coalesce row_number +#' distinct anti_join mutate_all mutate_at ends_with vars funs #' @importFrom rlang '.data' NULL diff --git a/vosonSML/man/Authenticate.Rd b/vosonSML/man/Authenticate.Rd index 7affa01..536e0c7 100644 --- a/vosonSML/man/Authenticate.Rd +++ b/vosonSML/man/Authenticate.Rd @@ -2,67 +2,71 @@ % Please edit documentation in R/Authenticate.R \name{Authenticate} \alias{Authenticate} -\title{Create credential to access social media APIs} +\title{Create a credential to access social media APIs} \usage{ Authenticate(socialmedia, ...) } \arguments{ -\item{socialmedia}{character string, social media API to authenticate, -currently supports "facebook", "youtube", "twitter", "instagram" and "reddit"} +\item{socialmedia}{Character string. Identifier for social media API to authenticate.\cr +Supports: \code{"twitter"}, \code{"youtube"}, \code{"reddit"}, \code{"instagram"} and \code{"facebook"}.} -\item{...}{additional parameters for authentication -\code{facebook}: appID, appSecret -\code{youtube}: apiKey -\code{twitter}: apiKey, apiSecret, accessToken, accessTokenSecret -\code{instagram}: appID, appSecret -\code{reddit}: appName, appKey, appSecret, useTokenCache} +\item{...}{Additional parameters for authentication appropriate to \code{socialmedia} identifier. +\describe{ + \item{twitter:}{\code{[appName], apiKey, apiSecret, accessToken, + accessTokenSecret, [useCachedToken]}} + \item{youtube:}{\code{apiKey}} + \item{reddit:}{\code{[appName], appKey, appSecret, [useCachedToken]}} + \item{instagram:}{\code{appID, appSecret, [useCachedToken]}} + \item{facebook:}{\code{appID, appSecret, [extendedPermissions, useCachedToken]}} +}} } \value{ -credential object with authentication information +A \code{credential} object with authentication information. } \description{ -\code{Authenticate} creates a \code{credential} object that enables R to -make authenticated calls to social media APIs. 
A \code{credential} object -is a S3 object with the authentication-related information such as access -tokens and the information on the social media that grant authentication. -\code{Authenticate} is the first step of the \code{Authenticate}, -\code{Collect}, \code{Create} workflow. +\code{Authenticate} creates a \code{credential} object that enables R to make authenticated calls to social media +APIs. A \code{credential} object is a S3 object with the authentication-related information such as access tokens +and the information on the social media that grant authentication. \code{Authenticate} is the first step of the +\code{Authenticate}, \code{\link{Collect}} and \code{\link{Create}} workflow. } \note{ -Currently, \code{Authenticate} with socialmedia = "twitter" generates -oauth information to be used in the current active session only (i.e. -"side-effect") and no authentication-related information will be stored in -the returned \code{credential} object. +Currently, \code{Authenticate} with \code{socialmedia = "twitter"} generates OAuth information to be used in +the current active session only (i.e. "side-effect") and no authentication-related information will be stored in the +returned \code{credential} object. + +For other social network API's it's useful to cache the credential to a file and then re-use it in future sessions. +Refer to \code{\link{SaveCredential}} and \code{\link{LoadCredential}} to do this. } \examples{ - \dontrun{ require(magrittr) -## Instagram ego network example -myAppID <- "123456789098765" -myAppSecret <- "abc123abc123abc123abc123abc123ab" -myUsernames <- c("senjohnmccain","obama") -Authenticate("instagram", -appID = myAappId, -appSecret = myAppSecret) \%>\% Collect(ego = TRUE, -username = myUsernames) \%>\% Create +## youtube actor network example + +myYoutubeAPIKey <- "xxxxxxxxxxxxxxxxxxxxxx" +listYoutubeVideoIDs <- c("W2GZFeYGU3s", "mL27TAJGlWc") + +myActorNetwork <- Authenticate("youtube", apiKey = myYoutubeAPIKey) \%>\% + Collect(videoIDs = listYoutubeVideoIDs) \%>\% Create("actor") -## YouTube actor network example -my_apiKeyYoutube <- "314159265358979qwerty" -videoIDs <- c("W2GZFeYGU3s","mL27TAJGlWc") +## instagram ego network example -Authenticate("youtube", -apiKey = my_apiKeyYoutube) \%>\% Collect(videoIDs = videoIDs) \%>\% Create('actor') +myInstaAppID <- "xxxxxxxxxxx" +myInstaAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +listInstaUsernames <- c("senjohnmccain", "obama") + +myEgoNetwork <- Authenticate("instagram", appID = myInstaAppID, appSecret = myInstaAppSecret) \%>\% + Collect(ego = TRUE, username = listInstaUsernames) \%>\% Create("ego") } + } \seealso{ -\code{\link{AuthenticateWithFacebookAPI}}, -\code{\link{AuthenticateWithInstagramAPI}}, -\code{\link{AuthenticateWithTwitterAPI}}, -\code{\link{SaveCredential}}, -\code{\link{LoadCredential}} -} -\author{ -Chung-hong Chan +\code{\link{SaveCredential}}, \code{\link{Collect}}, \code{\link{Create}} } +\keyword{authenticate} +\keyword{credential} +\keyword{facebook} +\keyword{instagram} +\keyword{reddit} +\keyword{twitter} +\keyword{youtube} diff --git a/vosonSML/man/AuthenticateWithRedditAPI.Rd b/vosonSML/man/AuthenticateWithRedditAPI.Rd index db949a2..30a1eac 100644 --- a/vosonSML/man/AuthenticateWithRedditAPI.Rd +++ b/vosonSML/man/AuthenticateWithRedditAPI.Rd @@ -4,7 +4,7 @@ \alias{AuthenticateWithRedditAPI} \title{Reddit API authentication.} \usage{ -AuthenticateWithRedditAPI(appName, appKey, appSecret, useTokenCache) +AuthenticateWithRedditAPI(appName, appKey, appSecret, useCachedToken) } 
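For illustration, a minimal sketch of how this reddit authentication might be driven through the \code{Authenticate} wrapper and cached for later sessions, following the pattern of the package's other examples (the app name, keys and file name are placeholders):

require(magrittr)
myRedditAuth <- Authenticate("reddit", appName = "vosonSMLApp",
                             appKey = "xxxxxxxxxxxxxxxxxxxxxx",
                             appSecret = "xxxxxxxxxxxxxxxxxxxxxx") %>%
  SaveCredential("RedditCredential.RDS")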
\arguments{ \item{appName}{character string containing the reddit app name associated with the API key.} @@ -13,7 +13,7 @@ AuthenticateWithRedditAPI(appName, appKey, appSecret, useTokenCache) \item{appSecret}{character string containing the app secret.} -\item{useTokenCache}{logical. Use cached authentication token if found.} +\item{useCachedToken}{logical. Use cached authentication token if found.} } \value{ a reddit authentication token diff --git a/vosonSML/man/AuthenticateWithTwitterAPI.Rd b/vosonSML/man/AuthenticateWithTwitterAPI.Rd index 478f599..9673c1f 100644 --- a/vosonSML/man/AuthenticateWithTwitterAPI.Rd +++ b/vosonSML/man/AuthenticateWithTwitterAPI.Rd @@ -2,66 +2,39 @@ % Please edit documentation in R/AuthenticateWithTwitterAPI.R \name{AuthenticateWithTwitterAPI} \alias{AuthenticateWithTwitterAPI} -\title{Note: this function is DEPRECATED and will be removed in a future release. -Please use the \code{Authenticate} function} +\title{Note: this function is DEPRECATED. Please use the \code{\link{Authenticate}} function.} \usage{ -AuthenticateWithTwitterAPI(api_key, api_secret, access_token, - access_token_secret, createToken) +AuthenticateWithTwitterAPI(appName, apiKey, apiSecret, accessToken, + accessTokenSecret, useCachedToken) } \arguments{ -\item{api_key}{character string specifying the 'API key' used for -authentication.} +\item{appName}{Character string. Specifies the twitter registered app name associated with API keys.} -\item{api_secret}{character string specifying the 'API secret' used for -authentication.} +\item{apiKey}{Character string. Specifies the app 'API key' used for authentication.} -\item{access_token}{character string specifying the 'access token' used for -authentication.} +\item{apiSecret}{Character string. Specifies the app 'API secret'.} -\item{access_token_secret}{character string specifying the 'access token -secret' used for authentication.} +\item{accessToken}{Character string. Specifies the app 'access token'.} -\item{createToken}{logical. !! NOT PROPERLY IMPLEMENTED YET.} +\item{accessTokenSecret}{Character string. Specifies the app 'access token secret'.} + +\item{useCachedToken}{Logical. If \code{TRUE} uses cached API token if found otherwise creates one.} } \value{ -This is called for its side effect. +twitter_oauth. Returns a twitter oauth token object. } \description{ -Twitter API Authentication +Twitter API authentication } \details{ -Oauth based authentication with the Twitter API - -In order to collect data from Twitter, the user must first authenticate with -Twitter's Application Programming Interface (API). +Oauth based authentication using the Twitter API. -This requires setting up an App on Twitter. An excellent guide to achieving -this can be found at: -http://thinktostart.com/twitter-authentification-with-r/ -} -\examples{ - -\dontrun{ - # Firstly specify your API credentials - my_api_key <- "1234567890qwerty" - my_api_secret <- "1234567890qwerty" - my_access_token <- "1234567890qwerty" - my_access_token_secret <- "1234567890qwerty" - - AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, - access_token=my_access_token, access_token_secret=my_access_token_secret) -} +In order to collect data from Twitter, the user must first authenticate with Twitter's API. This requires setting up +an app on Twitter. 
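For illustration, once an app has been registered, the updated authentication call via the \code{Authenticate} wrapper might look like the following sketch (the app name and all key values are placeholders):

myTwitterAuth <- Authenticate("twitter", appName = "vosonSMLApp",
                              apiKey = "xxxxxxxxxxxxxxxxxxxxxx",
                              apiSecret = "xxxxxxxxxxxxxxxxxxxxxx",
                              accessToken = "xxxxxxxxxxxxxxxxxxxxxx",
                              accessTokenSecret = "xxxxxxxxxxxxxxxxxxxxxx",
                              useCachedToken = TRUE)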
A useful guide to creating an app can be found in the rtweet documentation: +https://rtweet.info/articles/auth.html#creating-a-twitter-app } \seealso{ -\code{AuthenticateWithFacebookAPI} and -\code{AuthenticateWithYouTubeAPI} for other ways to collect social media -data. -} -\author{ -Timothy Graham & Robert Ackland - +\code{\link{Authenticate}} } -\keyword{SNA} -\keyword{media} -\keyword{social} +\keyword{authenticate} \keyword{twitter} diff --git a/vosonSML/man/AuthenticateWithYoutubeAPI.Rd b/vosonSML/man/AuthenticateWithYoutubeAPI.Rd new file mode 100644 index 0000000..ea335a8 --- /dev/null +++ b/vosonSML/man/AuthenticateWithYoutubeAPI.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/AuthenticateWithYoutubeAPI.R +\name{AuthenticateWithYoutubeAPI} +\alias{AuthenticateWithYoutubeAPI} +\title{YouTube API Authentication} +\usage{ +AuthenticateWithYoutubeAPI(apiKey) +} +\arguments{ +\item{apiKey}{character string specifying your Google Developer API key.} +} +\value{ +This is called for its side effect. +} +\description{ +OAuth based authentication with the Google API. +} +\details{ +In order to collect data from YouTube, the user must first authenticate with Google's Application Programming +Interface (API). Users can obtain a Google Developer API key at: https://console.developers.google.com. +} +\note{ +In the future this function will enable users to save the API key in working directory, and the function will +automatically look for a locally stored key whenever it is called without apiKeyYoutube argument. +} diff --git a/vosonSML/man/Collect.Rd b/vosonSML/man/Collect.Rd index dbc207e..094581c 100644 --- a/vosonSML/man/Collect.Rd +++ b/vosonSML/man/Collect.Rd @@ -7,66 +7,81 @@ Collect(credential, ego = FALSE, ...) } \arguments{ -\item{credential}{\code{credential} object generated from -\code{Authenticate}} +\item{credential}{A \code{credential} object generated from \code{Authenticate}.} -\item{ego}{logical, collecting ego network data. Currently only support -Instagram.} +\item{ego}{Logical. If \code{TRUE} collect ego network data. Currently only supports Instagram.} -\item{...}{additional parameters for data collection (refer to -CollectDataFrom* and CollectEgo* functions) - -\code{facebook}: pageName, rangeFrom, rangeTo, verbose, n, writeToFile, dynamic -\code{youtube}: videoIDs, verbose, writeToFile, maxComments -\code{twitter}: searchTerm, numTweets, verbose, writeToFile, language -\code{instagram}: credential, tag, n, lat, lng, distance, folder, mindate, maxdate, verbose, sleep, writeToFile, -waitForRateLimit -\code{reddit}: threadUrls, waitTime, writeToFile - -\code{instagram} with \code{ego} = TRUE: username, userid, verbose, -degreeEgoNet, waitForRateLimit, getFollows} +\item{...}{Additional parameters for data collection by appropriate to credential \code{socialmedia} type. +Refer to CollectDataFrom* and CollectEgo* functions for more details. 
+\describe{ + \item{twitter:}{\code{authToken, searchTerm, [searchType, numTweets, includeRetweets, retryOnRateLimit,}\cr + \code{writeToFile, verbose, ...]}} + \item{youtube:}{\code{videoIDs, apiKeyYoutube, [verbose, writeToFile, maxComments]}} + \item{reddit:}{\code{threadUrls, [waitTime, writeToFile]}} + \item{instagram:}{\code{tag, n, lat, lng, [distance, folder, mindate, maxdate, verbose, sleep,}\cr + \code{writeToFile, waitForRateLimit, credential]}} + \item{instagram with \code{ego = TRUE}:}{\code{username, userid, [verbose, degreeEgoNet,}\cr + \code{waitForRateLimit, getFollows, credential]}} + \item{facebook:}{\code{pageName, [rangeFrom, rangeTo, verbose, n, writeToFile, dynamic]}} +}} } \value{ -A data.frame object of class \code{dataSource.*} that can be used -with \code{Create}. +A data.frame object of class \code{dataSource.*} that can be used with \code{Create}. } \description{ -This function collects data from social media APIs, and structures the data -into a data frame of class \code{dataSource.*}, ready for creating networks -for further analysis. \code{Collect} is the second step of the -\code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is -a convenient UI wrapper to the core CollectDataFrom* family of functions. +This function collects data from social media APIs, and structures the data into a data frame of class +\code{dataSource.*}, ready for creating networks for further analysis. \code{Collect} is the second step of the +\code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is a convenient UI wrapper to the core +CollectDataFrom* family of functions. } \examples{ - \dontrun{ require(magrittr) -## Instagram ego network example -myAppID <- "123456789098765" -myAppSecret <- "abc123abc123abc123abc123abc123ab" -myUsernames <- c("senjohnmccain","obama") -Authenticate("instagram", -appID = myAappId, -appSecret = myAppSecret) \%>\% Collect(ego = TRUE, -username = myUsernames) \%>\% Create +## youtube actor network example + +myYoutubeAPIKey <- "xxxxxxxxxxxxxxxxxxxxxx" +listYoutubeVideoIDs <- c("W2GZFeYGU3s", "mL27TAJGlWc") + +myActorNetwork <- Authenticate("youtube", apiKey = myYoutubeAPIKey) \%>\% + Collect(videoIDs = listYoutubeVideoIDs) \%>\% Create("actor") + +## instagram ego network example -## YouTube actor network example -my_apiKeyYoutube <- "314159265358979qwerty" -videoIDs <- c("W2GZFeYGU3s","mL27TAJGlWc") +myInstaAppID <- "xxxxxxxxxxx" +myInstaAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +listInstaUsernames <- c("senjohnmccain", "obama") -Authenticate("youtube", -apiKey = my_apiKeyYoutube) \%>\% Collect(videoIDs = videoIDs) \%>\% Create('actor') +myEgoNetwork <- Authenticate("instagram", appID = myInstaAppID, appSecret = myInstaAppSecret) \%>\% + Collect(ego = TRUE, username = listInstaUsernames) \%>\% Create("ego") + +## facebook bimodal network example + +myFacebookAppID <- "xxxxxxxxxxx" +myFacebookAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" + +myBimodalNetwork <- Authenticate("Facebook", appID = myFacebookAppID, + appSecret = myFacebookAppSecret) \%>\% + SaveCredential("FBCredential.RDS") \%>\% + Collect(pageName = "StarWars", rangeFrom = "2015-03-01", rangeTo = "2015-03-02", + writeToFile = FALSE) \%>\% + Create("bimodal") + +## facebook dynamic network example + +myDynamicNetwork <- LoadCredential("FBCredential.RDS") \%>\% + Collect(pageName = "StarWars", rangeFrom = "2015-03-01", rangeTo = "2015-03-02", + writeToFile = FALSE) \%>\% + Create("dynamic") } } \seealso{ -\code{CollectDataFacebook}, -\code{CollectDataInstagram}, 
-\code{CollectDataTwitter}, -\code{CollectEgoInstagram}, -\code{CollectDataReddit}, -} -\author{ -Chung-hong Chan +\code{Authenticate}, \code{Create} } +\keyword{collect} +\keyword{facebook} +\keyword{instagram} +\keyword{reddit} +\keyword{twitter} +\keyword{youtube} diff --git a/vosonSML/man/CollectDataReddit.Rd b/vosonSML/man/CollectDataReddit.Rd index 028db5e..37ed7d1 100644 --- a/vosonSML/man/CollectDataReddit.Rd +++ b/vosonSML/man/CollectDataReddit.Rd @@ -4,7 +4,7 @@ \alias{CollectDataReddit} \title{Collect reddit thread data} \usage{ -CollectDataReddit(threadUrls, waitTime = 5, writeToFile) +CollectDataReddit(threadUrls, waitTime = 5, writeToFile = FALSE) } \arguments{ \item{threadUrls}{character string vector. Reddit thread url's to collect data from.} diff --git a/vosonSML/man/CollectDataTwitter.Rd b/vosonSML/man/CollectDataTwitter.Rd index abaf665..8db42c3 100644 --- a/vosonSML/man/CollectDataTwitter.Rd +++ b/vosonSML/man/CollectDataTwitter.Rd @@ -2,140 +2,73 @@ % Please edit documentation in R/CollectDataTwitter.R \name{CollectDataTwitter} \alias{CollectDataTwitter} -\title{Note: this function is DEPRECATED and will be removed in a future release. -Please use the \code{Collect} function} +\title{Note: this function is DEPRECATED. Please use the \code{\link{Collect}} function.} \usage{ -CollectDataTwitter(searchTerm, numTweets, verbose, writeToFile, language, - since, until, locale, geocode, sinceID, maxID, resultType, - retryOnRateLimit) +CollectDataTwitter(authToken = NULL, searchTerm = "", + searchType = "recent", numTweets = 100, includeRetweets = TRUE, + retryOnRateLimit = FALSE, writeToFile = FALSE, verbose = FALSE, + ...) } \arguments{ -\item{searchTerm}{character string, specifying a search term or phrase (e.g. -"Australian politics") or hashtag (e.g. "#auspol"). Many query operators are -available - see the Twitter documentation for more information: -https://dev.twitter.com/rest/public/search} - -\item{numTweets}{numeric integer, specifying how many tweets to be -collected. Defaults to 1500. Maximum tweets for a single call of this -function is 1500.} - -\item{verbose}{logical. If \code{TRUE} then this function will output -runtime information to the console as it computes. Useful diagnostic tool -for long computations. Default is \code{FALSE}.} - -\item{writeToFile}{logical. If \code{TRUE} then the data is saved to file in -current working directory (CSV format), with filename denoting current -system time and \code{searchTerm}. Default is \code{FALSE}.} +\item{authToken}{Twitter oauth token created by rtweet.} -\item{language}{character string, restricting tweets to the given language, -given by an ISO 639-1 code. For example, "en" restricts to English tweets. -Defaults to NULL.} - -\item{since}{If not NULL, restricts tweets to those since the given date. Date is to be formatted -as YYYY-MM-DD (this is a wrapper to the searchTwitter function in the twitteR package).} +\item{searchTerm}{Character string. Specifies a search term or phrase (e.g. "Australian politics") or hashtag (e.g. +"#auspol"). Many query operators are available - see the Twitter documentation for more information: +https://dev.twitter.com/rest/public/search} -\item{until}{If not NULL, restricts tweets to those up until the given date. Date is to be formatted -as YYYY-MM-DD (this is a wrapper to the searchTwitter function in the twitteR package).} +\item{searchType}{Character string. Returns filtered tweets as per search type \code{recent}, \code{mixed} or +\code{popular}. 
Default type is \code{recent}.} -\item{locale}{If not NULL, will set the locale for the search. As of 03/06/11 only ja is effective, -as per the Twitter API (this is a wrapper to the searchTwitter function in the twitteR package).} +\item{numTweets}{Numeric. Specifies how many tweets to be collected. Defaults is \code{100}.} -\item{geocode}{If not NULL, returns tweets by users located within a given radius of the given -latitude/longitude. (this is a wrapper to the searchTwitter function in the twitteR package).} +\item{includeRetweets}{Logical. Specifies if the search should filter out retweets. Defaults is \code{TRUE}.} -\item{sinceID}{If not NULL, returns tweets with IDs greater (ie newer) than the specified ID -(this is a wrapper to the searchTwitter function in the twitteR package).} +\item{retryOnRateLimit}{Logical. Default is \code{FALSE}.} -\item{maxID}{If not NULL, returns tweets with IDs smaller (ie older) than the specified ID -(this is a wrapper to the searchTwitter function in the twitteR package).} +\item{writeToFile}{Logical. If \code{TRUE} then the data is saved to file in current working directory (RDS format), +with filename denoting current system time and \code{searchTerm}. Default is \code{FALSE}.} -\item{resultType}{If not NULL, returns filtered tweets as per value. See details for allowed values. -(this is a wrapper to the searchTwitter function in the twitteR package).} +\item{verbose}{Logical. If \code{TRUE} then this function will output runtime information to the console as it +computes. Useful diagnostic tool for long computations. Default is \code{FALSE}.} -\item{retryOnRateLimit}{If non-zero the search command will block retry up to X times if the rate limit -is experienced. This might lead to a much longer run time but the task will -eventually complete if the retry count is high enough (this is a wrapper to the searchTwitter -function in the twitteR package).} +\item{...}{Additional parameters to pass to the rtweet \code{search_tweets} function.} } \value{ -A data frame object of class \code{dataSource.twitter} that can be -used for creating unimodal networks (\code{CreateActorNetwork}), bimodal -networks (\code{CreateBimodalNetwork}), and semantic networks +A data frame object of class \code{dataSource.twitter} that can be used for creating unimodal networks +(\code{CreateActorNetwork}), bimodal networks (\code{CreateBimodalNetwork}), and semantic networks (\code{CreateSemanticNetwork}). } \description{ Collect data from Twitter for generating different types of networks } \details{ -This function collects data from Twitter based on hashtags or search terms, -and structures the data into a data frame of class -\code{dataSource.twitter}, ready for creating networks for further analysis. +This function collects data from Twitter based on hashtags or search terms, and structures the data into a data +frame of class \code{dataSource.twitter}, ready for creating networks for further analysis. -\code{CollectDataTwitter} collects public 'tweets' from Twitter using the -Twitter API. +\code{CollectDataTwitter} collects public 'tweets' from Twitter using the Twitter API. -The function then finds and maps the relationships of entities of interest -in the data (e.g. 
users, terms, hashtags), and structures these -relationships into a data frame format suitable for creating unimodal -networks (\code{CreateActorNetwork}), bimodal networks -(\code{CreateBimodalNetwork}), and semantic networks +The function then finds and maps the relationships of entities of interest in the data (e.g. users, terms, hashtags) +, and structures these relationships into a data frame format suitable for creating unimodal networks +(\code{CreateActorNetwork}), bimodal networks (\code{CreateBimodalNetwork}), and semantic networks (\code{CreateSemanticNetwork}). -The maximum number of tweets for a single call of \code{CollectDataTwitter} -is 1500. +The maximum number of tweets for a single call of \code{CollectDataTwitter} is 1500. -Language support is available, using the \code{language} argument. The user -can restrict tweets returned to a particular language, using the ISO 639-1 -code. For example, restricting to English would use \code{language="en"}. -The full list of codes is available here: -https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. +Language support is available, using the \code{language} parameter. The user can restrict tweets returned to a +particular language, using the ISO 639-1 code. For example, restricting to English would use \code{language="en"}. +The full list of codes is available here: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes. -A variety of query operators are available through the Twitter API. For -example, "love OR hate" returns any tweets containing either term (or both). -For more information see the Twitter API documentation (under the heading +A variety of query operators are available through the Twitter API. For example, "love OR hate" returns any tweets +containing either term (or both). For more information see the Twitter API documentation (under the heading 'Query Operators'): https://dev.twitter.com/rest/public/search } \note{ -Data generated using this function is *not* suitable for dynamic -networks. Dynamic Twitter networks are not currently implemented in the -vosonSML package. This will be implemented in a future release. -} -\examples{ +Supported network types: \code{actor}, \code{bimodal}, \code{semantic} -\dontrun{ - # Firstly specify your API credentials - my_api_key <- "1234567890qwerty" - my_api_secret <- "1234567890qwerty" - my_access_token <- "1234567890qwerty" - my_access_token_secret <- "1234567890qwerty" - - # Authenticate with the Twitter API using \\code{AuthenticateWithTwitterAPI} - AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, - access_token=my_access_token, access_token_secret=my_access_token_secret) - - # Collect tweets data using \\code{myTwitterData} - myTwitterData <- CollectDataTwitter(searchTerm="#auspol", - numTweets=150,writeToFile=FALSE,verbose=FALSE) - - # Create an 'actor' network using \\code{CreateActorNetwork} - g_actor_twitter <- CreateActorNetwork(myTwitterData) - - # Create a 'bimodal' network using \\code{CreateBimodalNetwork} - g_bimodal_twitter <- CreateBimodalNetwork(myTwitterData) - - # Create a 'semantic' network using \\code{CreateSemanticNetwork} - g_semantic_twitter <- CreateSemanticNetwork(myTwitterData) - } +Data generated using this function is *not* suitable for dynamic networks. } \seealso{ -\code{AuthenticateWithTwitterAPI} must be run first or no data will -be collected. 
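Continuing the authentication sketch above, a sketch of the updated rtweet-based collection call using the parameters documented for this function (search term and settings are illustrative):

# collect recent tweets for a hashtag, excluding retweets
myTwitterData <- Collect(myTwitterAuth, searchTerm = "#auspol", searchType = "recent",
                         numTweets = 150, includeRetweets = FALSE,
                         retryOnRateLimit = FALSE, writeToFile = FALSE, verbose = TRUE)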
-} -\author{ -Timothy Graham & Robert Ackland - +\code{Collect} } -\keyword{SNA} -\keyword{data} -\keyword{mining} +\keyword{collect} \keyword{twitter} diff --git a/vosonSML/man/CollectDataYoutube.Rd b/vosonSML/man/CollectDataYoutube.Rd new file mode 100644 index 0000000..52a296c --- /dev/null +++ b/vosonSML/man/CollectDataYoutube.Rd @@ -0,0 +1,60 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CollectDataYoutube.R +\name{CollectDataYoutube} +\alias{CollectDataYoutube} +\title{Collect YouTube comments data for generating different types of networks} +\usage{ +CollectDataYoutube(apiKey, videoIDs, verbose = FALSE, + writeToFile = FALSE, maxComments = 1e+13) +} +\arguments{ +\item{apiKey}{character string, specifying the Google Developer API Key used for authentication.} + +\item{videoIDs}{character vector, specifying one or more YouTube video IDs. For example, if the video URL is +'https://www.youtube.com/watch?v=W2GZFeYGU3s', then use videoIDs='W2GZFeYGU3s'. For multiple videos, the +function GetYoutubeVideoIDs can be used to create a vector object suitable as input for videoIDs.} + +\item{verbose}{logical. If TRUE then this function will output runtime information to the console as it +computes. Useful diagnostic tool for long computations. Default is FALSE.} + +\item{writeToFile}{logical. If TRUE then the data is saved to file in current working directory (CSV format), +with filename denoting current system time. Default is FALSE.} + +\item{maxComments}{numeric integer, specifying how many 'top-level' comments to collect from each video. This value +*does not* take into account 'reply' comments (i.e. replies to top-level comments), therefore the total number of +comments collected may be higher than maxComments. By default this function attempts to collect all comments.} +} +\value{ +A dataframe object of class dataSource.youtube that can be used for creating unimodal networks +(CreateActorNetwork). +} +\description{ +This function collects YouTube comments data for one or more YouTube videos. It structures the data into a data +frame of class dataSource.youtube, ready for creating networks for further analysis. +} +\details{ +CollectDataYoutube collects public comments from YouTube videos, using the YouTube API. + +The function then finds and maps the relationships of YouTube users who have interacted with each other +(i.e. user i has replied to user j or mentioned user j in a comment) and structures these relationships into a data +frame format suitable for creating unimodal networks (CreateActorNetwork). + +For multiple videos, the user may wish to use the function GetYoutubeVideoIDs, which creates a character +vector of video IDs from a plain text file of YouTube video URLs, which can then be used for the videoIDs +argument of the function CollectDataYoutube. +} +\note{ +Currently supported network types: unimodal 'actor' network; CreateActorNetwork. + +Data generated using this function is *not* suitable for dynamic networks. +Dynamic YouTube comments networks are not currently implemented in the vosonSML package. This will be implemented in +a future release. + +Note on maxComments argument: Due to quirks/specifications of the Google API, it is currently not possible to +specify the exact number of comments to return from the API using maxResults argument (i.e.including comments +that are replies to top-level comments). 
Therefore, the number of comments collected is usually somewhat greater than +maxComments, if a value is specified for this argument. For example, if a video contains 10 top-level +comments, and one of these top-level comments has 5 'child' or reply comments, then the total number of comments +collected will be equal to 15. Currently, the user must 'guesstimate' the maxResults value, to collect a +number of comments in the order of what they require. +} diff --git a/vosonSML/man/Create.Rd b/vosonSML/man/Create.Rd index 6442628..9d39fcb 100644 --- a/vosonSML/man/Create.Rd +++ b/vosonSML/man/Create.Rd @@ -7,49 +7,32 @@ Create(dataSource, type = "actor", ...) } \arguments{ -\item{dataSource}{a data frame of class \code{dataSource}} +\item{dataSource}{Social media data collected using the \code{Collect} method.} -\item{type}{character, type of network to be created, currently supports "actor", "bimodal", "dynamic", "semantic" -and "ego"} +\item{type}{Character string. Type of network to be created, can be \code{actor}, \code{bimodal}, +\code{dynamic}, \code{semantic} or \code{ego}.} -\item{...}{additional parameters for create*Network functions} +\item{...}{Additional parameters for network creation for appropriate \code{social media} and network \code{type}. +Refer to S3 methods \code{social media} type for default parameters.} } \value{ -an igraph graph object +Network data containing an igraph object. } \description{ -This function creates networks from social media data (i.e. from data frames of class \code{dataSource}. -\code{Create} is the final step of the \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function is -a convenient UI wrapper to the core create*Network family of functions. -} -\details{ -Note: when creating Twitter networks, the user information can be collected separately using the -\code{\link{PopulateUserInfo}} function and stored into the network as vertex attributes (this involves additional -calls to the Twitter API). -} -\examples{ -\dontrun{ -require(magrittr) - -## instagram ego network example - -my_app_id <- "123456789098765" -my_app_secret <- "abc123abc123abc123abc123abc123ab" -my_usernames <- c("senjohnmccain", "obama") - -my_ego_network <- Authenticate("instagram", appID = my_app_id, appSecret = my_app_secret) \%>\% - Collect(ego = TRUE, username = my_usernames) \%>\% Create - -## youtube actor network example - -my_api_key <- "314159265358979qwerty" -my_video_ids <- c("W2GZFeYGU3s","mL27TAJGlWc") - -my_actor_network <- Authenticate("youtube", apiKey = my_api_key) \%>\% - Collect(videoIDs = my_video_ids) \%>\% Create('actor') - -} -} -\author{ -Chung-hong Chan -} +This function creates networks from social media data (i.e. collected from dataframes of class \code{social media}). +\code{Create} is the final step of the \code{Authenticate}, \code{Collect}, \code{Create} workflow. This function +is a wrapper for the Create*Network S3 methods. +} +\note{ +When creating twitter networks, a network with additional user information can be generated using the +\code{\link{GraphUserInfoTwitter}} function. Additional calls can be made to the twitter API to get information +about users that were identified as nodes during network creation. 
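As a sketch of that step, continuing from a twitter \code{Collect} result such as \code{myTwitterData} above (argument values are illustrative):

# for twitter data this returns a list containing a relations data frame,
# a users data frame and an igraph object
myActorNetwork <- Create(myTwitterData, type = "actor", writeToFile = FALSE, verbose = TRUE)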
+} +\seealso{ +\code{\link{CreateActorNetwork}}, \code{\link{CreateBimodalNetwork}}, \code{\link{CreateSemanticNetwork}} +} +\keyword{actor} +\keyword{bimodal} +\keyword{create} +\keyword{network} +\keyword{semantic} diff --git a/vosonSML/man/CreateActorNetwork.Rd b/vosonSML/man/CreateActorNetwork.Rd new file mode 100644 index 0000000..2ea8e55 --- /dev/null +++ b/vosonSML/man/CreateActorNetwork.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/CreateActorNetwork.R, +% R/CreateActorNetwork.reddit.R, R/CreateActorNetwork.twitter.R, +% R/CreateActorNetwork.youtube.R +\name{CreateActorNetwork} +\alias{CreateActorNetwork} +\alias{CreateActorNetwork.default} +\alias{CreateActorNetwork.reddit} +\alias{CreateActorNetwork.twitter} +\alias{CreateActorNetwork.youtube} +\title{Create an actor network from social media data} +\usage{ +CreateActorNetwork(x, ...) + +\method{CreateActorNetwork}{default}(x, ...) + +\method{CreateActorNetwork}{reddit}(x, weightEdges = FALSE, + textData = FALSE, cleanText = TRUE, writeToFile = FALSE, ...) + +\method{CreateActorNetwork}{twitter}(x, writeToFile = FALSE, + verbose = FALSE, ...) + +\method{CreateActorNetwork}{youtube}(x, writeToFile = FALSE, ...) +} +\arguments{ +\item{x}{Collected social media data with \code{social media} class attribute.} + +\item{...}{Additional parameters to pass to the network creation method.} + +\item{weightEdges}{Logical. Combines and weights directed network edges. Default is \code{FALSE}.} + +\item{textData}{Logical. If the igraph network should include the comment text as an edge attribute. +Cannot be used with the \code{weightEdges} parameter. Default is \code{FALSE}.} + +\item{cleanText}{Logical. If non-alphanumeric, non-punctuation, and non-space characters should be removed from the +included text attribute data. Only applies if \code{textData = TRUE}. Default is \code{TRUE}.} + +\item{writeToFile}{Logical. Save network data to a file in the current working directory. Default is \code{FALSE}.} + +\item{verbose}{Logical. Output additional information about the network creation. Default is \code{FALSE}.} +} +\value{ +A reddit actor network as igraph object. + +A twitter actor network as list containing a relations dataframe, users dataframe and igraph object. + +A youtube actor network as igraph object. +} +\description{ +This function creates an actor network from social media data collected using the \code{Collect} method. Edges in +the network represent interactions or relationships between the actors. For example, with twitter data an +interaction is defined as a 'mention', reply' or 'retweet' from user i to user j, given 'tweet' m. With youtube +comments, an interaction is defined as a 'reply' from user i to user j, given 'comment' m. The resulting network is +returned as an igraph object. +} +\note{ +For twitter data, actor networks can be created from multiple data frames (i.e. datasets collected individually +using \code{Collect} method. Simply create a list of the data frames that you wish to create a network from. 
+For example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)} +} +\seealso{ +\code{\link{Create}} +} +\keyword{actor} +\keyword{create} +\keyword{reddit} +\keyword{twitter} +\keyword{youtube} diff --git a/vosonSML/man/CreateActorNetwork.reddit.Rd b/vosonSML/man/CreateActorNetwork.reddit.Rd deleted file mode 100644 index 6633d69..0000000 --- a/vosonSML/man/CreateActorNetwork.reddit.Rd +++ /dev/null @@ -1,36 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/CreateActorNetwork.reddit.R -\name{CreateActorNetwork.reddit} -\alias{CreateActorNetwork.reddit} -\title{Creates a reddit actor network from collected threads} -\usage{ -\method{CreateActorNetwork}{reddit}(x, weightEdges, includeTextData, - cleanText, writeToFile) -} -\arguments{ -\item{x}{a dataframe as vosonSML class object containing collected social network data} - -\item{weightEdges}{logical. Combines and weights directed edges. Can't be used with includeTextData.} - -\item{includeTextData}{logical. If the igraph network edges should include the comment text as attribute.} - -\item{cleanText}{logical. If non-alphanumeric, non-punctuation, and non-space characters should be removed from the -included text attribute data. Default is TRUE} - -\item{writeToFile}{logical. If the igraph network graph should be written to file.} -} -\value{ -an igraph object of the actor network -} -\description{ -Uses RedditExtractoR::user_network to create an igraph directed actor network with comment ids as edge attribute. -} -\note{ -Can create three types of network graphs: -* Directed graph with subreddit, thread_ids and comment ids as edge attributes - default option -* Directed graph with weighted edges (without comment ids) - weightEdges = TRUE -* Directed graph with comment text included as edge attribute - includeTextData = TRUE - -Comment ids as edge attributes in graphs refer to the Collect dataframe comment id not reddits comment id -If "Forbidden control character 0x19 found in igraph_i_xml_escape, Invalid value" then set cleanText = TRUE -} diff --git a/vosonSML/man/CreateBimodalNetwork.Rd b/vosonSML/man/CreateBimodalNetwork.Rd index 80c9f15..4fda66e 100644 --- a/vosonSML/man/CreateBimodalNetwork.Rd +++ b/vosonSML/man/CreateBimodalNetwork.Rd @@ -2,108 +2,55 @@ % Please edit documentation in R/CreateBimodalNetwork.R \name{CreateBimodalNetwork} \alias{CreateBimodalNetwork} -\title{Note: this function is DEPRECATED and will be removed in a future release. -Please use the \code{Create} function} +\title{Create bimodal networks from social media data} \usage{ -CreateBimodalNetwork(x, writeToFile, removeTermsOrHashtags) +CreateBimodalNetwork(x, writeToFile, removeTermsOrHashtags, ...) } \arguments{ -\item{x}{a data frame of class \code{dataSource}. For Twitter data, it is -also possible to provide a *list* of data frames (i.e. data frames that -inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter -data frames are supported at this time. If a list of data frames is -provided, then the function binds these row-wise and computes over the -entire data set.} +\item{x}{A data frame of class \code{dataSource}. For Twitter data, it is also possible to provide a *list* of data +frames (i.e. data frames that inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter data +frames are supported at this time. If a list of data frames is provided, then the function binds these row-wise and +computes over the entire data set.} -\item{writeToFile}{logical. 
If \code{TRUE} then the network is saved to file -in current working directory (GRAPHML format), with filename denoting the -current date/time and the type of network.} +\item{writeToFile}{Logical. If \code{TRUE} then the network is saved to file in current working directory (GRAPHML +format), with filename denoting the current date/time and the type of network.} -\item{removeTermsOrHashtags}{character vector. Default is none. Otherwise -this argument specifies which terms or hashtags (i.e. vertices with matching -`name`) should be removed from the bimodal network. This is useful to remove -the search term or hashtag that was used to collect the data (i.e. remove -the corresponding vertex in the graph). For example, a value of "#auspol" -means that if there is a vertex with the exact name "#auspol" then this -vertex will be removed.} +\item{removeTermsOrHashtags}{Character string. Default is none. Otherwise this argument specifies which terms or +hashtags (i.e. vertices with matching 'name') should be removed from the bimodal network. This is useful to remove +the search term or hashtag that was used to collect the data (i.e. remove the corresponding vertex in the graph). +For example, a value of "#auspol" means that if there is a vertex with the exact name "#auspol" then this vertex +will be removed.} + +\item{...}{Additional parameters to pass to the network creation method.} } \value{ An igraph graph object, with weighted and directed edges. } \description{ -Create bimodal networks from social media data +This function creates a bimodal network from social media data (i.e. from data frames of class \code{dataSource}, or +for Twitter data it is also possible to provide a *list* of data frames), with edges representing relationships +between actors of two different types (e.g. Facebook users and Facebook posts, with edges representing whether a +user has commented or 'liked' a post). } \details{ -This function creates a bimodal network from social media data (i.e. from -data frames of class \code{dataSource}, or for Twitter data it is also -possible to provide a *list* of data frames), with edges representing -relationships between actors of two different types (e.g. Facebook users and -Facebook posts, with edges representing whether a user has commented or -'liked' a post). - -This function creates a (directed and weighted) bimodal network from a data -frame of class \code{dataSource} (which are created using the `CollectData` -family of functions in the vosonSML package), or a *list* of Twitter -data frames collected using \code{CollectDataTwitter} function. +This function creates a (directed and weighted) bimodal network from a data frame of class \code{dataSource} (which +are created using the 'CollectData' family of functions in the vosonSML package), or a *list* of Twitter data +frames collected using \code{CollectDataTwitter} function. -The resulting network is an igraph graph object. This graph object is -bimodal because edges represent relationships between vertices of two -different types. For example, in a bimodal Facebook network, vertices -represent Facebook users or Facebook posts, and edges represent whether a -user has commented or 'liked' a post. Edges are directed and weighted (e.g. -if user i has commented n times on post j, then the weight of this directed -edge equals n). +The resulting network is an igraph graph object. This graph object is bimodal because edges represent relationships +between vertices of two different types. 
For example, in a bimodal Facebook network, vertices represent Facebook
+users or Facebook posts, and edges represent whether a user has commented or 'liked' a post. Edges are directed and
+weighted (e.g. if user i has commented n times on post j, then the weight of this directed edge equals n).
 }
 \note{
-Not all data sources in vosonSML can be used for creating
-bimodal networks.
-
-Currently supported data sources are:
-
-- Facebook
-- Twitter
-
-Other data sources (e.g. YouTube) will be implemented in the future.
-Additionally, the user is notified if they try to create bimodal networks
-for incompatible data sources.
-
-For Twitter data, bimodal networks can be created from multiple data frames
-(i.e. datasets collected individually using CollectDataTwitter). Simply
-create a list of the data frames that you wish to create a network from. For
-example, \code{myList <- list(myTwitterData1, myTwitterData2,
-myTwitterData3)}.
-}
-\examples{
-
-\dontrun{
-  ## This example shows how to collect Facebook page data and create a bimodal network
-
-  # Use your own values for myAppID and myAppSecret
-  myAppID <- "123456789098765"
-  myAppSecret <- "abc123abc123abc123abc123abc123ab"
-
-  # Authenticate with the Facebook API using `AuthenticateWithFacebookAPI`
-  fb_oauth <- AuthenticateWithFacebookAPI(appID=myAppID, appSecret=myAppSecret,
-  extended_permissions=FALSE, useCachedToken=TRUE)
-
-  # Run the `CollectDataFacebook` function and store the results in variable `myFacebookData`
-  myFacebookData <- CollectDataFacebook(pageName="StarWars", rangeFrom="2014-05-15",
-  rangeTo="2014-06-03",writeToFile=FALSE,verbose=TRUE)
-
-  # Create a 'bimodal' network using \\code{CreateBimodalNetwork}
-  g_bimodal_facebook <- CreateBimodalNetwork(myFacebookData)
-
-  # View descriptive information about the bimodal network
-  g_bimodal_facebook
-}
+Supported data sources: \code{facebook}, \code{twitter}.
+For Twitter data, bimodal networks can be created from multiple data frames (i.e. datasets collected individually
+using CollectDataTwitter). Simply create a list of the data frames that you wish to create a network from. For
+example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)}.
 }
 \seealso{
-See \code{CollectDataFacebook} and \code{CollectDataTwitter} to
-collect data for creating bimodal networks in vosonSML.
-}
-\author{
-Timothy Graham & Robert Ackland
-
+\code{CollectDataFacebook}, \code{CollectDataTwitter}
 }
 \keyword{SNA}
 \keyword{bimodal}
diff --git a/vosonSML/man/CreateSemanticNetwork.Rd b/vosonSML/man/CreateSemanticNetwork.Rd
index 83a1434..0908b3f 100644
--- a/vosonSML/man/CreateSemanticNetwork.Rd
+++ b/vosonSML/man/CreateSemanticNetwork.Rd
@@ -2,127 +2,73 @@
 % Please edit documentation in R/CreateSemanticNetwork.R
 \name{CreateSemanticNetwork}
 \alias{CreateSemanticNetwork}
-\title{Note: this function is DEPRECATED and will be removed in a future release.
-Please use the \code{Create} function}
+\title{Creates a semantic network from social media data (semantic relationships between concepts)}
 \usage{
 CreateSemanticNetwork(x, writeToFile, termFreq, hashtagFreq,
-  removeTermsOrHashtags, stopwordsEnglish)
+  removeTermsOrHashtags, stopwordsEnglish, ...)
 }
 \arguments{
-\item{x}{a data frame of class \code{dataSource}. For Twitter data, it is
-also possible to provide a *list* of data frames (i.e. data frames that
-inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter
-data frames are supported at this time. If a list of data frames is
-provided, then the function binds these row-wise and computes over the
-entire data set.}
+\item{x}{A data frame of class \code{dataSource}. For Twitter data, it is also possible to provide a *list* of data
+frames (i.e. data frames that inherit class \code{dataSource} and \code{twitter}). Only lists of Twitter data
+frames are supported at this time. If a list of data frames is provided, then the function binds these row-wise and
+computes over the entire data set.}

-\item{writeToFile}{logical. If \code{TRUE} then the network is saved to file
-in current working directory (GRAPHML format), with filename denoting the
-current date/time and the type of network.}
+\item{writeToFile}{Logical. If \code{TRUE} then the network is saved to file in current working directory (GRAPHML
+format), with filename denoting the current date/time and the type of network.}

-\item{termFreq}{numeric integer, specifying the percentage of most frequent
-TERMS to include. For example, a value of 20 means that the 20 percent most
-frequently occurring terms will be included in the semantic network. The
+\item{termFreq}{Numeric integer. Specifies the percentage of most frequent TERMS to include. For example, a value
+of 20 means that the 20 percent most frequently occurring terms will be included in the semantic network. The
 default value is 5, meaning the 5 percent most frequent terms are used.}

 \item{hashtagFreq}{** NOT IMPLEMENTED YET - DEFAULTS TO ALL HASHTAGS **.
-numeric integer, specifying the percentage of most frequent HASHTAGS to
-include. For example, a value of 80 means that the 80 percent most frequently
-occurring hashtags will be included in the semantic network. The default
-value is 50, meaning the 50 percent most frequent hashtags are used.}
+Numeric integer. Specifies the percentage of most frequent HASHTAGS to include. For example, a value of 80 means
+that the 80 percent most frequently occurring hashtags will be included in the semantic network. The default value
+is 50, meaning the 50 percent most frequent hashtags are used.}

-\item{removeTermsOrHashtags}{character vector. Default is none. Otherwise
-this argument specifies which terms or hashtags (i.e. vertices with matching
-`name`) should be removed from the semantic network. This is useful to
-remove the search term or hashtag that was used to collect the data (i.e.
-remove the corresponding vertex in the graph). For example, a value of
-"#auspol" means that if there is a vertex with the name "#auspol" then this
-vertex will be removed.}
+\item{removeTermsOrHashtags}{Character string vector. Default is none. Otherwise this argument specifies which terms
+or hashtags (i.e. vertices with matching 'name') should be removed from the semantic network. This is useful to
+remove the search term or hashtag that was used to collect the data (i.e. remove the corresponding vertex in the
+graph). For example, a value of "#auspol" means that if there is a vertex with the name "#auspol" then this vertex
+will be removed.}

-\item{stopwordsEnglish}{logical. If \code{TRUE} then English stopwords are
-removed from the tweets (e.g. words such as 'the' or 'and'). Using
-\code{FALSE} may be helpful non-English data sets. The default is
-\code{TRUE} (i.e. stopwords will be removed).}
+\item{stopwordsEnglish}{Logical. If \code{TRUE} then English stopwords are removed from the tweets (e.g. words such
+as 'the' or 'and'). Using \code{FALSE} may be helpful for non-English data sets. The default is \code{TRUE} (i.e.
+stopwords will be removed).} + +\item{...}{Additional parameters to pass to the network creation method.} } \value{ An igraph graph object, with weighted edges. } \description{ -Create semantic networks from social media data (semantic relationships -between concepts) +This function creates a semantic network from social media data (i.e. from data frames of class \code{dataSource}, +or for Twitter data it is also possible to provide a list of data frames). In such semantic networks, concepts are +words/terms extracted from the text corpus of social media data (e.g. tweets on Twitter). } \details{ -This function creates a semantic network from social media data (i.e. from -data frames of class \code{dataSource}, or for Twitter data it is also -possible to provide a list of data frames). In such semantic networks, -concepts are words/terms extracted from the text corpus of social media data -(e.g. tweets on Twitter). - -This function creates a weighted network from a data frame of class -\code{dataSource} (which are created using the `CollectData` family of -functions in the vosonSML package), or a list of Twitter data frames -collected using \code{CollectDataTwitter} function. - -The resulting semantic network is an igraph graph object. This graph object -is semantic because vertices represent unique concepts (in this case unique -terms/words extracted from a social media text corpus), and edges represent -the co-occurrence of terms for all observations in the data set. For -example, for a Twitter semantic network, vertices represent either hashtags -(e.g. "#auspol") or single terms ("politics"). If there are 1500 tweets in -the data set (i.e. 1500 observations), and the term "#auspol" and the term -"politics" appear together in every tweet, then this will be represented by -an edge with weight equal to 1500. +This function creates a weighted network from a data frame of class \code{dataSource} (which are created using the +'CollectData' family of functions in the vosonSML package), or a list of Twitter data frames collected using +\code{CollectDataTwitter} function. + +The resulting semantic network is an igraph graph object. This graph object is semantic because vertices represent +unique concepts (in this case unique terms/words extracted from a social media text corpus), and edges represent +the co-occurrence of terms for all observations in the data set. For example, for a Twitter semantic network, +vertices represent either hashtags (e.g. "#auspol") or single terms ("politics"). If there are 1500 tweets in the +data set (i.e. 1500 observations), and the term "#auspol" and the term "politics" appear together in every tweet, +then this will be represented by an edge with weight equal to 1500. } \note{ -Not all data sources in vosonSML can be used for creating -semantic networks. - -Currently supported data sources are: - -- Twitter - -Other data sources (e.g. YouTube and Facebook) will be implemented in the -future. Additionally, the user is notified if they try to create semantic -networks for incompatible data sources. - -For Twitter data, semantic networks can be created from multiple data frames -(i.e. datasets collected individually using CollectDataTwitter). Simply -create a list of the data frames that you wish to create a network from. For -example, \code{myList <- list(myTwitterData1, myTwitterData2, -myTwitterData3)}. 
-} -\examples{ - -\dontrun{ - ## This example shows how to collect Twitter data and create a semantic network - - # Firstly specify your API credentials - my_api_key <- "1234567890qwerty" - my_api_secret <- "1234567890qwerty" - my_access_token <- "1234567890qwerty" - my_access_token_secret <- "1234567890qwerty" - - # Authenticate with the Twitter API using \\code{AuthenticateWithTwitterAPI} - AuthenticateWithTwitterAPI(api_key=my_api_key, api_secret=my_api_secret, - access_token=my_access_token, access_token_secret=my_access_token_secret) - - # Collect tweets data using \\code{myTwitterData} - myTwitterData <- CollectDataTwitter(searchTerm="#auspol", - numTweets=200,writeToFile=FALSE,verbose=FALSE) - - # Create a 'semantic' network using \\code{CreateSemanticNetwork} - g_semantic_twitter <- CreateSemanticNetwork(myTwitterData,writeToFile=FALSE, - termFreq=20,hashtagFreq=80) +Currently supported data sources: +\itemize{ + \item \code{twitter} } +For Twitter data, semantic networks can be created from multiple data frames (i.e. datasets collected individually +using CollectDataTwitter). Simply create a list of the data frames that you wish to create a network from. For +example, \code{myList <- list(myTwitterData1, myTwitterData2, myTwitterData3)}. } \seealso{ -See \code{CollectDataTwitter} to collect data for creating semantic -networks in vosonSML. -} -\author{ -Timothy Graham & Robert Ackland - +\code{CollectDataTwitter} } \keyword{SNA} \keyword{igraph} diff --git a/vosonSML/man/GetYoutubeVideoIDs.Rd b/vosonSML/man/GetYoutubeVideoIDs.Rd index df14bd6..5184b4c 100644 --- a/vosonSML/man/GetYoutubeVideoIDs.Rd +++ b/vosonSML/man/GetYoutubeVideoIDs.Rd @@ -2,64 +2,55 @@ % Please edit documentation in R/GetYoutubeVideoIDs.R \name{GetYoutubeVideoIDs} \alias{GetYoutubeVideoIDs} -\title{Extract/scrape the IDs from a set of YouTube video URLs} +\title{Extract the IDs from a set of YouTube video URLs} \usage{ GetYoutubeVideoIDs(file) } \arguments{ -\item{file}{The connection to read from. This can be a local file, or a http -or ftp connection. It can also be a character string with the file name or -URI. The file must be plain text format with the URL of each YouTube video -specified on a new line (separated by character return). For example, the -first line might contain https://www.youtube.com/watch?v=73I5dRucCds, and -the second line might contain https://www.youtube.com/watch?v=6S9r_YbqHy8.} +\item{file}{The connection to read from. This can be a local file, or a http or ftp connection. It can also be a +character string with the file name or URI. The file must be plain text format with the URL of each YouTube video +specified on a new line (separated by character return). For example, the first line might contain +https://www.youtube.com/watch?v=73I5dRucCds, and the second line might contain +https://www.youtube.com/watch?v=6S9r_YbqHy8.} } \value{ -a character vector representing a set of YouTube video IDs, each -with number of characters equal to 11 (e.g. "73I5dRucCds"). +a character vector representing a set of YouTube video IDs, each with number of characters equal to 11 +(e.g. "73I5dRucCds"). } \description{ -This function reads a list of YouTube video URLs from a text file and -converts them to a vector object. For example, -"https://www.youtube.com/watch?v=73I5dRucCds" has the ID "73I5dRucCds". 
This
-function can be used to create an object for the argument \code{videoIDs} in
-the function \code{CollectDataYoutube}, that is, by extracting the IDs for a
-set of YouTube videos and compiling them into a vector, ready for collecting
-data with \code{CollectDataYoutube}.
+This function reads a list of YouTube video URLs from a text file and converts them to a vector object. For example,
+"https://www.youtube.com/watch?v=73I5dRucCds" has the ID "73I5dRucCds". This function can be used to create an
+object for the argument \code{videoIDs} in the function \code{CollectDataYoutube}, that is, by extracting the IDs
+for a set of YouTube videos and compiling them into a vector, ready for collecting data with
+\code{CollectDataYoutube}.
 }
 \note{
-This function is useful for lots of videos. However, many videos may
-take a *long* time to collect data from. In such cases it is recommended to
-use the \code{verbose=TRUE} argument for the function
-\code{CollectDataYoutube}, in order to keep track of progress during
-computation.
+This function is useful when working with many videos. However, many videos may take a *long* time to collect
+data from. In such cases it is recommended to use the \code{verbose = TRUE} argument for the function
+\code{CollectDataYoutube}, in order to keep track of progress during computation.
 }
 \examples{
-
 \dontrun{
-  ## This example shows how to use `GetYoutubeVideoIDs` to extract video IDs from YouTube
-  ## video URLs, and then collect data using the function `CollectDataYoutube`
+# this example shows how to use 'GetYoutubeVideoIDs' to extract video IDs from YouTube video
+# URLs, and then collect data using the function 'CollectDataYoutube'

-  # Use your own Google Developer API Key here:
-  myApiKey <- "1234567890"
+# set your Google Developer API key
+myYtApiKey <- "xxxxxxxxxx"

-  # Authenticate with the Google API
-  apiKeyYoutube <- AuthenticateWithYoutubeAPI(apiKeyYoutube=myApiKey)
+# authenticate with the Google API
+apiKeyYoutube <- AuthenticateWithYoutubeAPI(apiKeyYoutube = myYtApiKey)

-  # Use the function `GetYoutubeVideoIDs` to automatically generate vector of IDs from
-  # a plain text file of video URLs
-  videoIDs <- GetYoutubeVideoIDs(file="youtube_to_scrape.txt")
+# use the function 'GetYoutubeVideoIDs' to automatically generate vector of IDs from a plain
+# text file of video URLs
+videoIDs <- GetYoutubeVideoIDs(file = "youtube_urls_to_scrape.txt")

-  # Collect the data using function `CollectDataYoutube`
-  myYoutubeData <- CollectDataYoutube(videoIDs,apiKeyYoutube,writeToFile=FALSE)
+# collect the data using function 'CollectDataYoutube'
+myYoutubeData <- CollectDataYoutube(videoIDs, apiKeyYoutube, writeToFile = FALSE)
 }
+
 }
 \seealso{
-Use \code{CollectDataYoutube} for collecting YouTube comments data.
-}
-\author{
-Timothy Graham & Robert Ackland
-
+\code{CollectDataYoutube}
 }
 \keyword{scraping}
 \keyword{vosonSML}
diff --git a/vosonSML/man/GraphUserInfoTwitter.Rd b/vosonSML/man/GraphUserInfoTwitter.Rd
new file mode 100644
index 0000000..c0a0ad7
--- /dev/null
+++ b/vosonSML/man/GraphUserInfoTwitter.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/GraphUserInfoTwitter.R
+\name{GraphUserInfoTwitter}
+\alias{GraphUserInfoTwitter}
+\title{Create twitter network graph with user information attributes}
+\usage{
+GraphUserInfoTwitter(df_collect, df_relations, df_users,
+  lookup_missing_users = TRUE, twitter_token = NULL,
+  writeToFile = FALSE)
+}
+\arguments{
+\item{df_collect}{A data frame containing the collected tweet data from \code{Collect}.}
+
+\item{df_relations}{A data frame containing the network relations data from \code{Create}.}
+
+\item{df_users}{A data frame containing the network users data from \code{Create}.}
+
+\item{lookup_missing_users}{Logical. Request user information for any users missing from \code{df_collect}. Default
+is \code{TRUE}.}
+
+\item{twitter_token}{A twitter authentication token from \code{Authenticate}.}
+
+\item{writeToFile}{Logical. If \code{TRUE} a data frame of user information and the resulting network graph will
+be saved to file. Default is \code{FALSE}.}
+}
+\value{
+A list containing a data frame with user information and an igraph object of the twitter network with
+user node attributes.
+}
+\description{
+Creates a network from the relations and users data frames generated by \code{Create}. The network is supplemented
+with additional downloaded user information applied as node attributes.
+}
+\note{
+Only supports the twitter actor network at this time. Bimodal network support will require the filtering
+of twitter user ids from nodes of other types.
+}
diff --git a/vosonSML/man/PopulateUserInfo.Rd b/vosonSML/man/PopulateUserInfo.Rd
deleted file mode 100644
index 9635e76..0000000
--- a/vosonSML/man/PopulateUserInfo.Rd
+++ /dev/null
@@ -1,37 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/PopulateUserInfo.R
-\name{PopulateUserInfo}
-\alias{PopulateUserInfo}
-\title{Populate Twitter networks with user information}
-\usage{
-PopulateUserInfo(networkObject)
-}
-\arguments{
-\item{networkObject}{an igraph graph object created with \code{\link{Create}}}
-}
-\value{
-An igraph graph object
-}
-\description{
-This function is used to 'populate' Twitter networks (generated
-with the \code{\link{Create}} function) with information about
-the users in the network. This involves calls to the Twitter API
-to collect this information, which is then applied to the network
-as vertex attributes.
-} -\examples{ - -\dontrun{ -require(magrittr) -## Get Twitter user information and apply to network -myTwitterNetwork_userInfo <- PopulateUserInfo(myTwitterNetwork) - -} -} -\seealso{ -\code{\link{Collect}}, \code{\link{Create}} -} -\author{ -Timothy Graham & Robert Ackland - -} diff --git a/vosonSML/man/SaveCredential.Rd b/vosonSML/man/SaveCredential.Rd index 1dbfb61..beaa8a1 100644 --- a/vosonSML/man/SaveCredential.Rd +++ b/vosonSML/man/SaveCredential.Rd @@ -1,47 +1,45 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Authenticate.R +% Please edit documentation in R/SaveCredential.R \name{SaveCredential} \alias{SaveCredential} \alias{LoadCredential} \title{Save and load credential information} \usage{ -SaveCredential(credential, filename = "credential.RDS") +SaveCredential(credential, filename) -LoadCredential(filename = "credential.RDS") +LoadCredential(filename) } \arguments{ -\item{credential}{\code{credential} object} +\item{credential}{A \code{credential} object.} -\item{filename}{character, filename to be saved to or restored from} +\item{filename}{Character string. Filename to be saved to or restored from. Default value is \code{credential.RDS}.} } \value{ -\code{credential} object +A \code{credential} object. } \description{ -Functions to save and load credential information. Currently, credential -information will be stored as a RDS file. \code{SaveCredential} will return -the input \code{credential}, useful for working as a filter between the +Functions to save and load credential information. Currently, credential information will be stored as a RDS file. +\code{SaveCredential} will return the input \code{credential}, useful for working as a filter between \code{Authenticate} and \code{Collect}. } -\note{ -\code{credential} created from \code{Authenticate} with socialmedia = -'twitter' will not be saved by SaveCredential -} \examples{ - \dontrun{ require(magrittr) -myAppID <- "123456789098765" -myAppSecret <- "abc123abc123abc123abc123abc123ab" -myUsernames <- c("senjohnmccain","obama") - -Authenticate("instagram", -appID = myAppId, -appSecret = myAppSecret) \%>\% SaveCredential("instagramCred.RDS") \%>\% Collect(ego = TRUE, -username = myUsernames) \%>\% Create - -## Load the previously saved credential information -LoadCredential("instagramCred.RDS") \%>\% Collect(tag="obama", -distance=5000, n=100) \%>\% Create("bimodal") + +## save credential example + +myIgAppID <- "xxxxxxxxxxx" +myIgAppSecret <- "xxxxxxxxxxxxxxxxxxxxxx" +listIgUsernames <- c("senjohnmccain", "obama") + +Authenticate("instagram", appID = myIgAppID, appSecret = myIgAppSecret) \%>\% + SaveCredential("instagramCred.RDS") \%>\% + Collect(ego = TRUE, username = listIgUsernames) \%>\% Create() + +## load previously saved credential example + +LoadCredential("instagramCred.RDS") \%>\% + Collect(tag = "obama", distance = 5000, n = 100) \%>\% Create("bimodal") } + } diff --git a/vosonSML/man/importData.Rd b/vosonSML/man/importData.Rd index 4207def..a21d62a 100644 --- a/vosonSML/man/importData.Rd +++ b/vosonSML/man/importData.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/importData.R -\name{importData} -\alias{importData} +% Please edit documentation in R/ImportData.R +\name{ImportData} +\alias{ImportData} \title{Import vosonSML data previously saved to disk using the `Collect()` function.} \usage{ -importData(file, dataSource) +ImportData(file, dataSource) } \arguments{ \item{file}{character, specifying the file path to the data 
to be imported} @@ -38,7 +38,7 @@ myFacebookData <- Authenticate("Facebook", appID = appID, appSecret = appSecret) rangeTo="2015-03-02", writeToFile=TRUE) # Import the data (that was saved to disk in the previous step) -myStarWarsData <- importData("2015-03-01_to_2015-03-02_StarWars_FacebookData.csv","facebook") +myStarWarsData <- ImportData("2015-03-01_to_2015-03-02_StarWars_FacebookData.csv","facebook") # Create a network using the imported dataframe object myNetwork <- myStarWarsData \%>\% Create("Bimodal") diff --git a/vosonSML/man/vosonSML-package.Rd b/vosonSML/man/vosonSML-package.Rd index 2cd30f6..250003e 100644 --- a/vosonSML/man/vosonSML-package.Rd +++ b/vosonSML/man/vosonSML-package.Rd @@ -6,22 +6,22 @@ \alias{vosonSML} \title{Collection and network analysis of social media data} \description{ -The goal of the vosonSML package is to provide a suite of easy-to-use tools for collecting data from social media -sources (Instagram, Facebook, Twitter, Youtube, and Reddit) and generating different types of networks suited to -Social Network Analysis (SNA) and text analytics. It offers tools to create unimodal, multimodal, semantic, and -dynamic networks. It draws on excellent packages such as \pkg{twitteR}, \pkg{instaR}, \pkg{Rfacebook}, -\pkg{RedditExtractoR} and \pkg{igraph} in order to provide an integrated 'work flow' for collecting different types -of social media data and creating different types of networks out of these data. Creating networks from social media +The goal of the vosonSML package is to provide a suite of easy-to-use tools for collecting data from social media +sources (Instagram, Facebook, Twitter, Youtube, and Reddit) and generating different types of networks suited to +Social Network Analysis (SNA) and text analytics. It offers tools to create unimodal, multimodal, semantic, and +dynamic networks. It draws on excellent packages such as \pkg{rtweet}, \pkg{instaR}, \pkg{Rfacebook}, +\pkg{RedditExtractoR} and \pkg{igraph} in order to provide an integrated 'work flow' for collecting different types +of social media data and creating different types of networks out of these data. Creating networks from social media data is often non-trivial and time consuming. This package simplifies such tasks so users can focus on analysis. } \details{ -vosonSML uses a straightforward S3 class system. Data collected with this package produces \code{data.table} objects -(extension of class \code{data.frame}), which are assigned the class \code{dataSource}. Additionally, -\code{dataSource} objects are assigned a class identifying the source of data, e.g. \code{facebook} or \code{youtube} -. In this way, \code{dataSource} objects are fast, easy to work with, and can be used as input to easily construct -different types of networks. For example, the function \code{\link{Collect}} can be used to collect Twitter data, -which is then 'piped' to the \code{\link{Create}} function, resulting in a network (an igraph object) that is ready -for analysis. +vosonSML uses a straightforward S3 class system. Data collected with this package produces \code{data.table} objects +(extension of class \code{data.frame}), which are assigned the class \code{dataSource}. Additionally, +\code{dataSource} objects are assigned a class identifying the source of data, e.g. \code{facebook} or +\code{youtube}. In this way, \code{dataSource} objects are fast, easy to work with, and can be used as input to +easily construct different types of networks. 
For example, the function \code{Collect} can be used to collect +Twitter data, which is then 'piped' to the \code{Create} function, resulting in a network (an igraph object) +that is ready for analysis. } \author{ Created by Timothy Graham and Robert Ackland, with major contributions by Chung-hong Chan and Bryan Gertzel. diff --git a/vosonSML/tests/testthat.R b/vosonSML/tests/testthat.R deleted file mode 100644 index d752e69..0000000 --- a/vosonSML/tests/testthat.R +++ /dev/null @@ -1,4 +0,0 @@ -# library(testthat) -# library(vosonSML) - -# test_check("vosonSML") diff --git a/vosonSML/tests/testthat/cred_empty.R b/vosonSML/tests/testthat/cred_empty.R deleted file mode 100644 index e541683..0000000 --- a/vosonSML/tests/testthat/cred_empty.R +++ /dev/null @@ -1,5 +0,0 @@ -### please modify me and rename me to cred.R -### WARNING: don't add cred.R to github repo -### cred.R already in .gitignore - -yt <- "" diff --git a/vosonSML/tests/testthat/test_youtube.R b/vosonSML/tests/testthat/test_youtube.R deleted file mode 100644 index 43d817b..0000000 --- a/vosonSML/tests/testthat/test_youtube.R +++ /dev/null @@ -1,11 +0,0 @@ -source("cred.R") -require(magrittr) - -## "4_hHKlEZ9Go" is a closed comment video - -test_that("Youtube Empty Comment Error",{ - expect_error(Authenticate("youtube", yt) %>% Collect(videoIDs = c("4_hHKlEZ9Go")), "No comment can be collected from the given videoIDs.") -### however, multiple videoIDs with only one with empty comment should not throw an error. - borat <- Authenticate("youtube", yt) %>% Collect(videoIDs = c("4_hHKlEZ9Go", "YzdYF0r3gB4")) - expect_true("dataSource" %in% class(borat)) -})
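The man pages above describe the reworked \code{rtweet}-based workflow but no longer carry a worked Twitter example,
so a minimal sketch is included below. Only the \code{GraphUserInfoTwitter()} signature is taken from the new man
page; the argument names passed through \code{Authenticate()} and \code{Collect()}, the \code{"actor"} network type
string, and the \code{$relations}/\code{$users} elements of the \code{Create()} output are illustrative assumptions
rather than confirmed API.

library(magrittr)
library(vosonSML)

# create a twitter credential via rtweet (argument names assumed for illustration)
myTwitterAuth <- Authenticate("twitter",
                              appName = "vosonSML",
                              apiKey = "xxxxxxxxxx", apiSecret = "xxxxxxxxxx",
                              accessToken = "xxxxxxxxxx", accessTokenSecret = "xxxxxxxxxx")

# collect tweets and build an actor network (search arguments assumed)
myTwitterData <- myTwitterAuth %>% Collect(searchTerm = "#auspol", numTweets = 200)
myActorNetwork <- myTwitterData %>% Create("actor")

# supplement the actor network with downloaded user information;
# assumes the Create() output exposes the relations and users data frames used below
myResult <- GraphUserInfoTwitter(df_collect = myTwitterData,
                                 df_relations = myActorNetwork$relations,
                                 df_users = myActorNetwork$users,
                                 lookup_missing_users = FALSE,
                                 writeToFile = FALSE)

# per the man page, myResult is a list holding a data frame of user information
# and an igraph graph of the twitter network carrying user node attributes

If \code{lookup_missing_users = TRUE} were used instead, the credential obtained from \code{Authenticate()} would
also be supplied via the \code{twitter_token} argument.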