Twitter Data!!!

I wanted to experiment with collecting some Twitter data over the Thanksgiving break for one of my clients that does a fair amount of retail sales. I connected to the Twitter public stream using the “streamR” package and filtered on around 30 hashtags. The idea was (well, is: Cyber Monday is only just ending) to import the data into CartoDB. I tried this with some sample data from Twitter, without filtering by hashtag, and it looked great (CartoDB seems to be a pretty great tool). This “analysis” is more of what I like to call a “bonus treat” than actionable data, but why not have some fun? And who knows, maybe some interesting trends will pop out of the data. The final batch of data is just finishing up (I collected 8 .json files to keep the file sizes manageable, since this is an experiment).
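
For reference, the collection step looks roughly like this. This is a sketch, not my exact script: the hashtags below are placeholders for the ~30 I actually tracked, and my_oauth stands in for an OAuth credential object set up beforehand (e.g., with the ROAuth package).

library(streamR)
# Stream the public feed into one .json batch file, filtering on hashtags.
# "retailTags" and the timeout are placeholders, not the actual values used.
retailTags = c("#BlackFriday", "#CyberMonday", "#shopping")
filterStream(file.name = "tweets_batch1.json",
             track = retailTags,
             timeout = 3600,   # collect for one hour, then close the file
             oauth = my_oauth) # OAuth credential created ahead of time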

I wrote a quick little function that reads in all of the .json files, parses them, and does an initial reduction to the main columns I’m interested in (the original .json object creates an R dataframe with 43 columns). For this function I played around with the “foreach” package, which I really like; it also lets you run functions across multiple processors. The documentation says it may not work correctly for connections (and opening a file is a connection), but I timed the two methods in R, one with a single processor and one using two, and the loading function was about 30% faster using two.

library(foreach)
library(doParallel)
source(file = "TwitterAnalysisFunctions.R")
 
# Set up a two-worker cluster for the parallel read
cl = makeCluster(2)
registerDoParallel(cl)
 
# Find the .json batch files in the working directory
fileList = list.files(getwd())
TwitterFiles = grep(pattern = "\\.json$", x = fileList)
 
# Parse each file and row-bind the reduced dataframes together
TweetDF = foreach(file = seq_along(TwitterFiles), .combine = rbind, .packages = 'streamR') %dopar% { 
                      loadTweets(parseTweets(fileList[TwitterFiles[file]], verbose = FALSE)) 
}
stopCluster(cl)
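
For the curious, here is roughly how the two approaches can be compared (a sketch, not my exact benchmarking code; it should run before stopCluster(cl), while the workers are still registered).

# Timing sketch: sequential (%do%) vs. parallel (%dopar%) loading
seqTime = system.time(
  foreach(file = seq_along(TwitterFiles), .combine = rbind, .packages = 'streamR') %do% {
    loadTweets(parseTweets(fileList[TwitterFiles[file]], verbose = FALSE))
  }
)["elapsed"]
parTime = system.time(
  foreach(file = seq_along(TwitterFiles), .combine = rbind, .packages = 'streamR') %dopar% {
    loadTweets(parseTweets(fileList[TwitterFiles[file]], verbose = FALSE))
  }
)["elapsed"]
seqTime / parTime  # consistent with the ~30% speedup mentioned above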


Here are the helper functions I wrote to perform various common tasks. I used the “dplyr” package as much as possible because of its speed and adeptness at handling large dataframes.

#################################################################################################
# A function to load Tweets
#################################################################################################
loadTweets = function(df) {
  require(dplyr)
  Tweets = 
    df %>% 
    # Select needed columns
    select(Tweet = text,
           Retweets = retweet_count,
           Time = created_at,
           Language = lang,
           Location = location,
           Time.Zone = time_zone,
           Lat = lat,
           Long = lon) %>%
    # Flag rows that already carry coordinates; otherwise fall back to the profile location
    mutate(Implied.Location = ifelse(!is.na(Lat), "Lat/Long Available", Location)) 
  return(Tweets)
}
 
 
#################################################################################################
# A function to remove spurious characters from a string
#################################################################################################
cleanText = function(df, colName) {  
  # Create list of strings
  stringsList = strsplit(as.vector(df[[colName]], mode = "character"), split = " ")
  # Clean every entry: iconv substitutes the sentinel "s2Rz" for any
  # non-ASCII character, so tokens containing the sentinel can be dropped wholesale
  cleanTextList = sapply(stringsList, FUN = function(text) {
    nonLatinStrings = grep("s2Rz", iconv(x = text, from = "latin1", to = "ASCII", sub = "s2Rz"))
    if(length(nonLatinStrings) > 0) strings2keep = text[-nonLatinStrings]
    else strings2keep = text
    cleanText = paste(strings2keep, collapse = " ")
    return(cleanText)
  })
  df[[colName]] = cleanTextList
  return(df)
}
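 
As a quick illustration, on a made-up one-row dataframe (not real data):

toyDF = data.frame(Tweet = "Great deals today \u00e9\u00e9\u00e9 wow", stringsAsFactors = FALSE)
cleanText(toyDF, "Tweet")$Tweet
# should return "Great deals today wow" - the non-Latin token is dropped entirely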
 
 
#################################################################################################
# A function to extract Lat/Longs
#################################################################################################
extractLatLong = function(df, searchCol, latCol, longCol) {
  require(stringr)
  # Match "lat,long" pairs like "40.71,-74.00" (optional sign and decimals)
  latlongFinder = "(\\-?[0-9]{1,3}(\\.[0-9]+)?),\\s*(\\-?[0-9]{1,3}(\\.[0-9]+)?)"
  latlongs = strsplit(x = str_extract(df[[searchCol]], pattern = latlongFinder), split = ",")
  # Rows where a pair was found; each such entry splits into exactly two pieces,
  # so odd positions of the unlisted vector are latitudes and even ones longitudes
  rowIndex = which(!is.na(latlongs))
  df[rowIndex , latCol] = unlist(latlongs[rowIndex])[seq(from = 1, to = length(unlist(latlongs[rowIndex])), by = 2)]
  df[rowIndex , longCol] = unlist(latlongs[rowIndex])[seq(from = 2, to = length(unlist(latlongs[rowIndex])), by = 2)]
  df[rowIndex, searchCol] = "Lat/Long Available"
  return(df)
}
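 
For example, on a made-up location field:

locDF = data.frame(Location = c("UT: 40.7128,-74.0060", "somewhere"),
                   Lat = NA, Long = NA, stringsAsFactors = FALSE)
extractLatLong(locDF, "Location", "Lat", "Long")
# row 1 should come back with Lat = "40.7128", Long = "-74.0060" (as strings)
# and Location set to "Lat/Long Available"; row 2 is left untouched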
 
#################################################################################################
# A function to prepare Implied.Location for geocoding
#################################################################################################
prepareLocForGeocoding = function(df) {
  require(dplyr)
  # Filter out joke/non-geocodable locations before sending them to a geocoder
  excludePattern = paste0("\\.com|\\.net|\\.org|planet|world|xbox|earth|my |home|sofa|couch|",
                          "globe|global|here|facebook|internet|!|/|Lat/Long Available")
  cleanLocations = df %>%
    select(Implied.Location) %>% 
    filter(!grepl(x = Implied.Location, pattern = excludePattern, ignore.case = TRUE)) %>%
    do(cleanText(., "Implied.Location")) %>%
    distinct()
  return(cleanLocations)
}
 
#################################################################################################
# A function to remove blank entries
#################################################################################################
filterBlanks = function(df, colName) {
  require(dplyr)
  return(filter(df, df[[colName]] != ""))
}
 
#################################################################################################
# A function to get unique Tweets dataframe
#################################################################################################
getUniqueTweetsDF = function(df, colName) {
  require(dplyr)
  # Strip the leading retweet marker so retweets collapse onto the original text
  df[[colName]] = gsub(pattern = "^RT ", replacement = "", x = df[[colName]])
  Tweets = df %>% 
    filter(Language == "en") %>%
    distinct() 
  return(Tweets)
}
 
#################################################################################################
# A function to get unique Tweets vector
#################################################################################################
getUniqueTweetsVector = function(df, colName) {
  return(getUniqueTweetsDF(df, colName)[[colName]])
}
 
#################################################################################################
# A function to get unique entries and return a dataframe
#################################################################################################
getUniqueEntriesDF = function(df, colName) {
  # Keep the first row for each unique value of the given column
  return(df[!duplicated(df[[colName]]), , drop = FALSE])
}
 
#################################################################################################
# A function to get unique entries and return a vector
#################################################################################################
getUniqueEntriesVector = function(df, colName) {
  # Return the unique values themselves as a vector
  return(unique(df[[colName]]))
}
 
 
#################################################################################################
# A function to search for Tweets containing certain words
#################################################################################################
searchForTweets = function(df, colName, wordList) {
  require(dplyr)
  # Build a single OR'ed regex from the search words
  wordList = paste(as.vector(wordList, mode = "character"), collapse = "|")
  df = getUniqueTweetsDF(df, colName)
  return(filter(.data = df, grepl(x = df[[colName]], pattern = wordList)))
}
 
#################################################################################################
# A function to make a word cloud
#################################################################################################
makeWordCloud = function(df, colName) {
  require(tm)
  require(wordcloud)
  require(RColorBrewer)
 
  # Clean data using the tm package and the custom getUniqueTweetsVector
  textCorpus = Corpus(VectorSource(getUniqueTweetsVector(df, colName)))
  textCorpus = tm_map(textCorpus, content_transformer(tolower))
  textCorpus = tm_map(textCorpus, removePunctuation)
  textCorpus = tm_map(textCorpus, removeWords, stopwords("english"))
 
  # Make word cloud
  wordcloud(textCorpus, min.freq = 10, max.words = 300, random.order = FALSE, colors = brewer.pal(8, "Dark2"))
}
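 
To tie it together, here is roughly how I expect these helpers to chain on the loaded TweetDF. A sketch only: the search words are placeholders, and the ordering is my guess at how the pieces fit together.

# Rough end-to-end usage on the loaded TweetDF (search terms are placeholders)
TweetDF = extractLatLong(TweetDF, searchCol = "Implied.Location",
                         latCol = "Lat", longCol = "Long")
geocodeReady = prepareLocForGeocoding(TweetDF)   # distinct locations to geocode
dealTweets = searchForTweets(TweetDF, colName = "Tweet",
                             wordList = c("deal", "sale", "discount"))
makeWordCloud(dealTweets, colName = "Tweet")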

