Data Smart, Ch3, Classifying Tweets using Naive Bayes

 combined

 

Executive Summary

In chapter 3 of the book Data Smart, author John Foreman, chief data scientist at Mailchimp, develops a Naive Bayes classifier in Excel to determine whether or not tweets containing the word ‘mandrill’ are related to Mailchimp’s Mandrill email-transaction app.

Whereas the author used Excel, we choose to use R’s text mining package, tm, in developing a solution, in order to take advantage of tm’s built-in automated text processing tools.

The book Machine Learning for Hackers, by Drew Conway and John Myles White, is also a useful resource. We borrow elements of that book’s approach to email spam classification here, since its solution also uses the tm package.

# load necessary packages
library(tm) # an R text mining package
library(dplyr) # used for easy manipulation of data frames

# Build a Term Document Matrix from raw text.
#
# Args:
#   doc: character vector; each element is wrapped as one document of the corpus.
# Returns:
#   The TermDocumentMatrix produced by tm for that corpus.
#
# Text preparation requested here: stopword removal, punctuation removal and
# number removal. In addition, tm lower-cases the terms and tokenizes the text
# into words by default. Type 'stopwords()' in the R console to view the list
# of stopwords.
createTDM <- function(doc){
  # NOTE: keep the options in this order; tm may apply them in list order
  prepSteps <- list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE)
  docCorpus <- Corpus(VectorSource(doc))
  TermDocumentMatrix(docCorpus, control = prepSteps)
}

# Calculate the count and proportional frequency of each word in a
# term-document matrix and arrange the information in a data frame.
#
# Args:
#   record: a term-document matrix in triplet form, i.e. an object exposing
#           record$dimnames$Terms (the words), record$i (the term row index of
#           each stored entry) and record$v (the stored counts).
# Returns:
#   A data frame with one row per term and the columns
#     word  - the term
#     count - total occurrences of the term (summed over all documents)
#     prob  - count divided by the sum of all counts
genProb <- function(record){
  terms <- record$dimnames$Terms

  # The stored values are not guaranteed to be in term-row order, and a term
  # has one entry per document it occurs in. Group the values by their row
  # index so that every count lands on the right word.
  byTerm <- split(as.numeric(record$v), factor(record$i, levels = seq_along(terms)))
  totals <- unname(vapply(byTerm, sum, numeric(1)))

  counts <- data.frame(word = terms, count = totals, stringsAsFactors = FALSE)
  counts$prob <- counts$count / sum(counts$count)
  counts
}

# Score each test tweet against the word frequencies of one training corpus.
#
# A tweet's score is the sum of the log proportional frequencies of its
# distinct terms. Terms found in the training corpus contribute log(prob);
# every term not found contributes log(unseenProb).
#
# Args:
#   trainingCounts: data frame with the columns 'word' and 'prob', as returned
#                   by genProb.
#   tweets:         character vector of tweets to score, each treated as its
#                   own document. Defaults to the global 'testTweets'.
#   unseenProb:     proportional frequency assigned to terms that are absent
#                   from the training corpus. Defaults to the global
#                   'notInValue'.
# Returns:
#   A numeric vector with one score per tweet (numeric(0) if there are none).
genTweetScores <- function(trainingCounts, tweets = testTweets, unseenProb = notInValue){
  scoreTweet <- function(tweet){
    tweetTerms <- createTDM(tweet)$dimnames$Terms
    wordsIn <- intersect(tweetTerms, trainingCounts$word)
    wordsNotIn <- setdiff(tweetTerms, wordsIn)
    knownProbs <- trainingCounts$prob[match(wordsIn, trainingCounts$word)]
    log(unseenProb) * length(wordsNotIn) + sum(log(knownProbs))
  }
  # vapply visits nothing when 'tweets' is empty, unlike 1:length(tweets),
  # and fills a result vector of the right size in one go
  vapply(tweets, scoreTweet, numeric(1), USE.NAMES = FALSE)
}

# Proportional frequency assigned to words in test tweets which were not in the
# training corpus, as per the approach in ML for Hackers. (Data Smart used an
# additive smoothing approach instead.)
notInValue <- 0.00005

# The test tweets are kept as separate lines: each tweet is analyzed as an
# individual document
testTweets <- readLines("testTweets.csv")

# Each set of training tweets is read in and joined into a single
# newline-separated character value
mApp <- paste(readLines("MandrillApp.csv"), collapse = "\n")
other <- paste(readLines("Other.csv"), collapse = "\n")

# Inspect tm's stopword list, i.e. the words that createTDM asks tm to drop
# through 'stopwords = TRUE' in its control list
stopwords()
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"
# Mandrill-app training corpus: build the term-document matrix with createTDM,
# then derive the word counts and proportional frequencies with genProb
mTDM <- createTDM(mApp)
mCounts <- genProb(mTDM)

# Same two steps for the corpus of unrelated tweets
oTDM <- createTDM(other)
oCounts <- genProb(oTDM)

# Sort the Mandrill-app words from most to least frequent and inspect the first 20 rows
head(arrange(mCounts, desc(prob)), n = 20)
##                   word count        prob
## 1             mandrill   100 0.065703022
## 2                email    28 0.018396846
## 3  httphelpmandrillcom    22 0.014454665
## 4                  can    20 0.013140604
## 5            mailchimp    20 0.013140604
## 6             sendgrid    18 0.011826544
## 7              request    16 0.010512484
## 8          mandrillapp    14 0.009198423
## 9              details    13 0.008541393
## 10              emails    13 0.008541393
## 11                send    13 0.008541393
## 12       transactional    12 0.007884363
## 13                just    11 0.007227332
## 14                mind    11 0.007227332
## 15         newsletters    11 0.007227332
## 16             service    11 0.007227332
## 17                 use    11 0.007227332
## 18               using    11 0.007227332
## 19                 via    11 0.007227332
## 20                 api    10 0.006570302
# Same inspection for the words of the unrelated tweets
head(arrange(oCounts, desc(prob)), n = 20)
##                     word count        prob
## 1               mandrill   138 0.112837285
## 2                  spark    25 0.020441537
## 3                youtube    17 0.013900245
## 4                megaman    15 0.012264922
## 5               acapella    14 0.011447261
## 6                    get    12 0.009811938
## 7  httpyoutubehyxkwyjdia     9 0.007358953
## 8                    man     9 0.007358953
## 9         smoothmcgroove     9 0.007358953
## 10                 vídeo     9 0.007358953
## 11                gostei     7 0.005723630
## 12                  just     6 0.004905969
## 13                  mega     6 0.004905969
## 14                   can     5 0.004088307
## 15                  like     5 0.004088307
## 16                   new     5 0.004088307
## 17                   que     5 0.004088307
## 18                   via     5 0.004088307
## 19              ccpgames     4 0.003270646
## 20            freebooted     4 0.003270646
# Classify the tweets: score every test tweet against each training corpus and
# label it with the corpus that gives the higher score ("Other" on a tie)
classResults <- data.frame(
  mScores = genTweetScores(mCounts),
  oScores = genTweetScores(oCounts)
)
classResults$Classification <- with(classResults, ifelse(mScores > oScores, "App", "Other"))
classResults
##       mScores    oScores Classification
## 1   -60.77438  -95.27575            App
## 2   -60.71495  -97.76066            App
## 3   -37.97522  -48.90482            App
## 4  -114.52122 -167.74667            App
## 5   -83.35133 -118.75262            App
## 6   -48.28697  -71.50622            App
## 7   -29.46878  -39.00133            App
## 8   -35.94576  -39.34240            App
## 9   -95.49885 -123.72886            App
## 10  -54.37375  -79.22790            App
## 11  -32.43307  -24.51166          Other
## 12  -62.14354  -47.63061          Other
## 13  -49.66434  -51.69925            App
## 14  -36.76512  -37.39190            App
## 15  -32.43307  -29.09785          Other
## 16  -52.24005  -24.94163          Other
## 17  -88.17968  -80.89603          Other
## 18  -86.55568  -60.70636          Other
## 19  -69.47132  -71.50622            App
## 20  -91.85400  -85.03120          Other
# The classifier classifies all Mandrill-App related tweets correctly (items 1-10 in the classResults list)
# Three of the unrelated tweets (items 11-20 in the classResults list) are misclassified as 'App', so we can see that there is room for improvement.

Leave a Reply

Your email address will not be published.