1 Preparation
The goal of the final project is to use the 'complaint1700' and 'noncomplaint1700' files to train a robust classification model, which can then be used to test whether a new tweet is negative and help the company separate out the non-negative tweets.
To begin with, I imported the basic packages to set up the environment and saved all the necessary data in a single working directory.
1.1 Import Packages
# Basic packages
import pandas as pd
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from IPython.display import Image
# Packages for text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,LancasterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,roc_curve, auc, roc_auc_score
# Packages for modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
# Avoid warnings
import warnings
warnings.filterwarnings("ignore")
1.2 Load Data
It should be noted that the tagged tweets that I need to classify were retrieved solely from http://r.twittersensor.com.
Instead of Python, I used R for that purpose, and my tag is 'xW\v1i81&Wef'.
# Display the image file 'tag.png' at a fixed size.
Image(
    filename='tag.png',
    width=700,
    height=350,
)
After that, I listed all the files in the current working directory. What I need is 'complaint1700', 'noncomplaint1700' and 'test_data_Susie'.
# IPython shell escape (not plain Python): runs `ls` to list the files in the
# current working directory.
!ls
# Load both labelled CSV files and tag each with its binary class:
# complaints -> 0, non-complaints -> 1.
labelled_frames = []
for csv_name, label in (('complaint1700.csv', 0), ('noncomplaint1700.csv', 1)):
    with open(csv_name) as csv_handle:
        labelled_frame = pd.read_csv(csv_handle)
    labelled_frame['target'] = label
    labelled_frames.append(labelled_frame)
neg_data, pos_data = labelled_frames
# Stack the two sets into one training set and keep only the text and the label.
training_data = pd.concat(labelled_frames, ignore_index=True)[['tweet', 'target']]
This is what the training data looks like.
# Preview the first rows of the combined training set.
training_data.head()
# Load the tagged tweets that have to be classified. The file is read from the
# current working directory, like the other CSV files; the previous
# `folder+` prefix referenced a variable that is never defined.
with open('test_data_Susie.csv') as f:
    real_data = pd.read_csv(f)
$\qquad$
2 Data Pre-processing
# Stop words
stop_words = set(stopwords.words('english'))
# Some words which might indicate a certain sentiment are kept via a whitelist
# NOTE(review): normalizer() replaces every non-letter with a space before
# tokenizing, so the "n't" entry can never match a token.
whitelist = ["n't", "not", "no"]
# Stemmers applied in sequence by normalizer(); they were used there but never
# instantiated, which raised a NameError.
porter = PorterStemmer()
lancaster = LancasterStemmer()
# Lemmatize
wordnet_lemmatizer = WordNetLemmatizer()
I defined a function to automatically process the raw text data.
def normalizer(tweet):
    """
    Turn a raw tweet into a space-separated string of normalized tokens.

    Steps: replace every non-letter character with a space, tokenize,
    lowercase, drop stop words (unless whitelisted), then run each remaining
    token through the Porter stemmer, the Lancaster stemmer and the WordNet
    lemmatizer, in that order.

    @param tweet (str): raw tweet text
    @return (str): normalized tokens joined by single spaces
    """
    letters_only = re.sub("[^a-zA-Z]", " ", tweet)
    lowered = (token.lower() for token in nltk.word_tokenize(letters_only))
    # keep a word when it is whitelisted or is not a stop word
    kept_words = [word for word in lowered if word in whitelist or word not in stop_words]
    normalized_tokens = []
    for word in kept_words:
        stemmed = lancaster.stem(porter.stem(word))
        normalized_tokens.append(wordnet_lemmatizer.lemmatize(stemmed))
    return " ".join(normalized_tokens)
That's what it looks like.
# Show the full content of text cells. None disables truncation; the previous
# value -1 is deprecated and rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)
# Add the cleaned text next to the raw tweet and preview both columns.
training_data['normalized_tweet'] = training_data.tweet.apply(normalizer)
training_data[['tweet','normalized_tweet']].head()
I converted a collection of texts to a matrix of token counts, which can be regarded as 'features'.
# Learn the vocabulary from the normalized training tweets and build the
# document-term count matrix.
CV = CountVectorizer(analyzer='word')
features = CV.fit_transform(training_data['normalized_tweet'])
# Dense array copy of the count matrix, used as model input below.
features_nd = features.toarray()
$\qquad$
3 Cross Validation
3.1 Train-test Split
I used 'features_nd' as independent variable and 'training_data.target' as dependent variable; 90% of the dataset was used for training, and the other 10% for validation.
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features_nd,
    training_data.target,
    test_size=0.1,
    random_state=37,
)
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])
# Sanity check: features and labels must have the same number of rows in each split.
assert len(X_train) == len(y_train)
assert len(X_test) == len(y_test)
3.2 Select Optimal Model
I defined two new functions, one for choosing the optimal hyperparameter and the other for showing the AUC & Accuracy.
def gridSearchCV(model, params):
    """
    Run a 5-fold grid search, scored by ROC AUC, on the module-level
    X_train / y_train split.

    @param model: sklearn estimator
    @param params (dict): Dictionary of possible parameters
    @return cv_results (DataFrame): one row per parameter combination, with
        the columns 'params' and 'mean_test_score'
    """
    # GridSearchCV is not among the file's top-level imports, so it is brought
    # into scope here; without this the call below raised a NameError.
    from sklearn.model_selection import GridSearchCV

    model_cv = GridSearchCV(model, param_grid=params, scoring='roc_auc', cv=5)
    model_cv.fit(X_train, y_train)
    cv_results = pd.DataFrame(model_cv.cv_results_)[['params', 'mean_test_score']]
    return cv_results
def evaluate(model, plotROC=False):
    """
    Fit `model` on the module-level training split and report its quality on
    the module-level test split.

    1. Print the ROC AUC.
    2. Print the accuracy obtained at the ROC threshold that maximizes
       (true positive rate - false positive rate).
    3. Optionally plot the ROC curve.

    @param model: sklearn classifier exposing fit() and predict_proba()
    @param plotROC (bool): draw the ROC curve when True
    """
    model.fit(X_train, y_train)
    # Scores are the predicted probabilities in column 1 of predict_proba.
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, cutoffs = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')

    # Pick the cut-off with the largest gap between TPR and FPR.
    tpr_minus_fpr = pd.Series(tpr - fpr)
    best_cutoff = cutoffs[tpr_minus_fpr.idxmax()]

    # Accuracy on the test split at that cut-off.
    predicted_labels = np.where(scores >= best_cutoff, 1, 0)
    accuracy = accuracy_score(y_test, predicted_labels)
    print(f'Accuracy: {accuracy*100:.2f}%')

    if not plotROC:
        return

    # ROC curve with the chance diagonal as a reference line.
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
$\qquad$3.2.1 Random Forest
# Cross-validate the number of trees, then evaluate a forest on the hold-out split.
params = {'n_estimators': [50, 200, 500]}
rfc = RandomForestClassifier(random_state=1)
rfc_cv_results = gridSearchCV(rfc, params)
print(rfc_cv_results)
# NOTE(review): the evaluated model uses 1000 trees, a value that is not part
# of the grid searched above.
rfc = RandomForestClassifier(random_state=1, n_estimators=1000)
evaluate(rfc, plotROC=True)
$\qquad$3.2.2 Logistic Regression
# Cross-validate the regularization parameter C, then evaluate C=0.5 on the hold-out split.
params = {'C': [0.01, 0.1, 0.5, 1]}
logit = LogisticRegression()
logit_cv_results = gridSearchCV(logit, params)
print(logit_cv_results)
logit = LogisticRegression(C=0.5)
evaluate(logit, plotROC=True)
$\qquad$3.2.3 Support Vector Machine
# First run: cross-validate two values of C for an RBF-kernel SVM.
params = {'C': [0.1, 1]}
svc = SVC(kernel='rbf', gamma='auto', random_state=1)
svc_cv_results = gridSearchCV(svc, params)
print(svc_cv_results)
# Second run: cross-validate C=0.1 only. No kernel is passed here, so SVC's
# default kernel applies. SVC is re-imported in this cell.
from sklearn.svm import SVC
params = {'C': [0.1]}
svc = SVC(random_state=1, gamma='auto')
svc_cv_results = gridSearchCV(svc, params)
print(svc_cv_results)
$\qquad$3.2.4 MultinomialNB
# Cross-validate the smoothing parameter alpha, then evaluate alpha=1.5 on the hold-out split.
params = {'alpha': [0.1, 0.5, 1, 1.5]}
mnb = MultinomialNB()
mnb_cv_results = gridSearchCV(mnb, params)
print(mnb_cv_results)
mnb = MultinomialNB(alpha=1.5)
evaluate(mnb, plotROC=True)
Among the four models, Logistic Regression has the highest AUC (0.8517) and the highest accuracy (77.35%).
$\qquad$
4 Non-negative Tweet Detection
This is what the tagged dataset looks like.
# Preview the first five rows of the tagged dataset.
real_data.head(n=5)
I used the same method for text processing.
# Text pre-processing: normalize the tagged tweets with the same function used
# for training, then vectorize them with the already-fitted CountVectorizer.
real_data['normalized_tweet'] = real_data['tweet'].apply(normalizer)
real_features = CV.transform(real_data['normalized_tweet'])
# Dense array copy of the count matrix, used as model input below.
real_features_nd = real_features.toarray()
I used the optimal classification model (Logistic Regression, C=0.5) to predict, setting the threshold to 0.9.
# Refit logistic regression (C=0.5) on the full training matrix.
logit = LogisticRegression(C=0.5)
logit2 = logit.fit(features_nd, training_data.target)
# Column 1 of predict_proba is the probability of target == 1.
probs2 = logit2.predict_proba(real_features_nd)[:, 1]
# Label a tweet 1 only when that probability reaches 0.9.
y_pred2 = (probs2 >= 0.9).astype(int)
I extracted all the non-negative tweets into a new dataframe, and exported it as a csv file named 'chenxi_tao.csv'.
# Store the predicted label next to each tagged tweet.
real_data['target'] = y_pred2
# Rows predicted as non-negative (target == 1).
predicted_non_negative = real_data.loc[real_data['target'] == 1]
num_of_nonneg = len(predicted_non_negative)
print('There are '+str(num_of_nonneg)+' predicted non-negative tweets.')
# Write the id and text of those rows to a CSV file, without the index.
non_neg_tweet = predicted_non_negative[['id','tweet']]
non_neg_tweet.to_csv("chenxi_tao.csv", index=False)
$\qquad$
5 Self-evaluation
I manually evaluated whether each classification was right, and added the result as the 2nd column ('evaluation').
# Load the exported predictions. The code below expects an 'evaluation'
# column holding 1 for a correct prediction and 0 for a wrong one.
# NOTE(review): the export step writes only 'id' and 'tweet' to this file, so
# re-running it presumably overwrites a manually added column.
with open('chenxi_tao.csv') as f3:
    result = pd.read_csv(f3)
result.head()
correct = len(result.loc[result.evaluation == 1])
wrong = len(result.loc[result.evaluation == 0])
print('Finally, I got '+str(correct)+' correct predictions and '+str(wrong)+' wrong predictions.')
# Precision = correct predictions / all exported predictions. Guard against
# an empty file instead of dividing by zero.
precision = correct / len(result) if len(result) else float('nan')
# Report the precision computed above; the previous message referenced
# `accuracy`, which only exists inside evaluate(), and mislabelled the metric.
print(f'The precision is {precision*100:.2f}%')
$\qquad$