Sentiment Classification
Solution for submission 175607
A detailed solution for submission 175607 submitted for challenge Sentiment Classification
Starter Code for Sentiment Classification
In this baseline we will be training an sklearn model to do multi-class classification of sentiment from face embeddings.
Downloading Dataset¶
Installing puzzle datasets via aicrowd-cli
# Install the AIcrowd command-line client (shell escape, runs outside Python).
!pip install aicrowd-cli
# Make sure to re-run the code below whenever you restart the Colab notebook.
%load_ext aicrowd.magic
# Log in with your AIcrowd account. Make sure you have accepted the puzzle rules before logging in!
%aicrowd login
# Recreate an empty `data` directory (any previous contents are deleted) and
# download the sentiment-classification dataset into it.
!rm -rf data
!mkdir data
%aicrowd ds dl -c sentiment-classification -o data
Importing Libraries¶
In this baseline, we will try several sklearn classifiers (plus LightGBM and a small Keras MLP) to classify the sentiment of face embeddings. In the end, SVC is used.
import pandas as pd
import os
import numpy as np
from ast import literal_eval
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score, accuracy_score
from tensorflow.keras import models, layers
from tensorflow import keras
import tensorflow as tf
# from sklearn import svm
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score, make_scorer, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
from sklearn.utils import shuffle
from lightgbm import LGBMClassifier
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
import random
from sklearn.model_selection import train_test_split
# import seaborn as sns
# Seed Python's built-in `random` module.
# NOTE(review): NumPy and TensorFlow keep their own random state, so this call
# alone does not make the unseeded estimators / Keras model below
# reproducible — confirm whether np.random.seed / tf.random.set_seed are wanted.
random.seed(42)
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Clear any logs from previous runs
!rm -rf ./logs
# NOTE(review): `log_dir` is not read again in the visible part of this
# notebook; the TensorBoard callback defined later uses its own path.
log_dir = 'logs/test/'
Reading Dataset¶
As mentioned in the challenge readme, we have three different sets provided - train, validation and test respectively.
# Load the training split, the validation split and the submission template.
train = pd.read_csv("data/train.csv")
val = pd.read_csv("data/val.csv")
submission = pd.read_csv("data/sample_submission.csv")
train  # notebook display of the training frame

# Features: every cell of the 'embeddings' column is text that is parsed back
# into a Python object with literal_eval. Labels: the raw 'label' column.
X = list(map(literal_eval, train['embeddings'].values))
y = train['label'].values
X_val = list(map(literal_eval, val['embeddings'].values))
y_val = val['label'].values
def label_to_int(y):
    """Encode sentiment label strings as numeric class ids.

    The mapping is 'negative' -> 0, 'neutral' -> 1, 'positive' -> 2. An entry
    that matches none of the three names keeps the buffer's initial value 0.

    Args:
        y: NumPy array of labels (must expose ``.shape`` and integer indexing).

    Returns:
        Float array shaped like ``y`` holding the class ids.
    """
    # Position in this tuple is the class id.
    ordered_names = ('negative', 'neutral', 'positive')
    encoded = np.zeros(y.shape)
    for idx in range(len(y)):
        for class_id, name in enumerate(ordered_names):
            if y[idx] == name:
                encoded[idx] = class_id
                break
    return encoded
def int_to_label(y):
    """Decode numeric class ids back into sentiment label strings.

    The mapping is 0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'. For any
    other value "ERROR" is printed and the entry keeps the string form of the
    zero it was initialised with.

    Args:
        y: NumPy array of class ids (must expose ``.shape`` and indexing).

    Returns:
        Unicode string array shaped like ``y``.
    """
    # Position in this tuple is the class id.
    ordered_names = ('negative', 'neutral', 'positive')
    # Start from float zeros cast to a unicode dtype so names can be stored.
    decoded = np.zeros(y.shape).astype("U", copy=False)
    for idx in range(len(y)):
        for class_id, name in enumerate(ordered_names):
            if y[idx] == class_id:
                decoded[idx] = name
                break
        else:
            # No class id matched this entry.
            print("ERROR")
    return decoded
# Embedding lists -> float32 arrays.
X_train = np.array(X, dtype='float32')
X_val = np.array(X_val, dtype='float32')

# String labels -> uint8 class ids (see label_to_int for the mapping).
y = np.array(y)
y_train = label_to_int(y).astype(np.uint8)
y_val_num = label_to_int(y_val).astype(np.uint8)
def scale_data(X):
    """Standardize every feature column to zero mean and unit variance.

    Statistics are computed along axis 0 of ``X`` itself. A column whose
    standard deviation is exactly zero (a constant feature) is only centred,
    i.e. it becomes all zeros, instead of being divided by zero and turned
    into NaN/inf.

    Args:
        X: Array-like of samples, features along the last axis.

    Returns:
        Array of the same shape as ``X`` with standardized columns.
    """
    X = np.asarray(X)
    mu = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # Dividing by 1 leaves a constant column at its centred value (0).
    safe_sigma = np.where(sigma == 0, 1.0, sigma)
    return (X - mu) / safe_sigma
#scaler = StandardScaler() # Create feature transformer object, can accept hyperparameters like models can!
# Merge the train and validation splits into one pool, encode the labels as
# class ids and shuffle rows and labels together with a fixed seed.
X_train2 = np.concatenate([X, X_val])
y_train2 = label_to_int(np.concatenate([y, y_val]))
X_train2, y_train2 = shuffle(X_train2, y_train2, random_state=42)
# Disabled standardization experiments:
#scaler.fit(X_train2) # Fitting the transformer on the train split
#X_train2_scaled = scale_data(X_train2) # Transforming the train+val split
#X_train_scaled = scale_data(X_train) # Transforming the train split
#X_val_scaled = scale_data(X_val) # Transforming the val split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import PolynomialFeatures
print(X_train2.shape)
# Keep the 500 features ranked best by the ANOVA F-test (f_classif).
# NOTE(review): `X_kbest` is only used for the shape print below; the
# normalizer that follows is fitted on the full `X_train2`, not on `X_kbest`.
selector = SelectKBest(f_classif, k=500)
selector.fit(X_train2, y_train2)
X_kbest = selector.transform(X_train2)
print(X_kbest.shape)
# L1-normalize the combined matrix; this is what the KNN and SVC steps use.
scaler = Normalizer("l1")
X_train_scaled = scaler.fit_transform(X_train2)
Building the model¶
Here, we will build some models.
# TensorBoard logging callback (only used if passed to `fit`).
callback = keras.callbacks.TensorBoard(
    update_freq="batch",  # How often to write logs (default: once per epoch)
    embeddings_freq=1,  # How often to log embedding visualizations
    histogram_freq=1,  # How often to log histogram visualizations
    log_dir="/content/gdrive/My Drive/logsRelu",
)

# MLP network: a 3-unit tanh layer on the 512-dimensional input followed by
# three 3-unit layers without activation. The last layer's outputs are treated
# as logits by the loss configured below.
model1 = models.Sequential([
    layers.Dense(3, activation='tanh', input_shape=(512,)),
    layers.Dense(3),
    layers.Dense(3),
    layers.Dense(3),
])
model1.summary()

model1.compile(
    # List of metrics to monitor
    metrics=['sparse_categorical_accuracy'],
    # Loss function to minimize (integer labels, raw logits)
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    # Optimizer
    optimizer=keras.optimizers.RMSprop(),
)
#relu
#F1 Score : 0.7718555624302187
#Accuracy Score : 0.768
#F1 Score : 0.7688335267842854
#Accuracy Score : 0.7715
#F1 Score : 0.7691153347965752
#Accuracy Score : 0.769
#tanh
#F1 Score : 0.7807127478931334
#Accuracy Score : 0.783
#F1 Score : 0.7757699844251622
#Accuracy Score : 0.7815
# 512-512-3
#F1 Score : 0.7806896514347366
#Accuracy Score : 0.7765
# Search space for the (optional) SVC grid search: polynomial kernel only,
# varying the regularization strength C and the polynomial degree.
param_grid = dict(
    C=[1, 10, 100],
    gamma=["scale"],
    degree=[2, 3, 4],
    kernel=['poly'],
)
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid, refit=True, verbose=3)

# The SVC configuration that is actually cross-validated later on.
clf = SVC(
    C=1,
    kernel="poly",
    degree=3,
    gamma="scale",
    break_ties=True,
    decision_function_shape='ovr',
)
# Candidate values for the LightGBM random search. Every entry is a list of
# candidates, including settings with a single allowed value: scikit-learn
# samples one element per entry, so a bare string such as "multiclass" would be
# sampled one character at a time.
params = {
    'n_estimators': [400, 700, 1000, 1500, 2000],
    'colsample_bytree': [0.7, 0.8],
    'max_depth': [10, 15, 20, 25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20],
    "objective": ["multiclass"],
    "metric": ["multi_error"],
    "learning_rate": [0.1, 0.2, 0.5],
    "min_data_in_leaf": [1, 5, 7, 10, 15, 20, 30],
}


def gridsearchlgbm(train_X, train_y, cv=10, param_grid=params):
    """Run a randomized hyper-parameter search for an LGBMClassifier.

    Entries of ``param_grid`` whose value is a plain scalar (string, number,
    bool or None) are treated as fixed settings and passed straight to the
    classifier. All other entries (lists or distributions) form the search
    space handed to ``RandomizedSearchCV``.

    Args:
        train_X: Training feature matrix.
        train_y: Training labels.
        cv: Cross-validation setting forwarded to ``RandomizedSearchCV``.
        param_grid: Mapping of parameter name to candidates or fixed value.

    Returns:
        Dict with the fixed settings plus the best sampled parameters, usable
        as ``lightgbm.LGBMClassifier(**result)``.
    """
    print('Classification')
    fixed_settings = {}
    search_space = {}
    for name, candidates in param_grid.items():
        if candidates is None or isinstance(candidates, (str, bytes, int, float, bool)):
            fixed_settings[name] = candidates
        else:
            search_space[name] = candidates

    # The base estimator only carries the fixed settings; the search sets the
    # sampled values on clones of it, so candidate lists never end up as
    # parameter values of the model itself.
    model = lightgbm.LGBMClassifier(**fixed_settings)
    print("grid")
    model_rsc = RandomizedSearchCV(model, search_space, cv=cv, verbose=3)  # if not working use np.random.RandomState(123)
    model_rsc = model_rsc.fit(train_X, train_y)
    best_param = {**fixed_settings, **model_rsc.best_params_}
    print(best_param)
    print("grid done")
    return best_param
# Disabled experiment: random-search a LightGBM classifier on the first 5000
# shuffled rows and score it on the remaining rows.
#search
#params = gridsearchlgbm(X_train2[:5000], y_train2[:5000])
#train with best params
#model = lightgbm.LGBMClassifier(**params)
# model = lightgbm.LGBMRegressor(**params)
#fit
#model.fit(X_train2[:5000], y_train2[:5000])
#predict
#preds = model.predict(X_train2[5000:])
# The report reads `preds`, which is only assigned by the disabled lines above.
# It is therefore disabled with them; left active it raised a NameError on a
# top-to-bottom run. Re-enable it together with the search.
#print("\nLightGBM")
#print(f"F1 Score : {f1_score(y_train2[5000:], preds, average='weighted')}")
#print(f"Accuracy Score : {accuracy_score(y_train2[5000:], preds)}")
# NOTE(review): SEARCH_PARAMS is not read anywhere in the visible notebook.
SEARCH_PARAMS = {'learning_rate': 0.4,
'max_depth': 15,
'num_leaves': 20,
'feature_fraction': 0.8,
'subsample': 0.2}
# Only 'num_boost_round' and 'early_stopping_rounds' are read from this dict
# (by the lightgbm.train call); the other entries are not used in the visible code.
FIXED_PARAMS={'objective': 'multiclass',
'metric': 'multi_error',
'is_unbalance': True,
'boosting':'gbdt',
'num_boost_round':600,
'early_stopping_rounds':50}
# Parameter dict actually passed to lightgbm.train: 3-class DART boosting
# evaluated with multi_logloss.
parameters = {
'objective': 'multiclass',
'metric': 'multi_logloss',
'is_unbalance': 'true',
'boosting': 'dart',
'max_depth': 26,
'num_leaves': 29,
'num_classes': 3,
'feature_fraction': 0.6,
'subsample': 0.2,
'learning_rate': 0.01,
'bagging_freq': 6,
# NOTE(review): this key looks like several parameter names run together
# (possibly max_drop / drop_rate) — confirm the intended key(s).
'iterationmax_dropdrop_rate': 0.3}
# LightGBM datasets: training split with uint8 class ids, validation split with
# labels encoded through label_to_int.
train_data = lightgbm.Dataset(X_train, label=y_train)
val_data = lightgbm.Dataset(X_val, label=label_to_int(y_val))
Training the model¶
Here, we will be training our model using the training set.
# Train the LightGBM booster on the training split and monitor the validation
# split under the name 'valid'. Early stopping is requested through the
# callback API: the `early_stopping_rounds` keyword of `lightgbm.train` was
# removed in LightGBM 4, while the callback works on both older and newer
# releases.
model_lgbm = lightgbm.train(
    parameters,
    train_data,
    num_boost_round=FIXED_PARAMS['num_boost_round'],
    valid_sets=[val_data],
    valid_names=['valid'],
    callbacks=[
        lightgbm.early_stopping(stopping_rounds=FIXED_PARAMS['early_stopping_rounds']),
    ],
)
# Validation multi_logloss recorded by the booster.
score = model_lgbm.best_score['valid']['multi_logloss']
model_lgbm.best_score  # notebook display
# Baseline classifiers: 7-nearest-neighbours, random forest and Gaussian naive Bayes.
model_knn = KNeighborsClassifier(n_neighbors=7)
model_rfc = RandomForestClassifier()
model_gnb = GaussianNB()
model_knn
# KNN: first 5000 rows of the L1-normalized, shuffled train+val pool.
model_knn.fit(X_train_scaled[:5000], y_train2[:5000])
# Random forest: training split with the original string labels, so it
# predicts label strings directly.
model_rfc.fit(X_train, y)
# Naive Bayes: training split with integer class ids.
model_gnb.fit(X_train, y_train)
# 10-fold cross-validation of the SVC (`clf`) on the normalized train+val pool.
# The fitted fold estimators are kept (return_estimator=True) because the
# submission step predicts with them.
scoring = {'Accuracy': make_scorer(accuracy_score), 'F1': make_scorer(f1_score,
average='weighted')}
cv=10
scores = cross_validate(clf, X_train_scaled, y_train2, cv=cv, scoring=scoring, return_estimator=True)
print(scores['test_F1'])
print(np.mean(scores['test_F1']))
print(scores['test_Accuracy'])
print(np.mean(scores['test_Accuracy']))
#grid.fit(X_train2, y_train2)
#grid.best_estimator_.get_params()
# MLP: trained on the first 5000 rows of the un-normalized train+val pool and
# validated on the remaining rows.
# NOTE(review): X_train2 contains the validation split, so a later evaluation
# of model1 on X_val can include rows it was trained on — confirm this is intended.
history = model1.fit(X_train2[0:5000],
y_train2[0:5000],
batch_size=100,
epochs=100,
validation_data=(X_train2[5000:], y_train2[5000:]),
# callbacks=[callback]
)
# Start TensorBoard
#%reload_ext tensorboard
#%tensorboard --logdir="/content/gdrive/My Drive"
Testing the Model¶
Here, we will evaluate our models using the validation set.
# KNN is scored on the rows of the normalized pool it was not fitted on.
y_pred_knn = model_knn.predict(X_train_scaled[5000:])

# The remaining models predict on the validation split. Numeric outputs are
# converted to label strings so they can be compared with `y_val`.
y_pred_rfc = model_rfc.predict(X_val)
y_pred_gnb = int_to_label(model_gnb.predict(X_val))
# LightGBM and the MLP return one score per class; take the best class per row.
y_pred_lgbm = int_to_label(np.argmax(model_lgbm.predict(X_val), axis=1))
y_pred_mlp = int_to_label(np.argmax(model1.predict(X_val), axis=1))

# Constant-prediction baselines, one per class, as long as the validation predictions.
y_pred_pos, y_pred_neu, y_pred_neg = (
    len(y_pred_rfc) * [constant_label]
    for constant_label in ('positive', 'neutral', 'negative')
)
y_pred_lgbm.shape
def _print_scores(title, y_true, y_pred):
    """Print the weighted F1 score and the accuracy of ``y_pred`` under ``title``."""
    print(title)
    print(f"F1 Score : {f1_score(y_true, y_pred, average='weighted')}")
    print(f"Accuracy Score : {accuracy_score(y_true, y_pred)}")


# KNN is scored on the held-out tail of the shuffled pool; every other model
# is scored against the validation labels.
_print_scores("KNeighborsClassifier", y_train2[5000:], y_pred_knn)
_print_scores("\nRandomForestClassifier", y_val, y_pred_rfc)
# The accuracy line here used the random-forest predictions by mistake; both
# metrics now use the naive-Bayes predictions.
_print_scores("\nGaussian Naive Bayes", y_val, y_pred_gnb)
_print_scores("\nLightGBM", y_val, y_pred_lgbm)

# SVC results come from cross-validation: mean first, then the per-fold values.
print("\nSVC")
print(f"F1 Score : {np.mean(scores['test_F1'])} (mean) {scores['test_F1']}")
print(f"Accuracy Score : {np.mean(scores['test_Accuracy'])} (mean) {scores['test_Accuracy']}")

_print_scores("\nMLP", y_val, y_pred_mlp)

# Constant-prediction baselines for reference.
_print_scores("\nOnly positive", y_val, y_pred_pos)
_print_scores("\nOnly neutral", y_val, y_pred_neu)
_print_scores("\nOnly negative", y_val, y_pred_neg)
Generating the Predictions¶
Generating Predictions from test data to make submission in the puzzle.
def mean_to_label(y):
    """Turn averaged class ids into sentiment labels using fixed thresholds.

    Values >= 1.4 become 'positive', values <= 0.6 become 'negative' and
    values strictly between the two become 'neutral'. An entry that satisfies
    none of these comparisons (e.g. NaN) prints "ERROR" and keeps the string
    form of the zero it was initialised with.

    Args:
        y: NumPy array of numeric scores (must expose ``.shape`` and indexing).

    Returns:
        Unicode string array shaped like ``y``.
    """
    labels = np.zeros(y.shape).astype("U", copy=False)
    for idx in range(len(y)):
        value = y[idx]
        if value >= 1.4:
            labels[idx] = 'positive'
            continue
        if value <= 0.6:
            labels[idx] = 'negative'
            continue
        if 0.6 < value < 1.4:
            labels[idx] = 'neutral'
            continue
        print("ERROR")
    return labels
def preds_to_label(y):
    """Combine several models' class-id predictions by majority vote.

    Args:
        y: Sequence of prediction vectors, one per model; converted to a 2-D
            array with one row per model and one column per sample.

    Returns:
        Array of label strings, one per sample. When the highest vote count
        is shared by more than one class the sample is set to class 1
        ('neutral') and the first of the tied classes is printed.
    """
    votes = np.array(y)
    print(votes.shape)
    majority = np.zeros(votes.shape[1])
    for col in range(majority.shape[0]):
        classes, tallies = np.unique(votes[:, col], return_counts=True)
        top = np.argmax(tallies)
        has_single_winner = np.count_nonzero(tallies == tallies[top]) == 1
        if has_single_winner:
            majority[col] = int(classes[top])
        else:
            majority[col] = 1
            print(int(classes[top]))
    return int_to_label(majority)
# Parse the test embeddings and apply the normalizer fitted on the train+val pool.
submission_embeddings = list(map(literal_eval, submission['embeddings'].values))
X_test_kbest = scaler.transform(submission_embeddings)  # Transforming the test split

# Predict with the SVC models fitted on cross-validation folds 5 to 9.
estimator_index_list = [5, 6, 7, 8, 9]
prediction_list = [
    scores['estimator'][fold].predict(X_test_kbest)
    for fold in estimator_index_list
]

# Ensemble by averaging the predicted class ids and rounding to the nearest id.
# NOTE(review): the mean of class ids is not a majority vote (e.g. votes
# 0, 0, 2, 2, 2 average to 1.2, i.e. 'neutral'); the disabled preds_to_label
# call below is the voting alternative — confirm which one is wanted.
##predictions = scores['estimator'][2].predict(X_test_kbest)
rounded = np.round(np.mean(prediction_list, axis=0))
predictions = int_to_label(rounded)
#predictions = preds_to_label(prediction_list)
predictions.shape

# Write the labels into the submission frame and preview the first 100.
submission['label'] = predictions
print(predictions[:100])
submission
from collections import Counter

# Print the distinct values and their counts, first for the training labels
# and then for the submitted predictions, to compare class balance.
for label_array in (y_train2, predictions):
    values, counts = np.unique(label_array, return_counts=True)
    print(values)
    print(counts)
Saving the Predictions¶
# Saving the predictions: recreate an empty `assets` directory and write the
# submission frame to assets/submission.csv.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column — confirm the grader accepts that layout.
!rm -rf assets
!mkdir assets
submission.to_csv(os.path.join("assets", "submission.csv"))
Submitting our Predictions¶
# Submit this notebook together with the files in `assets` to the
# sentiment-classification challenge.
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify
Congratulations on making your first submission in the puzzle 🎉. Let's continue the journey by improving the baseline and making more submissions! Don't be shy to ask questions about any errors you are getting, or doubts about any part of this notebook, in the discussion forum or on the AIcrowd Discord server; AIcrew will be happy to help you :)
Have a cool new idea that you want to see in the next blitz ? Let us know!
Content
Comments
You must login before you can post a comment.