Sentiment Classification
Sentiment Classification: SVM/LGBM/CatBoost/XGBC classifier
A notebook trying to solve this challenge with four methods (from a high-schooler in Vietnam^^)
Set up¶
In [ ]:
!pip install aicrowd-cli
%load_ext aicrowd.magic
In [ ]:
%aicrowd login
In [ ]:
# Start from an empty data/ directory so files from an earlier run cannot linger.
!rm -rf data
!mkdir data
# Fetch the files of the sentiment-classification challenge into data/ with the aicrowd CLI magic.
%aicrowd ds dl -c sentiment-classification -o data
Process the data¶
In [ ]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import ast
import time
In [ ]:
# Load the training and validation splits from the data/ directory.
train_df = pd.read_csv("data/train.csv")
val_df = pd.read_csv("data/val.csv")
In [ ]:
# Load the test split from the data/ directory.
test_df = pd.read_csv("data/test.csv")
In [ ]:
train_df.head()
Out[ ]:
In [ ]:
# Integer class id assigned to each sentiment name.
label_dict = {'positive': 1, 'negative': 0, 'neutral': 2}

# Every `embeddings` cell is parsed with ast.literal_eval, i.e. it is expected to be a
# Python literal (a list of numbers) stored as text; literal_eval never executes code.
# Iterating the columns directly avoids label-based `series[i]` lookups, which only
# work when the frame carries the default 0..n-1 index.
x_train = np.array([ast.literal_eval(cell) for cell in train_df.embeddings])
x_val = np.array([ast.literal_eval(cell) for cell in val_df.embeddings])

# Targets stay plain integer class ids (no one-hot encoding); an unknown label
# name raises KeyError here instead of slipping through silently.
y_train = [label_dict[name] for name in train_df.label]
y_val = [label_dict[name] for name in val_df.label]
SVM¶
In [ ]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score
In [ ]:
# Coarse, log-spaced candidates for the regularisation strength C and the kernel coefficient gamma.
C_values = [0.01, 0.1, 1, 10, 100]
gamma_values = [0.01, 0.1, 1, 10, 100]

# GridSearchCV accepts a list of grids and explores each of them:
#   * rbf kernel: every (gamma, C) combination;
#   * linear and poly kernels: C only, so gamma keeps SVC's default value for them.
rbf_search = dict(kernel=['rbf'], gamma=gamma_values, C=C_values)
linear_poly_search = dict(kernel=['linear', 'poly'], C=C_values)
param_grid = [rbf_search, linear_poly_search]

# Estimator with a fixed seed; 3-fold cross-validation with progress messages.
model = SVC(random_state=42)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1)

# Run the search on the training data; as the last expression of the cell,
# the fitted search object is what the notebook displays.
grid.fit(x_train, y_train)
Out[ ]:
In [ ]:
# Report the hyper-parameter combination that scored best in the grid search,
# then keep the corresponding estimator in `best_model`.
print(grid.best_params_)
best_model = grid.best_estimator_
In [ ]:
# Finer sweep restricted to the poly kernel: 100 evenly spaced C values between 0.1 and 10.
C_values = list(np.linspace(start=0.1, stop=10, num=100))
poly_search = dict(kernel=['poly'], C=C_values)

model = SVC(random_state=42)
grid = GridSearchCV(estimator=model, param_grid=poly_search, cv=3, verbose=1)

# NOTE(review): this rebinds `grid` but does not reassign `best_model`; read
# grid.best_estimator_ afterwards if the refined SVM should be the one submitted.
grid.fit(x_train, y_train)
Out[ ]:
LGBM/CatBoost/XGBC classifier¶
In [ ]:
!pip install catboost
In [ ]:
from sklearn.metrics import accuracy_score
In [ ]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
In [ ]:
lgb_params = {
    'objective': 'multiclass',
    'metric': 'multi_logloss',
    'device': 'cpu',
    # Quiet LightGBM through its own parameter: the `verbose` keyword of fit()
    # was removed in LightGBM 4.0, so passing it there raises TypeError on
    # current releases. The constructor parameter works on old and new versions.
    'verbose': -1,
}

# The timer deliberately spans training, validation prediction and scoring.
start_time = time.time()
model = LGBMClassifier(**lgb_params)
model.fit(x_train, y_train)
val_pred = model.predict(x_val)
acc = accuracy_score(y_val, val_pred)
run_time = time.time() - start_time

# Validation accuracy, followed by the elapsed wall-clock time.
print(acc)
print(f'Run time: {run_time:.2f}s')
In [ ]:
# Keep the model trained in the preceding cell as the submission candidate;
# any later cell that reassigns best_model overrides this choice.
best_model = model
In [ ]:
# CatBoost settings: multi-class objective, trained on the CPU.
catb_params = dict(objective="MultiClass", task_type="CPU")

# Time training, validation prediction and scoring as one unit.
start_time = time.time()
model = CatBoostClassifier(**catb_params)
model.fit(x_train, y_train, verbose=0)  # verbose=0 keeps the training log quiet
val_pred = model.predict(x_val)
acc = accuracy_score(y_val, val_pred)
run_time = time.time() - start_time

# Validation accuracy on the first line, elapsed time on the second.
print(acc, f'Run time: {run_time:.2f}s', sep='\n')
In [ ]:
# Keep the model trained in the preceding cell as the submission candidate;
# any later cell that reassigns best_model overrides this choice.
best_model = model
In [ ]:
# XGBoost settings: softmax multi-class objective evaluated with multi-class log loss.
# NOTE(review): `predictor` is deprecated in recent XGBoost releases — confirm
# against the installed version whether it is still honoured.
xgb_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    predictor='cpu_predictor',
)

# Time training, validation prediction and scoring as one unit.
start_time = time.time()
model = XGBClassifier(**xgb_params)
model.fit(x_train, y_train, verbose=0)
val_pred = model.predict(x_val)
acc = accuracy_score(y_val, val_pred)
run_time = time.time() - start_time

# Validation accuracy on the first line, elapsed time on the second.
print(acc, f"Run time: {run_time:.2f}s", sep='\n')
In [ ]:
# Keep the model trained in the preceding cell as the submission candidate.
# When the notebook is run top to bottom this is the last assignment to
# best_model, so this model is the one whose predictions get submitted.
best_model = model
Submit¶
In [ ]:
# Parse the text-encoded embeddings of the test split and stack them into an
# ndarray, the same format as the x_train / x_val matrices built earlier.
# Iterating the column directly also avoids label-based `series[i]` lookups.
x_test = np.array([ast.literal_eval(cell) for cell in test_df.embeddings])
In [ ]:
# Invert label_dict so integer class ids map back to sentiment names.
labels = {class_id: name for name, class_id in label_dict.items()}

# Predict with whichever model was last stored in best_model.
pred = best_model.predict(x_test)

# Quick sanity check: show the first ten predictions.
print(pred[:10])
In [ ]:
# Translate predicted class ids back to sentiment names.
# np.ravel flattens column-shaped output (CatBoost's multi-class predict returns
# an (n, 1) array, whose rows are unhashable and would break the dict lookup),
# and int() turns each NumPy scalar into a plain integer key.
results = [labels[int(class_id)] for class_id in np.ravel(pred)]
In [ ]:
# Attach the predicted sentiment names as a `label` column, then display the frame.
test_df['label'] = results
test_df
Out[ ]:
In [ ]:
# Recreate an empty assets/ directory and write the predictions into it.
!rm -rf assets
!mkdir assets
# NOTE(review): to_csv runs with its defaults, so the row index is written as an
# extra leading column next to every column of test_df — confirm the grader
# accepts that layout.
test_df.to_csv(os.path.join("assets", "submission.csv"))
In [ ]:
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify
Content
Comments
You must login before you can post a comment.