Sentiment Classification
Solution for submission 174846
A detailed solution for submission 174846 submitted for challenge Sentiment Classification
Sentiment classification: sklearn models¶
In [125]:
%load_ext aicrowd.magic
In [126]:
%aicrowd login
In [127]:
# %aicrowd ds dl -c sentiment-classification -o data
Imports¶
In [128]:
import os
from ast import literal_eval
from collections import Counter
import pandas as pd
import neptune.new as neptune
import numpy as np
from scipy.stats import uniform
import optuna
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
In [129]:
# Neptune experiment-tracking configuration.
NEPTUNE_PROJECT = "deepsense-ai/AIcrowd"
# SECURITY FIX: never hardcode credentials in a notebook — the token was
# committed in plain text.  Read it from the environment instead
# (e.g. `export NEPTUNE_API_TOKEN=...` before launching Jupyter).
NEPTUNE_API = os.environ.get("NEPTUNE_API_TOKEN", "")
Data¶
In [130]:
# Load the three dataset splits shipped with the challenge data.
train_df, val_df, test_df = (
    pd.read_csv(f"data/{split}.csv") for split in ("train", "val", "test")
)
In [131]:
train_df.head()
Out[131]:
In [132]:
train_df.label.value_counts()
Out[132]:
In [133]:
# The embeddings column stores stringified Python lists; parse each row back
# into a numeric list with ast.literal_eval (safe — no arbitrary code execution).
def _parse_embeddings(frame):
    """Return the 'embeddings' column of `frame` as a list of lists."""
    return [literal_eval(raw) for raw in frame["embeddings"].values]

X_train = _parse_embeddings(train_df)
y_train = train_df["label"].values
X_valid = _parse_embeddings(val_df)
y_valid = val_df["label"].values
X_test = _parse_embeddings(test_df)
Standard Scaler¶
In [134]:
# Standardize features to zero mean / unit variance.  The scaler is fitted on
# the training split only, then applied unchanged to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled = scaler.transform(X_test)
Submission¶
In [135]:
def make_submission(y_test_pred):
submission = pd.DataFrame(
{
"embeddings": X_test,
"label": y_test_pred,
}
)
submission.to_csv(os.path.join("assets", "submission.csv"))
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify
Sklearn models¶
In [136]:
def objective(trial):
    """Optuna objective: sample a classifier (optionally preceded by PCA),
    fit it on the scaled training embeddings, and return validation accuracy.

    When a trial beats the global `best_score`, the run is logged to Neptune
    and its test-set predictions are submitted via make_submission().
    """
    global best_score
    # Optional PCA dimensionality reduction, applied consistently to all splits.
    X_train_scaled_reduced = X_train_scaled
    X_valid_scaled_reduced = X_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_scaled_reduced = pca.fit_transform(X_train_scaled)
        X_valid_scaled_reduced = pca.transform(X_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
    classifier_name = trial.suggest_categorical('classifier', ['mlp', 'svc'])  # 'knn'
    if classifier_name == 'svc':
        # BUG FIX: suggest_int requires integer bounds — 1e8 is a float.
        svc_c = trial.suggest_int('svc_c', 1, int(1e8))
        svc_degree = trial.suggest_int('svc_degree', 2, 11)
        svc_gamma = trial.suggest_float('svc_gamma', 1e-10, 1e1)
        params = {
            "svc_c": svc_c,
            "svc_degree": svc_degree,
            "svc_gamma": svc_gamma,
        }
        classifier = SVC(
            C=svc_c,
            degree=svc_degree,  # NOTE: degree only affects the 'poly' kernel
            gamma=svc_gamma,  # 'auto',
            random_state=42,
        )
    elif classifier_name == 'knn':
        knn_neighbors = trial.suggest_int('knn_neighbors', 1, 21)
        params = {
            "knn_neighbors": knn_neighbors,
        }
        classifier = KNeighborsClassifier(
            n_neighbors=knn_neighbors,
        )
    elif classifier_name == 'mlp':
        mlp_alpha = trial.suggest_float('mlp_alpha', 1e-10, 10)
        mlp_hidden_layer_sizes = trial.suggest_int('mlp_hidden_layer_sizes', 128, 1024)
        mlp_validation_fraction = trial.suggest_float('mlp_validation_fraction', 0.01, 0.2)
        params = {
            "mlp_alpha": mlp_alpha,
            "mlp_hidden_layer_sizes": mlp_hidden_layer_sizes,
            "mlp_validation_fraction": mlp_validation_fraction,
        }
        classifier = MLPClassifier(
            alpha = mlp_alpha,
            hidden_layer_sizes = mlp_hidden_layer_sizes,
            early_stopping = True,
            n_iter_no_change = 100,
            max_iter = 1000,
            validation_fraction = mlp_validation_fraction,
            random_state=42,
        )
    else:
        raise Exception("Wrong classifier name")
    classifier = classifier.fit(X_train_scaled_reduced, y_train)
    valid_accuracy = classifier.score(X_valid_scaled_reduced, y_valid)
    if valid_accuracy > best_score:
        print("SUBMISSION, valid/acc:", valid_accuracy)
        best_score = valid_accuracy
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "sklearn", "optuna"]
        )
        run["model"] = classifier_name
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["train/acc"] = classifier.score(X_train_scaled_reduced, y_train)
        run["valid/acc"] = valid_accuracy
        run.stop()
        # BUG FIX: predict on the (possibly PCA-reduced) test features.
        # When reduce_dim=True the model expects num_dim features, so
        # predicting on the full 512-dim X_test_scaled would crash.
        y_test_pred = classifier.predict(X_test_scaled_reduced)
        make_submission(y_test_pred)
    return valid_accuracy
In [137]:
best_score = 0.795
In [63]:
# Maximize validation accuracy over 10 Optuna trials.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
In [14]:
study.best_params
Out[14]:
Sklearn models with cross-validation¶
In [138]:
# Merge the train and validation splits; the stratified k-fold loop below
# supplies the held-out data instead of a fixed validation set.
train_valid_df = pd.concat([train_df, val_df])
X_train_valid = [literal_eval(raw) for raw in train_valid_df["embeddings"].values]
y_train_valid = train_valid_df["label"].values
In [139]:
X_train_valid_scaled = scaler.transform(X_train_valid)
In [140]:
Fold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
In [141]:
def objective_cv(trial):
    """Optuna objective with stratified k-fold cross-validation.

    Trains one MLP per fold on the combined train+validation data and
    returns the mean weighted F1 across folds.  When the mean beats the
    global `best_score`, the run is logged to Neptune and a majority-vote
    ensemble prediction of the fold models is submitted.
    """
    global best_score
    params = {
        "alpha": trial.suggest_float('alpha', 1e-10, 10),
        "hidden_layer_sizes": trial.suggest_int('hidden_layer_sizes', 128, 1024),
        "validation_fraction": trial.suggest_float('mlp_validation_fraction', 0.01, 0.2),
    }
    # optional PCA
    X_train_valid_scaled_reduced = X_train_valid_scaled
    X_test_scaled_reduced = X_test_scaled
    num_dim = 512
    reduce_dim = trial.suggest_categorical("reduce_dim", [False, True])
    if reduce_dim:
        num_dim = trial.suggest_int("num_dim", 32, 512)
        pca = PCA(n_components=num_dim)
        X_train_valid_scaled_reduced = pca.fit_transform(X_train_valid_scaled)
        X_test_scaled_reduced = pca.transform(X_test_scaled)
    # k-fold: train one model per fold and collect its weighted F1
    f1_scores = []
    models = []
    for trn_, val_ in tqdm(Fold.split(X_train_valid_scaled_reduced, y_train_valid)):
        fold_train_data = X_train_valid_scaled_reduced[trn_]
        fold_valid_data = X_train_valid_scaled_reduced[val_]
        fold_train_labels = y_train_valid[trn_]
        fold_valid_labels = y_train_valid[val_]
        # BUG FIX: sklearn ignores validation_fraction unless early_stopping=True,
        # so the tuned parameter had no effect.  Mirror the training settings used
        # in `objective` (and add random_state for reproducibility).
        model = MLPClassifier(
            **params,
            early_stopping=True,
            n_iter_no_change=100,
            max_iter=1000,
            random_state=42,
        )
        model.fit(fold_train_data, fold_train_labels)
        models.append(model)
        valid_pred = model.predict(fold_valid_data)
        f1 = f1_score(fold_valid_labels, valid_pred, average ='weighted')
        f1_scores.append(f1)
    mean_valid_f1 = np.mean(f1_scores)
    # neptune logging + submission for improved trials only
    if mean_valid_f1 > best_score:
        print("SUBMISSION, mean_valid_f1:", mean_valid_f1)
        # BUG FIX: raise the threshold (was commented out) so later, worse
        # trials above the initial threshold do not re-submit.
        best_score = mean_valid_f1
        run = neptune.init(
            project=NEPTUNE_PROJECT,
            api_token=NEPTUNE_API,
            tags=["sentiment_classification", "mlp", "optuna", "crossval"]
        )
        run["model"] = "mlp"
        run["parameters"] = params
        run["reduce_dim"] = reduce_dim
        run["num_dim"] = num_dim
        run["mean_valid_f1"] = mean_valid_f1
        run.stop()
        # Majority vote across the per-fold models.
        predictions = [model.predict(X_test_scaled_reduced) for model in models]
        y_test_pred = [
            Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
            for i in range(len(X_test_scaled_reduced))
        ]
        make_submission(y_test_pred)
    return mean_valid_f1
In [142]:
best_score = 0.795
In [ ]:
# Maximize mean cross-validated weighted F1 over 50 Optuna trials.
study_cv = optuna.create_study(direction='maximize')
study_cv.optimize(objective_cv, n_trials=50)
In [ ]:
In [ ]:
In [336]:
# Hand-tuned SVM baseline on the scaled embeddings.
# NOTE: with the default 'rbf' kernel the `degree` argument is ignored
# (it only applies to the 'poly' kernel); gamma is the active knob here.
classifier = SVC(
    C=20,
    degree=5,
    max_iter=-1,
    gamma=0.0004,
    random_state=42,
)
classifier = classifier.fit(X_train_scaled, y_train)
In [337]:
classifier.score(X_train_scaled, y_train)
Out[337]:
In [338]:
classifier.score(X_valid_scaled, y_valid)
Out[338]:
Random Forest¶
In [105]:
# Random-forest baseline; min_samples_leaf=20 regularizes the trees against
# overfitting the high-dimensional embeddings.
classifier = RandomForestClassifier(
    n_estimators = 200,
    max_depth = 50,
    min_samples_leaf = 20,
).fit(X_train_scaled, y_train)
In [106]:
classifier.score(X_train_scaled, y_train)
Out[106]:
In [107]:
classifier.score(X_valid_scaled, y_valid)
Out[107]:
In [117]:
# NOTE(review): this "ensemble" votes over the SAME fitted classifier three
# times, so the majority vote is identical to a single model's prediction —
# this looks like leftover scaffolding for a real multi-model ensemble.
# Also, despite its name, y_test_pred is computed on the VALIDATION set here.
predictions = []
for model in [classifier, classifier, classifier]:
predictions.append(model.predict(X_valid_scaled))
y_test_pred = [
Counter([pred[i] for pred in predictions]).most_common(1)[0][0]
for i in range(len(X_valid_scaled))
]
In [120]:
# classifier.score(X_valid_scaled, y_test_pred)
Neural Network¶
In [19]:
# Final model: train on train + validation combined (no held-out split remains).
X_train_valid_scaled = np.concatenate([X_train_scaled, X_valid_scaled])
y_train_valid = np.concatenate([y_train, y_valid])
In [23]:
# Final MLP with hyperparameters selected from the Optuna studies above.
classifier = MLPClassifier(
alpha = 0.5,
hidden_layer_sizes = 455,
early_stopping = True,
n_iter_no_change = 100,
max_iter = 1000,
validation_fraction = 0.02,  # internal hold-out used by early_stopping
random_state=42,
# verbose = True,
# ).fit(X_train_scaled, y_train)
).fit(X_train_valid_scaled, y_train_valid)
In [21]:
classifier.score(X_train_scaled, y_train)
Out[21]:
In [22]:
classifier.score(X_valid_scaled, y_valid)
Out[22]:
Validation¶
In [25]:
# Per-class precision / recall / F1 on the validation split.
# NOTE(review): the model above was trained on train+validation combined, so
# this report is optimistic — the validation rows were seen during training.
y_valid_pred = classifier.predict(X_valid_scaled)
print(classification_report(y_valid, y_valid_pred))
In [26]:
y_test_pred = classifier.predict(X_test_scaled)
In [27]:
# NOTE(review): this cell writes the SCALED embeddings, whereas
# make_submission() above writes the raw X_test lists — confirm which
# format the challenge grader actually expects.
submission = pd.DataFrame(
{
"embeddings": X_test_scaled.tolist(),
"label": y_test_pred,
}
)
Submission¶
In [28]:
submission.to_csv(os.path.join("assets", "submission.csv"))
In [29]:
%aicrowd notebook submit -c sentiment-classification -a assets --no-verify
In [ ]:
Content
Comments
You must login before you can post a comment.