Programming Language Classification
Solution for submission 171996
A detailed solution for submission 171996, submitted for the challenge Programming Language Classification.
Getting Started with Programming Language Classification
In this puzzle, we have to identify the programming language of a given code snippet. Since the snippets are plain text, we first need to tokenize them before we can classify anything. In the process, we will learn more about tokenization and text-classification algorithms.
In this starter notebook:
For tokenization: We will use CountVectorizer and TfidfTransformer.
For Classification: We will use Multinomial Naive Bayes Classifier.
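Putting those two pieces together, a minimal baseline looks like the sketch below (train_texts, train_labels, and test_texts are illustrative placeholders, not variables defined later in this notebook):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes.
baseline = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", MultinomialNB()),
])
# baseline.fit(train_texts, train_labels)
# predictions = baseline.predict(test_texts)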
AIcrowd code utilities for downloading data for Language Classification
!pip install aicrowd-cli
%load_ext aicrowd.magic
!pip install autogluon
!pip install mxnet
!pip install guesslang
Login to AIcrowd
%aicrowd login
Download Dataset
We will create a folder named data and download the files there.
!rm -rf data
!mkdir data
%aicrowd ds dl -c programming-language-classification -o data
Importing Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score,accuracy_score,f1_score
from sklearn import set_config
set_config(display="diagram")
plt.rcParams["figure.figsize"] = (15,6)
Diving into the dataset 🕵️‍♂️
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")
from sklearn.preprocessing import LabelEncoder
LE = LabelEncoder().fit(train_df.language)
train_df["target"] = LE.transform(train_df.language)
Splitting the dataset
Here we will split our dataset into training, validation, and test sets.
X_train, X_comb, Y_train, Y_comb = train_test_split(train_df["code"], train_df["target"], stratify=train_df["target"], test_size=0.2, random_state=42, shuffle=True)
X_validation, X_test, Y_validation, Y_test = train_test_split(X_comb, Y_comb, test_size=0.5, random_state=0, shuffle=True)
X_train.shape,X_validation.shape,X_test.shape,Y_train.shape,Y_validation.shape,Y_test.shape
!pip install flair
!pip install transformers
!pip install -U numpy
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
from tqdm.notebook import tqdm
doc_e = []
# Candidate embedding models; only model_name3 is actually used below.
model_name = 'huggingface/CodeBERTa-small-v1'
model_name2 = 'microsoft/codebert-base-mlm'
model_name3 = "huggingface/CodeBERTa-language-id"
model_name4 = 'flax-community/gpt-neo-125M-code-clippy-dedup'
doc_embedding = TransformerDocumentEmbeddings(model_name3, pooling='cls', layer_mean=True)
# Embed each training snippet with the pretrained code model.
for d in tqdm(train_df["code"].values):
    sent = Sentence(d.strip())
    doc_embedding.embed(sent)
    doc_e.append(sent.embedding.detach().cpu().numpy())
test_doc_e = []
# Repeat for the test snippets.
for d in tqdm(test_df["code"].values):
    sent = Sentence(d.strip())
    doc_embedding.embed(sent)
    test_doc_e.append(sent.embedding.detach().cpu().numpy())
# Cache the embeddings to disk so they can be reloaded without recomputing.
np.save('./train.npy', doc_e)
np.save('./test.npy', test_doc_e)
doc_e = np.load('./train.npy')
test_doc_e = np.load('./test.npy')
!pip install catboost
test_df.code.values[7075]
from sklearn.ensemble import StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
estimators1 = [
    # ('catboost', CatBoostClassifier(n_estimators=200)),
    ('LR', LogisticRegression()),
    # ('KNN', make_pipeline(PCA(n_components=50), KNeighborsClassifier(n_neighbors=5))),
]
clf1 = StackingClassifier(
    estimators=estimators1, final_estimator=LogisticRegression()
)
estimators2 = [
    # ('catboost', CatBoostClassifier(n_estimators=200)),
    ('LR', LogisticRegression()),
    # ('KNN', make_pipeline(PCA(n_components=50), KNeighborsClassifier(n_neighbors=5))),
]
clf2 = StackingClassifier(
    estimators=estimators2, final_estimator=LogisticRegression()
)
# Stack a word-level and a character-level TF-IDF branch; the second branch
# uses clf2 (the original reused clf1 there, presumably a typo since clf2 was never used).
estimators = [
    ('pipe2', make_pipeline(CountVectorizer(analyzer='word', min_df=5, max_df=1500), TfidfTransformer(), clf1)),
    ('pipe1', make_pipeline(CountVectorizer(analyzer='char', ngram_range=(1, 2), min_df=10), TfidfTransformer(), clf2))
]
clf3 = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression(), verbose=3
)
clf3.fit(X_train,Y_train)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier
token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
# vectorizer = TfidfVectorizer(token_pattern=token_pattern, max_features=3000)
# NOTE: test_vect is fitted a few cells below; run that cell before this one.
classifier = Pipeline([('tfidf', TfidfVectorizer(vocabulary=test_vect.vocabulary_)), ('clf', RandomForestClassifier(random_state=0, max_depth=20))])
classifier = classifier.fit(train_df['code'], train_df['target'])
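The token pattern above splits code into identifiers, runs of operator characters, and single punctuation or whitespace characters (note that bare digits are not captured). A quick sanity check on an illustrative string:
import re

sample = "x += foo(bar, 2);"
print(re.findall(token_pattern, sample))
# -> ['x', ' ', '+=', ' ', 'foo', '(', 'bar', ',', ' ', ')', ';']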
lengths=np.array([len(c) for c in train_df.code])
train_df.loc[lengths<10].code.str.replace(" ","").value_counts()
train_df.loc[lengths<10].language.value_counts()
vals=[
'->','=>','++','+=','--','-=','<-','::',':=','&&',';','||','#','\\\\\\','\\\\','\\*','){','===','!==', '*','"""','\'\'\'','@param',
]
token_pattern = r"""([A-Za-z_]\w*\b|[!\#\$%\&\*\+:\-\./<=>\?@\\\^_\|\~]+|[ \t\(\),;\{\}\[\]"'`])"""
# Build a per-language vocabulary of frequent tokens.
for language in train_df.language.unique():
    sub_df = train_df[train_df.language == language].copy()
    min_df = 30 if sub_df.shape[0] > 1000 else 5
    min_df_chars = 10 if sub_df.shape[0] > 1000 else 5
    max_features = 300 if sub_df.shape[0] > 1000 else 100
    v1 = CountVectorizer(token_pattern=token_pattern, min_df=min_df, max_features=max_features)
    v1.fit(sub_df.code)
    vals += [t.strip() for t in list(v1.vocabulary_.keys())]
    print(language, len(v1.vocabulary_))
# Map each unique token to an integer index.
vocab = {k: v for v, k in enumerate(set(vals))}
vocab
test_vect=CountVectorizer(token_pattern=token_pattern,min_df=3,max_features=1500)
test_vect.fit(test_df['code'])
total_vocab=list(vocab.keys())+list(test_vect.vocabulary_.keys())
total_vocab={k:v for v,k in enumerate(set(total_vocab))}
len(total_vocab)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Despite the name, this is a plain CountVectorizer restricted to the custom vocabulary.
tfidf = CountVectorizer(vocabulary=vocab, token_pattern=token_pattern)
X = train_df.code.values
X = tfidf.fit_transform(X).toarray()
from guesslang import Guess
guess = Guess()
def to_prob(prob):
    # Keep only the probability from each (language, probability) pair.
    return np.array([p[1] for p in prob])
# Guess the language from code
language = guess.probabilities("""
% Quick sort
-module (recursion).
-export ([qsort/1]).

qsort([]) -> [];
qsort([Pivot|T]) ->
    qsort([X || X <- T, X < Pivot])
    ++ [Pivot] ++
    qsort([X || X <- T, X >= Pivot]).
""")
from tqdm.notebook import tqdm
probabilities = []
# Guesslang probability vector for every training snippet.
for c in tqdm(train_df.code):
    probabilities.append(to_prob(guess.probabilities(c)))
# Concatenate the transformer embeddings with the custom count features.
np.c_[np.array(doc_e), X]
df = pd.DataFrame(np.c_[np.array(doc_e), X])
X.shape,
df['target']=train_df['target']
df.to_csv("./train.csv",index=False)
train_df[['code','target']].to_csv('./train_df.csv',index=False)
from autogluon.core.utils.loaders.load_pd import load
train_data=load('./train.csv')
!pip install mxnet
from autogluon.text import TextPredictor
predictor = TextPredictor(label='target', eval_metric='acc',)
predictor.fit(train_data, time_limit=7200)
from autogluon.tabular import TabularDataset, TabularPredictor
train_data = TabularDataset("./train.csv")
predictor = TabularPredictor(label="target").fit(train_data,time_limit=10800)
predictor.leaderboard()
v = []
for val in list(test_vect.vocabulary_.keys()):
    v.append(val.strip())
len(set(v)), len(test_vect.vocabulary_)
Auto Keras
!pip install autokeras
import autokeras as ak
import tensorflow as tf
TRAIN_DATA_URL = "./train.csv"
# TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"
# train.csv is already a local file, so there is nothing for tf.keras.utils.get_file to download.
train_file_path = TRAIN_DATA_URL
# test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)
# Initialize the structured data classifier.
clf = ak.StructuredDataClassifier(
    overwrite=True, max_trials=20
)  # It tries up to 20 different models.
# Feed the structured data classifier with training data.
clf.fit(
    # The path to the train.csv file.
    TRAIN_DATA_URL,
    # The name of the label column.
    "target",
    epochs=20,
)
# Predict with the best model (requires test_file_path, whose download is commented out above).
predicted_y = clf.predict(test_file_path)
x_train = np.array(train_df.code)
y_train = np.array(train_df.target)
clf = ak.TextClassifier(
    overwrite=True, max_trials=2
)  # It tries up to 2 models as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train,y_train, epochs=5)
# Predict with the best model.
predicted_y = clf.predict(X_test)
!nvidia-smi
KMeans Repeated Runs Model
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans,AgglomerativeClustering,SpectralClustering,MiniBatchKMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import adjusted_rand_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
n_runs=20
true_k = 15
run_results=[]
vectorizer=TfidfVectorizer(vocabulary=vocab)
X=vectorizer.fit_transform(train_df['code'])
# mf=mutual_info_classif(X,train_df.target)
# feature_importances=np.argsort(mf)[::-1]
# X=X[:,feature_importances[:500]]
rands = []
# Run KMeans for k = 2 .. n_runs-1 and score each clustering against the true labels.
for i in tqdm(range(2, n_runs)):
    model = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=1000)
    X_pred = model.fit_predict(X)
    # enc = OneHotEncoder()
    # run_results.append(enc.fit_transform(X_pred.reshape(-1, 1)).toarray())
    rands.append(adjusted_rand_score(train_df.target.values, X_pred))
    # print(rands[i])
rands
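To see how cluster agreement varies with the number of clusters, a small plotting sketch using the matplotlib setup from the imports above:
# Adjusted Rand index for each k tried in the loop above.
plt.plot(range(2, n_runs), rands, marker='o')
plt.xlabel('number of clusters k')
plt.ylabel('adjusted Rand index')
plt.show()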
from sklearn.neural_network import MLPClassifier
classifier = MLPClassifier(verbose=True, hidden_layer_sizes=(1000,), learning_rate='adaptive', early_stopping=True, max_iter=100)
classifier.fit(np.array(doc_e),train_df['target'])
import pickle

with open('model2.pkl', 'wb+') as f:
    pickle.dump(classifier, f)
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
classifier = Pipeline([('vect', PCA(n_components=50)), ('clf', CatBoostClassifier(n_estimators=1000))])
classifier = classifier.fit(np.array(doc_e).astype(np.float32), train_df["target"])
from sklearn.tree import DecisionTreeClassifier

# NOTE: X_train must be numeric features here (e.g. the embeddings), not the raw code text from the split above.
classifier = DecisionTreeClassifier()
classifier.fit(X_train, Y_train)
# NOTE: this lookup assumes a pipeline whose 'vect' step is a vectorizer with a vocabulary_;
# neither the PCA pipeline nor the decision tree above has one.
classifier['vect'].vocabulary_
from skorch import NeuralNetClassifier
from torch import nn
# X = np.array(X_train).astype(np.float32)
# y = Y_train.astype(np.int64)
class MyModule(nn.Module):
    def __init__(self, num_units=10, nonlin=nn.ReLU()):
        super(MyModule, self).__init__()
        self.output = nn.Linear(768, 256)
        self.hidden = nn.Linear(256, 256)
        self.hidden2 = nn.Linear(256, 15)
        self.nonlin = nn.Tanh()
        # NeuralNetClassifier's default criterion is NLLLoss, which expects
        # log-probabilities, so LogSoftmax is used here rather than Softmax.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.output(X))
        X = self.nonlin(self.hidden(X))
        X = self.softmax(self.hidden2(X))
        return X
net = NeuralNetClassifier(
MyModule,
max_epochs=150,
lr=0.003,
# Shuffle training data on each epoch
iterator_train__shuffle=True,
)
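X_res and Y_res are not created anywhere in this notebook; they presumably come from the RandomOverSampler imported earlier. A minimal sketch of that missing step (dtypes chosen to match what skorch expects, per the commented-out fit call below):
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by oversampling the document embeddings.
ros = RandomOverSampler(random_state=0)
X_res, Y_res = ros.fit_resample(
    np.array(doc_e).astype(np.float32),
    train_df["target"].astype(np.int64).values,
)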
net.fit(X_res,Y_res)
# net.fit(np.array(doc_e).astype(np.float32), train_df["target"])
# y_proba = net.predict_proba(X)
from sklearn.linear_model import LogisticRegression

# NOTE: as above, X_train must be numeric features for this fit to work.
net2 = LogisticRegression()
net2.fit(X_train,Y_train)
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
vect=TfidfVectorizer(vocabulary=vocab)
X=vect.fit_transform(X_train)
Feature Engineering
def line_starts(text):
    # Sketch: classify lines by their leading comment/markup token (branch bodies not implemented yet).
    processed = text.replace("\t", '').split("\\n")
    for line in processed:
        line = line.strip()
        if line.startswith("#"):
            pass
        elif line.startswith('*'):
            pass
        elif line.startswith('\\*'):
            pass
        elif line.startswith('\'\'\''):
            pass
        elif line.startswith('\\\\'):
            pass
        elif line.startswith('\\\\\\'):
            # Unreachable: any line matching '\\\\\\' already matches the '\\\\' branch above.
            pass

def token_in_string(text):
    # Sketch: intended to scan each line for the special tokens in vals (body not written yet).
    processed = text.replace("\t", '').split("\\n")
    for line in processed:
        pass
from sklearn.metrics import balanced_accuracy_score
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline
# X_v=vect.transform(X_validation)
print("F1:" ,f1_score(Y_validation,classifier.predict(X_validation),average='macro'))
print("Accuracy:" ,accuracy_score(Y_validation,classifier.predict(X_validation))*100)
print("Accuracy:" ,balanced_accuracy_score(Y_validation,classifier.predict(X_validation))*100)
print("F1:" ,f1_score(Y_test,classifier.predict(X_test),average='macro'))
print("Accuracy:" ,accuracy_score(Y_test,classifier.predict(X_test))*100)
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(classifier, X_train, Y_train)
train_df[train_df.language=='python']
Prediction Phase ✈
from tqdm.notebook import tqdm
test_probs = []
# Guesslang probability vector for every test snippet (the original appended to `probabilities` by mistake).
for c in tqdm(test_df.code):
    test_probs.append(to_prob(guess.probabilities(c)))
# The next few cells show alternative prediction paths (guesslang features, AutoGluon, AutoKeras);
# use the one matching the model you actually trained.
test_df['target'] = net.predict(np.array(test_probs))
X_test = tfidf.transform(test_df.code).toarray()  # vocabulary is fixed, so transform (not fit_transform) is enough
pd.DataFrame(np.c_[np.array(test_doc_e),X_test]).to_csv('./test.csv')
test_df['target'] = predictor.predict(TabularDataset('./test.csv'))
test_df['target']=clf.predict(np.array(test_df.code))
test_df.target
test_df["prediction"] = LE.inverse_transform(test_df.target.astype(int))
test_df.prediction.value_counts()
test_df.prediction.value_counts()/test_df.shape[0]
test_df["prediction"] = LE.inverse_transform(test_df.target)
test_df.target.unique()
Generating Prediction File
test_df = test_df.sample(frac=1)  # shuffle only to eyeball a random sample below
test_df.head(20)
n = 0
# Count and print the very short test snippets.
for c in test_df.code.values:
    if len(c) < 10:
        print(c)
        print('----------')
        n += 1
n
# Inspect snippets predicted as Python that contain tab characters.
for c in test_df[test_df.prediction == 'python'].code.values:
    # print(len(c))
    # if len(c) > 10 and len(c) < 100:
    #     print(c)
    #     print('----------')
    if "\t" in c:
        print(c)
!rm -rf assets
!mkdir assets
test_df.to_csv(os.path.join("assets", "submission.csv"))
test_df['length']=test_df.code.apply(lambda x: len(x.strip()))
test_df[test_df.length<15]
test_df.shape
Submitting our Predictions
Note: Please save the notebook before submitting it (Ctrl + S).
%aicrowd notebook submit -c programming-language-classification -a assets --no-verify
test_df.to_csv('./out1.csv')
probs=predictor.predict_proba(TabularDataset('./test.csv'))
probs.values[18]
classifier.predict_proba(test_df['code'][test_df.id==10489])
test_df.loc[test_df.id==10489,'code']
# Take row [0] before sorting: the original argsort-ed the whole 2D array and
# reversed its rows, which is a no-op for a single-row matrix.
clf_vec = classifier['tfidf'].transform(test_df['code'][test_df.id == 10489]).toarray()[0]
f = np.argsort(clf_vec)[::-1]  # feature indices, highest TF-IDF weight first
mylist = sorted(classifier['tfidf'].vocabulary_, key=classifier['tfidf'].vocabulary_.get)
print(np.array(mylist)[f[:50]])
for feat, imp in zip(np.array(mylist)[f[:50]], clf_vec[f[:50]]):
    print(feat, imp)
# The same computation again, broken into steps:
clf_pred = classifier['tfidf'].transform(test_df['code'][test_df.id == 10489]).toarray()[0]
mylist = sorted(classifier['tfidf'].vocabulary_, key=classifier['tfidf'].vocabulary_.get)
f = np.argsort(clf_pred)[::-1]
myslice = f[:50]
clf_pred[myslice]
np.array(mylist)[myslice]
from sklearn.feature_selection import mutual_info_classif

mf = mutual_info_classif(test_vect.transform(train_df['code']), train_df.target)
# Rank vocabulary terms by mutual information with the target
# (a dict cannot be indexed with an array, so build the ordered term list first).
terms = np.array(sorted(test_vect.vocabulary_, key=test_vect.vocabulary_.get))
terms[np.argsort(mf)[::-1]]
predictor.leaderboard()