Speaker Identification
Solution for submission 172136
A detailed solution for submission 172136, submitted for the Speaker Identification challenge
Getting Started with Speaker Identification
In this puzzle, we have to cluster together the sentences spoken by the same speaker.
In this starter notebook:
For tokenization: we will use TfidfVectorizer.
For clustering: we will use KMeans.
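As a rough baseline, the two pieces fit together like the minimal sketch below. This is not the full solution developed later in the notebook; it assumes a test_df with a sentence column, as loaded further down, and uses 10 clusters (one per speaker, as in the final model).

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Baseline sketch: TF-IDF features per sentence, then KMeans with one cluster per speaker
vectorizer = TfidfVectorizer(stop_words='english')
X_baseline = vectorizer.fit_transform(test_df.sentence)

kmeans = KMeans(n_clusters=10, init='k-means++', n_init=10, random_state=0)
baseline_labels = kmeans.fit_predict(X_baseline)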
In [1]:
!pip install aicrowd-cli
%load_ext aicrowd.magic
Login to AIcrowd
In [2]:
%aicrowd login
Download Dataset
We will create a folder named data and download the files there.
In [3]:
!rm -rf data
!mkdir data
%aicrowd ds dl -c speaker-identification -o data
In [4]:
import re,os
import pandas as pd
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
In [133]:
test_df = pd.read_csv("data/test.csv")
In [134]:
sub_df = pd.read_csv("data/sample_sub.csv")
In [135]:
# Remove punctuation, replace newlines with spaces, and lowercase every sentence
test_df.sentence = test_df.sentence.apply(lambda x: re.sub(r'[,\.!?]', '', x))
test_df.sentence = test_df.sentence.apply(lambda x: x.lower())
test_df.sentence = test_df.sentence.apply(lambda x: x.replace("\n", " "))
In [ ]:
# Preview the cleaned sentences
test_df.head()
In [ ]:
long_string = ','.join(list(test_df.sentence.values))
# Create a WordCloud object
wordcloud = WordCloud(background_color="silver", max_words=1000, contour_width=3, contour_color='steelblue')
# Generate a word cloud
wordcloud.generate(long_string)
# Visualize the word cloud
wordcloud.to_image()
Out[ ]:
In [ ]:
!pip install flair
!pip install transformers
In [ ]:
!pip install sentence-transformers
In [ ]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(test_df.sentence, show_progress_bar=True)
In [18]:
!pip install transformers
In [19]:
!pip install textstat
In [114]:
!pip install hdbscan
In [ ]:
!pip install umap-learn
In [ ]:
import umap

# Reduce the sentence embeddings to 50 dimensions with UMAP (cosine distance)
umap_embeddings = umap.UMAP(n_neighbors=15,
                            n_components=50,
                            metric='cosine').fit_transform(embeddings)
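Note that the next cell fits HDBSCAN on X rather than on umap_embeddings. If you instead wanted to cluster the UMAP-reduced embeddings, a minimal sketch would look like this (same min_cluster_size as below; Euclidean distance is an assumption, not taken from the original notebook):

import hdbscan

# Sketch only: density-based clustering on the UMAP-reduced sentence embeddings
umap_cluster = hdbscan.HDBSCAN(min_cluster_size=17,
                               metric='euclidean',
                               cluster_selection_method='eom').fit(umap_embeddings)
umap_labels = umap_cluster.labels_  # -1 marks noise points in HDBSCAN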
In [130]:
import hdbscan

# Density-based clustering on the feature matrix X
cluster = hdbscan.HDBSCAN(min_cluster_size=17,
                          metric='l1',
                          cluster_selection_method='eom').fit(X)
In [131]:
import numpy as np
np.unique(cluster.labels_)
Out[131]:
In [ ]:
from flair.embeddings import TransformerDocumentEmbeddings
from flair.data import Sentence
from tqdm.notebook import tqdm

# Compute a RoBERTa document embedding for every sentence
doc_e = []
doc_embedding = TransformerDocumentEmbeddings('roberta-large')
for d in tqdm(test_df.sentence):
    sent = Sentence(d)
    doc_embedding.embed(sent)
    doc_e.append(sent.embedding.detach().cpu().numpy())
In [21]:
import textblob
import textstat
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
In [22]:
test_df.to_csv('./speakers.csv')
In [45]:
from tqdm.notebook import tqdm
import textblob
import textstat

# result = classifier("I hate you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# result = classifier("I love you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

# Hand-crafted stylometric features per sentence: sentiment sign and score, readability,
# difficult-word count, TextBlob polarity/subjectivity, '!'/'?' counts, pronoun counts, digit count
def featurize(sentence):
    result = classifier(sentence)[0]
    feeling = 1 if result['label'] == 'POSITIVE' else -1
    feeling_intensity = result['score']
    ari = textstat.automated_readability_index(sentence)
    avg_wd = textstat.difficult_words(sentence)
    testimonial = textblob.TextBlob(sentence)
    exclamation = sentence.count('!')
    question = sentence.count('?')
    i = sentence.count(' i ')
    you = sentence.count(' you ')
    we = sentence.count(' we ')
    they = sentence.count(' they ')
    numbers = sum([1 for t in sentence if t.isnumeric()])
    return [feeling, feeling_intensity, ari, avg_wd, testimonial.sentiment.polarity,
            testimonial.sentiment.subjectivity, exclamation, question, i, you, we, they, numbers]

features = [featurize(sentence) for sentence in tqdm(test_df.sentence)]
In [182]:
speaker1 = [
    'oh my goodness', 'fellow', 'károly', 'two minute papers', 'what a time', 'on to your papers', 'wow', 'down the line'
]
speaker2 = ['confidence interval', 'this video is brought']
speaker3 = ['python', 'numpy', 'scikit', 'tensorflow', 'pytorch', 'keras']
speaker4 = [' um ']
speaker5 = ['3b1b', 'theorem']
sents_speaker1 = []
for idx, sent in enumerate(test_df.sentence):
    if any(p in sent for p in speaker1):
        sents_speaker1.append(idx)
In [187]:
len(sents_speaker1)
Out[187]:
In [136]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams to trigrams
vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=1500, ngram_range=(1, 3), max_features=3000)
X = vectorizer.fit_transform(test_df.sentence)
In [31]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler
import numpy as np
lda=LatentDirichletAllocation()
X=lda.fit_transform(MinMaxScaler(feature_range=(0,1)).fit_transform(np.array(features)))
In [80]:
X
Out[80]:
In [ ]:
# LatentDirichletAllocation has no predict(); take the most likely topic per sentence instead
classes = lda.transform(MinMaxScaler(feature_range=(0, 1)).fit_transform(np.array(features))).argmax(axis=1)
In [42]:
len(vectorizer.vocabulary_)
Out[42]:
In [ ]:
!pip install networkx==2.6.2
In [ ]:
from sklearn.decomposition import TruncatedSVD
lsi=TruncatedSVD(n_components=300)
X=lsi.fit_transform(X)
In [ ]:
!pip install karateclub
In [15]:
!pip install h2o
In [ ]:
import pandas as pd
df=pd.DataFrame(X.toarray())
df
In [17]:
predictors=df.columns
df.to_csv('train.csv')
In [35]:
import h2o
from h2o.estimators import H2OKMeansEstimator

h2o.init()

# Import the saved feature table into H2O (variable names follow the H2O K-Means docs example):
iris = h2o.import_file("./train.csv")

# Set the predictors:
# predictors = ["sepal_len", "sepal_wid", "petal_len", "petal_wid"]

# Split the dataset into a train and valid set:
train, valid = iris.split_frame(ratios=[.95], seed=1234)

# Build and train the model:
iris_kmeans = H2OKMeansEstimator(k=10,
                                 estimate_k=False,
                                 standardize=True,
                                 max_iterations=10000,
                                 seed=1234)
iris_kmeans.train(x=predictors.values.tolist(),
                  training_frame=train,
                  validation_frame=valid)

# Eval performance:
perf = iris_kmeans.model_performance()

# Generate predictions on a validation set (if necessary):
pred = iris_kmeans.predict(valid)
Encoded KMeans Runs
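The idea in this section: run KMeans many times with different random seeds, one-hot encode each run's cluster assignments, keep only the better-scoring runs, and stack the encodings side by side into a binary "consensus" matrix, so sentences that repeatedly land in the same cluster end up with similar rows; the final KMeans is then fit on that matrix. A tiny, purely hypothetical illustration (toy labels, not from the dataset):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Hypothetical toy example: 4 sentences, 2 KMeans runs with 3 clusters each
run1 = np.array([0, 0, 2, 1])   # labels from run 1
run2 = np.array([1, 1, 0, 2])   # labels from run 2 (different seed)

enc = OneHotEncoder()
consensus = np.hstack([enc.fit_transform(r.reshape(-1, 1)).toarray() for r in (run1, run2)])
print(consensus)  # sentences 0 and 1 agree in both runs, so their rows are identical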
In [19]:
from sklearn.preprocessing import StandardScaler
scl=StandardScaler()
X=scl.fit_transform(np.array(features))
In [44]:
X=np.c_[X.toarray(),np.array(features)]
In [235]:
seeds=np.arange(1,1001,2)
Out[235]:
In [ ]:
from tqdm.notebook import tqdm
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, MiniBatchKMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
import random
from collections import Counter
import numpy as np

n_runs = 500
true_k = 10
run_results = []
run_scores = []
seeds = np.arange(1, 1001, 2)
# X=np.array(features)
for i in tqdm(range(n_runs)):
    model = KMeans(n_clusters=10, init='k-means++', n_init=10, max_iter=1000, random_state=seeds[i])
    X_pred = model.fit_predict(X.toarray())
    # print(Counter(X_pred))
    # One-hot encode this run's cluster assignments and record its Davies-Bouldin score
    enc = OneHotEncoder()
    run_results.append(enc.fit_transform(X_pred.reshape(-1, 1)).toarray())
    run_scores.append(davies_bouldin_score(X.toarray(), X_pred))
    print(run_scores[i])
In [ ]:
import numpy as np
np.sort(run_scores)
In [277]:
import numpy as np
final_X=np.stack([x for x,xdash in zip(run_results,run_scores) if xdash<np.quantile(run_scores,.55) ],axis=1)
final_X=final_X.reshape(-1,final_X.shape[1]*final_X.shape[2])
final_X.shape
Out[277]:
Generating Predictions
Clustering using K-Means on the consensus matrix final_X.
In [178]:
from sklearn.ensemble import RandomTreesEmbedding
trans=RandomTreesEmbedding(max_depth=4,n_estimators=1000)
transformed=trans.fit_transform(X)
In [ ]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering, MiniBatchKMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score

# Elbow plot plus internal cluster-quality metrics for k = 2..16
inertias = []
for i in range(2, 17):
    model = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=1000)
    model.fit(final_X)
    labels = model.predict(final_X)
    print(i, davies_bouldin_score(final_X, labels), silhouette_score(final_X, labels), calinski_harabasz_score(final_X, labels))
    inertias.append(model.inertia_)
plt.plot(inertias)
plt.yscale('log')
In [39]:
X=np.argmax(X,axis=1)
In [278]:
submission = test_df
In [279]:
model = KMeans(n_clusters=10, init='k-means++',n_init=10, max_iter=1000,random_state=1)
submission['prediction'] = model.fit_predict(final_X)
In [280]:
from collections import Counter

# Speaker 1 introduces himself as Dr. Károly and exhibits high emotion/amazement (easy to distinguish)
speaker1 = [
    'oh my goodness', 'fellow', 'károly', 'two minute papers', 'what a time', 'on to your papers', 'down the line', 'excellent'
]
# Speaker 2: many occurrences of "confidence interval"
speaker2 = ['confidence interval']
# Speaker 3: mentions of Python libraries and models
speaker3 = ['python', 'numpy', 'scikit', 'tensorflow', 'pytorch', 'keras']
# Speaker 5: '3b1b' quotes in the dataset
speaker5 = ['3b1b']
# For each keyword group, relabel all matching sentences to the group's most common predicted cluster
for speaker in [speaker1, speaker2, speaker3, speaker5]:
    sents_speaker = []
    for idx, sent in enumerate(test_df.sentence):
        if any(p in sent for p in speaker):
            sents_speaker.append(idx)
    submission.loc[sents_speaker, 'prediction'] = Counter(submission.iloc[sents_speaker].prediction.values).most_common(1)[0][0]
In [ ]:
!rm -rf assets
!mkdir assets
submission.to_csv(os.path.join("assets", "submission.csv"))
Submitting our Predictions
Note: Please save the notebook before submitting it (Ctrl + S).
In [ ]:
%aicrowd notebook submit -c speaker-identification -a assets --no-verify
In [ ]: