Lingua Franca Translation
Solution for submission 172021
A detailed solution for submission 172021, submitted for the Lingua Franca Translation challenge.
Getting Started with Lingua Franca Translation
In this puzzle, we have to translate from the crowd-talk language into English. There are multiple ways to build the language translator:
- Using a dictionary and mapping
- Using LSTMs
- Using Transformers
In this starter notebook, we'll go with the dictionary-and-mapping approach: we'll create a dictionary of words for both English and the crowd-talk language. A minimal sketch of the idea is shown below.
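As a rough, hypothetical illustration of the dictionary-and-mapping idea (the names toy_dict and translate_sentence are invented for this sketch; the actual dictionary is learned from the training data later in the notebook):

# A minimal sketch of word-level dictionary translation (illustrative only).
toy_dict = {"krr": "the", "blub": "cat", "zmm": "sleeps"}

def translate_sentence(sentence, mapping, unknown="<unk>"):
    # Look up each crowd-talk token; fall back to `unknown` when the token is missing.
    return " ".join(mapping.get(w, unknown) for w in sentence.split())

print(translate_sentence("krr blub zmm", toy_dict))  # -> "the cat sleeps"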
In [1]:
%%capture
!pip install aicrowd-cli
%load_ext aicrowd.magic
Login to AIcrowd
In [2]:
%aicrowd login
Download Dataset
We will create a folder named data and download the files there.
In [3]:
!rm -rf data
!mkdir data
%aicrowd ds dl -c lingua-franca-translation -o data
Importing Necessary Libraries
In [4]:
import os
import pandas as pd
import gensim
from sklearn.metrics.pairwise import cosine_similarity
In [21]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
Out[21]:
Diving into the dataset:
In [6]:
train_df = pd.read_csv("data/train.csv")
In [7]:
test_df = pd.read_csv("data/test.csv")
In [8]:
from gensim.models import Phrases
from gensim.models import Word2Vec
# Train a bigram detector.
my_sents=[s.split(" ") for s in train_df.crowdtalk]
bigram_transformer = Phrases(my_sents,min_count=3)
# Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
model = Word2Vec(bigram_transformer[my_sents], min_count=1)
In [9]:
my_sents=[s.split(" ") for s in train_df.crowdtalk]
In [10]:
bi_words=set([s.decode('utf-8') for s in list(bigram_transformer.vocab.keys())])
In [85]:
len(bi_words)
Out[85]:
In [96]:
crowd_words=[]
lengths=[]
for s in train_df.crowdtalk.values:
    words = s.split(" ")
    lengths.append(len(words))
    for w in words:
        crowd_words.append(w)
In [97]:
my_test=[s.split(" ") for s in test_df.crowdtalk]
test_set=[]
for s in bigram_transformer[my_test]:
    for w in s:
        test_set.append(w)
In [12]:
not_present=[]
present=[]
for word in test_set:
    if word not in bi_words:
        not_present.append(word)
    else:
        present.append(word)
In [19]:
len(not_present)
Out[19]:
In [11]:
words=[[word.lower() for word in nltk.word_tokenize(s) if word.isalnum()] for s in train_df.english.values]
english_words=[]
for s in words:
    for w in s:
        english_words.append(w)
english_words=set(english_words)
In [83]:
len(english_words)
Out[83]:
In [12]:
crowd_indices={v:k for k,v in enumerate(bi_words)}
english_indices={v:k for k,v in enumerate(english_words)}
In [13]:
reverse_english_idx={v:k for k,v in english_indices.items()}
In [29]:
import numpy as np
crowd_to_english_mat=np.zeros((len(english_words),len(bi_words)))
In [ ]:
words
In [30]:
my_sents=[s.split(" ") for s in train_df.crowdtalk]
for crowd_sentence, english_sentence in zip(my_sents, words):
    bi_sent = bigram_transformer[crowd_sentence]
    for wi, word in enumerate(bi_sent):
        try:
            word_idx = crowd_indices[word]
            ewords_idx = [english_indices[eword] for eword in english_sentence]
            # Count a co-occurrence when the crowd-talk token and English word are within one position of each other.
            for ei, eidx in enumerate(ewords_idx):
                if abs(wi - ei) < 2:
                    crowd_to_english_mat[eidx, word_idx] += 1
            # crowd_to_english_mat[ewords_idx, word_idx] += 1
        except KeyError:
            pass
In [102]:
crowd_to_english_mat.shape
Out[102]:
In [35]:
from sklearn.preprocessing import StandardScaler
scl=StandardScaler(copy=False)
In [16]:
for i in range(crowd_to_english_mat.shape[0]):
    crowd_to_english_mat[i, crowd_to_english_mat[i, :] != crowd_to_english_mat[i, :].max()] = 0
In [22]:
for w in stopwords.words('english'):
    if w in english_words:
        i = english_indices[w]
        crowd_to_english_mat[i, crowd_to_english_mat[i, :] != np.max(crowd_to_english_mat[i, :])] = 0
In [31]:
translation_dict={}
crowd_to_english_mat/=(np.mean(crowd_to_english_mat, axis=1).reshape(-1,1)+1)
for word in bi_words:
    word_idx = crowd_indices[word]
    max_trans_idx = np.argmax(crowd_to_english_mat[:, word_idx])
    translation = reverse_english_idx[max_trans_idx]
    translation_dict[word] = translation
In [143]:
len(stopwords.words('english'))
Out[143]:
In [32]:
import nltk.translate.bleu_score as bleu
bleues=[]
sentences=[]
for i in range(len(train_df.crowdtalk.values)):
    reference_trans = [train_df.english[i].lower().split(" ")]
    candidate = [translation_dict[w] for w in bigram_transformer[my_sents[i]]]
    sentences.append(" ".join(candidate))
    score = bleu.sentence_bleu(reference_trans, candidate)
    bleues.append(score)
    if score < .3:
        print(" ".join(reference_trans[0]), "||||", " ".join(candidate))
In [ ]:
In [183]:
pd.DataFrame({"english":train_df.english,"translated":sentences}).to_csv('inspection.csv')
In [33]:
np.mean(bleues)
Out[33]:
In [ ]:
In [34]:
reference_trans=["i went to the park".lower().split(" ")]
candidate2='i went the to park'.split()
bleu.sentence_bleu(reference_trans,candidate2)
Out[34]:
In [ ]:
train_df.crowdtalk[2]
Out[ ]:
In [ ]:
[translation_dict[w] for w in bigram_transformer[my_sents[1]]],[train_df.english[1].lower().split(" ")]
In [ ]:
[translation_dict[w] for w in bigram_transformer[my_sents[0]]]
Out[ ]:
In [ ]:
for p in bigram_transformer.export_phrases(train_df.crowdtalk.values):
    print(p)
In [ ]:
bigram_transformer[my_sents[2]]
Out[ ]:
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=3,max_features=100)
cv_fit=cv.fit_transform(train_df.crowdtalk)
In [ ]:
print()
In [ ]:
print()
for k,v in zip(cv.get_feature_names(),cv_fit.toarray().sum(axis=0)):
    print(k, v)
In [ ]:
model.train(bigram_transformer[my_sents], total_examples=len(train_df.crowdtalk.values), epochs=3)
Out[ ]:
In [ ]:
model.vocabulary.raw_vocab
Out[ ]:
In [ ]:
train_df
Out[ ]:
In [87]:
crowd_words=[]
lengths=[]
for s in train_df.crowdtalk.values:
    words = s.split(" ")
    lengths.append(len(words))
    for w in words:
        crowd_words.append(w)
In [88]:
len(set(crowd_words))
Out[88]:
In [ ]:
import jellyfish
from tqdm.notebook import tqdm
mins=[]
crowd_vocab=list(set(crowd_words))
for word in tqdm(not_present):
    matches = [jellyfish.levenshtein_distance(word, b_token) for b_token in crowd_vocab]
    mins.append(np.min(matches))  # smallest edit distance found for this out-of-vocabulary word
    if np.min(matches) == 1:
        print(word, crowd_vocab[np.argmin(matches)])
In [ ]:
mins
In [ ]:
import numpy as np
np.mean(lengths)
Out[ ]:
In [ ]:
english_words=[]
lengths=[]
for s in train_df.english.values:
    words = s.split(" ")
    lengths.append(len(words))
    for w in words:
        english_words.append(w.lower())
In [ ]:
len(english_words),len(crowd_words),len(set(english_words))
Out[ ]:
In [ ]:
english = train_df.english.values
crowdtalk = train_df.crowdtalk.values
In [ ]:
english
Out[ ]:
In [ ]:
processedLines = [gensim.utils.simple_preprocess(sentence) for sentence in english]
#eng_word_list = [word for words in processedLines for word in words]
eng_word_list = [word[0] for word in processedLines]  # only the first word of each sentence (BLEU = 0.080)
In [ ]:
processedLines = [gensim.utils.simple_preprocess(sentence) for sentence in crowdtalk]
#crowdtalk_word_list = [word for words in processedLines for word in words]
crowdtalk_word_list = [word[0] for word in processedLines]  # only the first word of each sentence (BLEU = 0.080)
In [ ]:
dict1 = dict(zip(crowdtalk_word_list, eng_word_list))
Prediction Phase ✈
In [37]:
crowdtalk = test_df.crowdtalk.values
In [38]:
processedLines = [gensim.utils.simple_preprocess(sentence) for sentence in crowdtalk]
In [ ]:
!pip install jellyfish
In [35]:
!pip install gingerit
In [36]:
from gingerit.gingerit import GingerIt
text = 'according the to license he had me'
parser = GingerIt()
parser.parse(text)
Out[36]:
In [ ]:
!pip install -U git+https://github.com/PrithivirajDamodaran/Gramformer.git
In [50]:
!pip install spacy
In [ ]:
!python -m spacy download en_core_web_lg # Download the English model, which contains many pretrained preprocessing pipelines
In [55]:
import spacy
import en_core_web_lg
nlp = en_core_web_lg.load()
In [ ]:
In [101]:
from tqdm.notebook import tqdm
sentences2 = []  # translated test sentences
bi_words_list = list(bi_words)
followups = []
for i in tqdm(range(len(processedLines))):
    sentence = processedLines[i]
    translation_tokens = []
    bi_sent = bigram_transformer[sentence]
    for token in bi_sent:
        if token in translation_dict:
            translation_tokens.append(translation_dict[token])
        # elif token[:-1] in translation_dict and (token[-1]=='s' or token[-1]=='z'):
        #     print("actually here")
        #     translation_tokens.append(translation_dict[token[:-1]]+'s')
        # elif token+'s' in translation_dict:
        #     print("wow also here")
        #     translation_tokens.append(translation_dict[token+'s'][:-1])
    # Drop consecutive duplicate tokens.
    sent_modified = []
    sent_modified.append(translation_tokens[0])
    for j in range(1, len(translation_tokens)):
        if not translation_tokens[j] == translation_tokens[j - 1]:
            sent_modified.append(translation_tokens[j])
    final_sent = ' '.join(sent_modified)
    # Move "the" after a following particle, adposition, conjunction, or "that".
    sent_modified = []
    doc = nlp(final_sent)
    continue_flag = False
    for k, t in enumerate(doc):
        if continue_flag:
            continue_flag = False
            continue
        # 'PART', 'ADP', 'CCONJ'
        if t.text == 'the' and k < len(doc) - 1 and (doc[k + 1].pos_ in ['PART', 'ADP', 'CCONJ'] or doc[k + 1].text == 'that'):
            sent_modified.append(doc[k + 1].text)
            sent_modified.append(t.text)
            continue_flag = True
        else:
            sent_modified.append(t.text)
    # else:
    #     partial_idx = np.argmin([jellyfish.levenshtein_distance(token, b_token) for b_token in bi_words_list])
    #     closest_word = bi_words_list[partial_idx]
    #     translation_tokens.append(translation_dict[closest_word])
    sentences2.append(parser.parse(' '.join(sent_modified))['result'].replace(" ", ' '))
In [107]:
len(sentences2[:test_df.shape[0]])
Out[107]:
In [106]:
test_df.shape[0]
Out[106]:
In [62]:
from collections import Counter
Counter(followups)
Out[62]:
In [ ]:
import jellyfish
Out[ ]:
Creating sentences by matching each new-language word in the sentence to its corresponding English word, using the dictionary mapping created above.
In [ ]:
sentences3=[]
for sent in sentences2:
    sentence_split = sent.split()
    sent_modified = []
    sent_modified.append(sentence_split[0])
    for i in range(1, len(sentence_split)):
        if not sentence_split[i] == sentence_split[i - 1]:
            sent_modified.append(sentence_split[i])
        else:
            print("here")
    sentences3.append(" ".join(sent_modified))
In [ ]:
sentence = []
for i in processedLines:
    sentence_part = []
    word = ''
    for k, j in enumerate(i):
        if j in dict1:
            word = ''.join(dict1[j])
        else:
            word = ''.join(' ')
        sentence_part.append(word)
    temp = ' '.join(sentence_part)
    sentence.append(temp)
In [108]:
test_df['prediction'] = sentences2[test_df.shape[0]:]
In [ ]:
from gingerit.gingerit import GingerIt
parser = GingerIt()
res=parser.parse('and of strange things my of beauty')['result'].replace(" ",' ')
In [76]:
reverse_trans_dict={v:k for k,v in translation_dict.items()}
In [ ]:
reverse_trans_dict
In [ ]:
for word in english_words:
    if any([True for w in list(reverse_trans_dict.keys()) if word == w + 's']):
        print(word, reverse_trans_dict[word], reverse_trans_dict[word[:-1]])
In [109]:
test_df.prediction
Out[109]:
In [ ]:
for s in sentences2:
    if 'I and ' in s:
        print(True)
In [92]:
test_df.to_csv('./translated.csv')
In [111]:
test_df
Out[111]:
Saving the predictions in the assets directory under the name submission.csv.
In [112]:
!rm -rf assets
!mkdir assets
test_df.to_csv(os.path.join("assets", "submission.csv"), index=False)
Submitting our Predictions
Note: Please save the notebook before submitting it (Ctrl + S).
In [ ]:
%aicrowd notebook submit -c lingua-franca-translation -a assets --no-verify