De-Shuffling Text
Solution for submission 148568
A detailed solution for submission 148568, submitted to the De-Shuffling Text challenge.
In [ ]:
!nvidia-smi
In [ ]:
!pip install aicrowd-cli
!rm -rf data
!mkdir data
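The AIcrowd CLI is installed above, but the actual download step is not shown in this submission. A minimal sketch of fetching the challenge files into data/ is given below; the de-shuffling-text challenge slug and the CLI flags are assumptions based on the usual AIcrowd workflow, so check aicrowd --help if they differ.
In [ ]:
# Assumed download step (not part of the original submission):
# log in with your AIcrowd API key, then pull the challenge dataset into data/.
# The challenge slug and flags below are assumptions -- adjust them to your challenge page.
!aicrowd login
!aicrowd dataset download --challenge de-shuffling-text -o data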
In [ ]:
!pip install datasets transformers
In [ ]:
import pandas as pd
import numpy as np
import os
import torch
import datasets
from datasets import load_dataset
from transformers import EncoderDecoderModel, EncoderDecoderConfig, BertTokenizerFast, Seq2SeqTrainingArguments, Seq2SeqTrainer, BertConfig
In [ ]:
train_dataset = pd.read_csv("data/train.csv")
validation_dataset = pd.read_csv("data/val.csv")
test_dataset = pd.read_csv("data/test.csv")
In [ ]:
dataset = load_dataset('csv', data_files={"train" : ["data/train.csv"],
"validation": ["data/val.csv"],
"test" : ["data/test.csv"]})
In [ ]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
In [ ]:
MAX_TEXT_LENGTH = 150
MAX_LABEL_LENGTH = 150
def preprocess_function(sample):

    # Getting the scrambled text and its target label
    text = sample["text"]
    label = sample["label"]

    # Tokenizing the text and label
    inputs = tokenizer(text, padding="max_length", truncation=True, max_length=MAX_TEXT_LENGTH)
    outputs = tokenizer(label, padding="max_length", truncation=True, max_length=MAX_LABEL_LENGTH)

    sample["input_ids"] = inputs.input_ids
    sample["attention_mask"] = inputs.attention_mask
    sample["decoder_input_ids"] = outputs.input_ids
    sample["decoder_attention_mask"] = outputs.attention_mask
    sample["labels"] = outputs.input_ids

    # The labels are used to calculate the loss during training. Because every sequence was padded to the same length,
    # we replace the padding token id ( 0 ) with -100 so that Hugging Face ignores these positions when computing the loss.
    # Why -100 specifically? It is the default ignore_index of PyTorch's CrossEntropyLoss, which Hugging Face uses under the hood.
    sample["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in sample["labels"]]

    return sample
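As a quick sanity check (not part of the original notebook), one scrambled training row can be tokenized and decoded back to confirm that truncation at MAX_TEXT_LENGTH is not cutting off real content; the text and label column names come from train.csv.
In [ ]:
# Optional sanity check (assumed step): encode one scrambled sentence and decode it back
sample_text = train_dataset["text"][0]
encoded = tokenizer(sample_text, padding="max_length", truncation=True, max_length=MAX_TEXT_LENGTH)
print(len(encoded["input_ids"]))                                   # should equal MAX_TEXT_LENGTH
print(tokenizer.decode(encoded["input_ids"], skip_special_tokens=True))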
In [ ]:
BATCH_SIZE = 16
tokenized_datasets = dataset.map(preprocess_function, batch_size=BATCH_SIZE, batched=True)
In [ ]:
tokenized_datasets.set_format(
type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
In [ ]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
In [ ]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.encoder.vocab_size
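The cell above wires the special tokens into the encoder-decoder config. A small optional addition (an assumption, not in the original submission) is to set generation defaults here as well, so that model.generate later produces full-length, beam-searched outputs instead of the library defaults.
In [ ]:
# Optional generation defaults (assumed, not in the original submission)
model.config.max_length = MAX_LABEL_LENGTH   # allow outputs as long as the targets
model.config.no_repeat_ngram_size = 3        # discourage repeated phrases
model.config.num_beams = 4                   # beam search instead of greedy decoding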
In [ ]:
N_EPOCHS = 10
args = Seq2SeqTrainingArguments(
"Scambled Text",
evaluation_strategy = "epoch",
per_device_train_batch_size=BATCH_SIZE,
per_device_eval_batch_size=BATCH_SIZE,
num_train_epochs=N_EPOCHS,
fp16=True,
save_strategy="epoch",
save_total_limit=5,
)
In [ ]:
trainer = Seq2SeqTrainer(
model=model,
args=args,
train_dataset=tokenized_datasets['train'],
eval_dataset=tokenized_datasets['validation'],
)
In [15]:
trainer.train()
Out[15]:
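Saving the fine-tuned weights right after training is a sensible extra step (not shown in the original submission), so the model can be reloaded later without re-training.
In [ ]:
# Optional: persist the fine-tuned model and tokenizer (assumed step; directory name is arbitrary)
trainer.save_model("deshuffler-bert2bert")
tokenizer.save_pretrained("deshuffler-bert2bert")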
In [16]:
def generate_predictions(batch):

    # Tokenizing the scrambled test text
    inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=MAX_TEXT_LENGTH, return_tensors="pt")

    # Sending the tensors to the GPU
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    # Generating the predicted token ids
    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # Converting the token ids back to sentences
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["predictions"] = output_str
    return batch
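Before mapping over the whole test split, a quick spot check on one validation row (an optional step, not in the original notebook) helps confirm the model actually de-shuffles the input rather than copying it.
In [ ]:
# Optional spot check on a single validation example (assumed step)
sample_batch = {"text": [validation_dataset["text"][0]]}
print("prediction:", generate_predictions(sample_batch)["predictions"][0])
print("reference: ", validation_dataset["label"][0])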
In [17]:
results = dataset['test'].map(generate_predictions, batched=True, batch_size=16)
In [18]:
test_dataset
Out[18]:
In [19]:
test_dataset['label'] = results['predictions']
test_dataset
Out[19]:
In [20]:
!mkdir assets
test_dataset.to_csv(os.path.join("assets", "submission.csv"), index=False)
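Finally, the CSV written to assets/ can be submitted with the AIcrowd CLI. The command below is only a sketch; the exact subcommand, flags, and the de-shuffling-text challenge slug are assumptions, so consult aicrowd submission --help before running it.
In [ ]:
# Assumed submission command (subcommand, flags, and challenge slug may differ)
!aicrowd submission create -c de-shuffling-text -f assets/submission.csv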