"""AI_t5_model2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1cLG3m6CnABOLIGgwQuZUJfRZjsMHk6y7
"""
# Install the libraries needed for fine-tuning and evaluation.
!pip install transformers[torch] accelerate

# Colab workaround: reinstall google-colab and pin requests to 2.31.0.
!pip uninstall -y requests google-colab
!pip install google-colab
!pip install requests==2.31.0

!pip install rouge_score
!pip install evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, \
    Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, get_scheduler
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
import warnings

warnings.simplefilter(action='ignore')
# Load the news summary dataset and drop rows with missing values.
data = pd.read_csv('news_summary.csv', encoding='cp437')
data = data.dropna()
data.info()
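# Optional sanity check (not part of the original notebook): peek at the two
# columns used below, the reference headlines and the full article text.
data[['headlines', 'ctext']].head(3)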
def length(text):
    """Return the number of whitespace-separated words in a text."""
    return len(text.split())

print('Mean headline length (words):', data['headlines'].apply(length).mean())
print('Mean text length (words):', data['ctext'].apply(length).mean())
# Split the data 80/10/10 into train, validation and test sets.
train_size = int(0.8 * len(data))
val_size = int(0.1 * len(data))
test_size = len(data) - train_size - val_size

train_data = data[:train_size]
val_data = data[train_size:train_size + val_size]
test_data = data[train_size + val_size:]

train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset,
})

dataset
model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
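# Quick sanity check (not in the original notebook): inspect how the mT5
# SentencePiece tokenizer splits a generic sample sentence into subword pieces.
sample = "The quick brown fox jumps over the lazy dog."
print(tokenizer.tokenize(sample))
print(tokenizer(sample).input_ids)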
# Truncation limits: article inputs up to 512 tokens, headline targets up to 30.
max_input_length = 512
max_target_length = 30
def preprocess_function(examples):
    # Tokenize the articles as model inputs and the headlines as target labels,
    # truncating both to the limits defined above.
    model_inputs = tokenizer(
        examples["ctext"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["headlines"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)
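# Quick check (not in the original notebook): confirm the first training example
# respects the 512-token input and 30-token label limits.
print(len(tokenized_datasets["train"][0]["input_ids"]),
      len(tokenized_datasets["train"][0]["labels"]))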
rouge_score = evaluate.load("rouge")

nltk.download('punkt')

def three_sentence_summary(text):
    # Lead-3 baseline: use the first three sentences of the article as the summary.
    return "\n".join(sent_tokenize(text)[:3])

print(three_sentence_summary(dataset["train"][1]["ctext"]))
def evaluate_baseline(dataset, metric):
    summaries = [three_sentence_summary(text) for text in dataset["ctext"]]
    return metric.compute(predictions=summaries, references=dataset["headlines"])

# ROUGE scores (in percent) for the lead-3 baseline on the validation set.
score = evaluate_baseline(dataset["validation"], rouge_score)
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_dict = {rn: round(score[rn] * 100, 2) for rn in rouge_names}
rouge_dict
# Log in to the Hugging Face Hub so the fine-tuned model can be pushed.
from huggingface_hub import notebook_login

notebook_login()
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The collator dynamically pads inputs and labels within each batch.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
batch_size = 8
num_train_epochs = 8

# Log once per epoch (the number of optimization steps in one epoch at this batch size).
logging_steps = len(tokenized_datasets["train"]) // batch_size
output_dir = "news-summary-t5-model-2"

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=True,
)
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 (positions ignored by the loss) with the pad token id so they can be decoded.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE-Lsum expects newline-separated sentences.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}
# Drop the raw text columns so only the tokenized features are passed to the trainer.
tokenized_datasets = tokenized_datasets.remove_columns(
    dataset["train"].column_names
)
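# Optional check (not in the original notebook): after removing the raw text
# columns, only the tokenized features should remain.
print(tokenized_datasets["train"].column_names)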
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

trainer.evaluate()

trainer.push_to_hub(commit_message="Training complete", tags="summarization")
# Load the fine-tuned model back from the Hub and wrap it in a summarization pipeline.
from transformers import pipeline

hub_model_id = "shivraj221/news-summary-t5-model-2"
summarizer = pipeline("summarization", model=hub_model_id)
def print_summary(idx):
    # Compare the generated summary against the reference headline for one test article.
    review = dataset["test"][idx]["ctext"]
    title = dataset["test"][idx]["headlines"]
    summary = summarizer(review)[0]["summary_text"]
    print(f"'>>> Article: {review}'")
    print(f"\n'>>> Headline: {title}'")
    print(f"\n'>>> Summary: {summary}'")

print_summary(20)
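# A minimal sketch (not in the original notebook) of ad-hoc inference on free-form
# text: the summarization pipeline accepts generation arguments such as max_length,
# kept here close to the 30-token target length used during training.
sample_article = "Replace this placeholder with any news article text."
print(summarizer(sample_article, max_length=30)[0]["summary_text"])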