# CelebChat / run_eval.py
# lhzstar
# new commits
# 2a846a9
# raw
# history blame
# 4.8 kB
import itertools
import re
import spacy
import json
import evaluate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModel
import torch
from utils import *
from celebbot import CelebBot
# Hugging Face model ids: the seq2seq model that generates answers, and the
# sentence-transformer model handed to CelebBot.
QA_MODEL_ID = "google/flan-t5-xl"
SENTTR_MODEL_ID = "sentence-transformers/all-mpnet-base-v2"

# Celebrities to evaluate; each name is used as a key into data.json.
celeb_names = [
    "Cate Blanchett",
    "David Beckham",
    "Emma Watson",
    "Lady Gaga",
    "Madonna",
    "Mark Zuckerberg",
]
def evaluate_system():
    """Generate an answer for every celebrity question and score the results.

    Reads ``data.json``, rewrites each celebrity's knowledge text into the
    first person, prompts the QA model once per question, writes the generated
    answers to ``predictions.txt`` (one per line) and prints BLEU, METEOR,
    ROUGE-L and mean BERTScore F1 against the reference answers.

    Raises:
        KeyError: if a name in ``celeb_names`` (or one of the expected fields
            ``gender``/``knowledge``/``questions``/``answers``) is missing
            from ``data.json``.
    """
    device = 'cpu'
    with open("data.json", encoding='utf-8') as json_file:
        celeb_data = json.load(json_file)

    # References must follow celeb_names order, because predictions are
    # generated in that order below; iterating the JSON dict could misalign
    # the two lists whenever the file's key order differs.
    references = list(itertools.chain.from_iterable(
        celeb_data[name]['answers'] for name in celeb_names
    ))
    predictions = []

    QA_tokenizer = AutoTokenizer.from_pretrained(QA_MODEL_ID)
    QA_model = AutoModelForSeq2SeqLM.from_pretrained(QA_MODEL_ID).to(device)
    sentTr_tokenizer = AutoTokenizer.from_pretrained(SENTTR_MODEL_ID)
    sentTr_model = AutoModel.from_pretrained(SENTTR_MODEL_ID).to(device)
    # The spaCy pipeline does not depend on the celebrity, so load it once.
    spacy_model = spacy.load("en_core_web_sm")

    # Shared by both prompt variants; the celebrity variant only adds a prefix.
    base_instruction = "Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."

    for name in celeb_names:
        gender = celeb_data[name]["gender"]
        knowledge = celeb_data[name]["knowledge"]

        # Patterns for the full name, the last name and their possessive forms.
        lname = name.split(" ")[-1]
        lname_regex = re.compile(rf'\b({lname})\b')
        name_regex = re.compile(rf'\b({name})\b')
        lnames = lname+"’s" if not lname.endswith("s") else lname+"’"
        lnames_regex = re.compile(rf'\b({lnames})\b')
        names = name+"’s" if not name.endswith("s") else name+"’"
        names_regex = re.compile(rf'\b({names})\b')

        # Rewrite third-person text into the first person. The pronoun
        # patterns (he_regex, his_regex, she_regex, her_regex) come from utils.
        if gender == "M":
            knowledge = re.sub(he_regex, "I", knowledge)
            knowledge = re.sub(his_regex, "my", knowledge)
        elif gender == "F":
            knowledge = re.sub(she_regex, "I", knowledge)
            knowledge = re.sub(her_regex, "my", knowledge)
        # Possessives are replaced before the bare names so that "Name’s"
        # becomes "my" rather than "I’s".
        knowledge = re.sub(names_regex, "my", knowledge)
        knowledge = re.sub(lnames_regex, "my", knowledge)
        knowledge = re.sub(name_regex, "I", knowledge)
        knowledge = re.sub(lname_regex, "I", knowledge)

        knowledge_sents = [sent.text.strip() for sent in spacy_model(knowledge).sents]
        ai = CelebBot(name, QA_tokenizer, QA_model, sentTr_tokenizer, sentTr_model, spacy_model, knowledge_sents)

        # NOTE(review): ai.text is never assigned in this function, so this
        # check sees whatever CelebBot's constructor stored there — confirm
        # that is the intended input for choosing the prompt variant.
        addressed_to_celeb = re.search(
            re.compile(rf'\b(you|your|{ai.name})\b', flags=re.IGNORECASE), ai.text
        ) is not None
        if addressed_to_celeb:
            instruction1 = f"You are a celebrity named {ai.name}. {base_instruction}"
            knowledge = ai.retrieve_knowledge_assertions()
        else:
            instruction1 = base_instruction

        # Generate one answer per question. Tokenizing the whole list as a
        # single f-string would feed the model the list's repr and yield only
        # one prediction per celebrity, which cannot match the references.
        for question in celeb_data[name]["questions"]:
            query = f"Context: {instruction1} {knowledge}\n\nQuestion: {question}\n\nAnswer:"
            input_ids = ai.QA_tokenizer(query, return_tensors="pt").input_ids.to(device)
            outputs = ai.QA_model.generate(input_ids, max_length=1024)
            predictions.append(ai.QA_tokenizer.decode(outputs[0], skip_special_tokens=True))

    # Explicit UTF-8 so non-ASCII answers are written the same on every
    # platform; the context manager closes the file even if a write fails.
    with open('predictions.txt', 'w', encoding='utf-8') as pred_file:
        for prediction in predictions:
            pred_file.write(prediction+"\n")

    bleu = evaluate.load("bleu")
    results = bleu.compute(predictions=predictions, references=references, max_order=4)
    print(f"BLEU: {round(results['bleu'], 2)}")

    meteor = evaluate.load("meteor")
    results = meteor.compute(predictions=predictions, references=references)
    print(f"METEOR: {round(results['meteor'], 2)}")

    rouge = evaluate.load("rouge")
    results = rouge.compute(predictions=predictions, references=references)
    print(f"ROUGE: {round(results['rougeL'], 2)}")

    bertscore = evaluate.load("bertscore")
    results = bertscore.compute(predictions=predictions, references=references, rescale_with_baseline=True, lang="en")
    # BERTScore returns one F1 per example; report the mean.
    print(f"F1: {round(sum(results['f1'])/len(results['f1']), 2)}")
# Run the full evaluation only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    evaluate_system()