Spaces:
Sleeping
Sleeping
File size: 3,419 Bytes
a3e3740 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import torch
from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer
from bert_score import score
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import nltk
from nltk.util import ngrams
class RAGEvaluator:
    """Compute a bundle of automatic metrics for a generated response.

    The evaluator loads a GPT-2 language model (used for perplexity) and a
    Hugging Face pipeline (used for the hate-speech score) once, at
    construction time, and reuses them for every call.
    """

    def __init__(self):
        """Load the GPT-2 model/tokenizer and build the bias pipeline."""
        self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
        # NOTE(review): "zero-shot-classification" is normally backed by an
        # NLI model; this checkpoint looks like a plain hate-speech
        # classifier, so the scores it yields here may not be meaningful.
        # Confirm before relying on the "Racial Bias" number.
        self.bias_pipeline = pipeline(
            "zero-shot-classification",
            model="Hate-speech-CNERG/dehatebert-mono-english",
        )

    def load_gpt2_model(self):
        """Return the pretrained ``'gpt2'`` model and its tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        return model, tokenizer

    def evaluate_bleu_rouge(self, candidates, references):
        """Score candidates against references with BLEU and ROUGE-1.

        Args:
            candidates: List of generated strings.
            references: List of reference strings, aligned with
                ``candidates`` (one reference per candidate).

        Returns:
            A ``(bleu_score, rouge1)`` tuple: the corpus-level sacrebleu
            score and the mean ROUGE-1 F-measure over all pairs. Both are
            ``0.0`` when ``candidates`` is empty.
        """
        if not candidates:
            # Nothing to score; avoid dividing by zero below.
            return 0.0, 0.0

        # sacrebleu expects a list of reference streams, hence the wrapping.
        bleu_score = corpus_bleu(candidates, [references]).score

        scorer = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        )
        rouge_scores = [
            scorer.score(ref, cand) for ref, cand in zip(references, candidates)
        ]
        # The loop variable is deliberately not called ``score`` so it does
        # not shadow the bert_score function imported under that name.
        rouge1 = sum(
            pair_scores['rouge1'].fmeasure for pair_scores in rouge_scores
        ) / len(rouge_scores)
        return bleu_score, rouge1

    def evaluate_bert_score(self, candidates, references):
        """Compute mean BERTScore precision, recall and F1.

        Args:
            candidates: List of generated strings.
            references: List of reference strings aligned with ``candidates``.

        Returns:
            A ``(precision, recall, f1)`` tuple of Python floats, each the
            mean over all candidate/reference pairs.
        """
        P, R, F1 = score(
            candidates,
            references,
            lang="en",
            model_type='bert-base-multilingual-cased',
        )
        return P.mean().item(), R.mean().item(), F1.mean().item()

    def evaluate_perplexity(self, text):
        """Compute the GPT-2 perplexity of ``text`` with a sliding window.

        The text is scored in windows that each end ``stride`` tokens after
        the previous one and carry as much left context as the model's
        position limit allows. Only the new tokens of each window
        contribute to the loss.

        Args:
            text: The string to score.

        Returns:
            The perplexity as a float. ``nan`` is returned when the text
            tokenizes to fewer than two tokens, because no next-token
            prediction can be scored in that case.
        """
        encodings = self.gpt2_tokenizer(text, return_tensors='pt')
        all_ids = encodings.input_ids
        seq_len = all_ids.size(1)
        if seq_len < 2:
            # A causal LM needs at least one (context, target) pair.
            return float("nan")

        max_length = self.gpt2_model.config.n_positions
        stride = 512
        nll_terms = []
        scored_tokens = 0
        for i in range(0, seq_len, stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, seq_len)
            trg_len = end_loc - i
            input_ids = all_ids[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Mask the context-only prefix so it does not add to the loss.
            target_ids[:, :-trg_len] = -100

            # The model shifts labels by one position internally, so the
            # first token of a window is never a prediction target. Count
            # the targets that are really scored so the mean loss returned
            # by the model is weighted by the right number of tokens.
            window_tokens = int((target_ids[:, 1:] != -100).sum())

            with torch.no_grad():
                outputs = self.gpt2_model(input_ids, labels=target_ids)
            nll_terms.append(outputs[0] * window_tokens)
            scored_tokens += window_tokens

        ppl = torch.exp(torch.stack(nll_terms).sum() / scored_tokens)
        return ppl.item()

    def evaluate_diversity(self, texts):
        """Return the ratio of distinct bigrams to total tokens.

        Tokens are obtained by whitespace splitting. Bigrams are collected
        per text, so the last token of one text is never paired with the
        first token of the next one.

        Args:
            texts: Iterable of strings.

        Returns:
            ``len(unique bigrams) / len(all tokens)``, or ``0`` when there
            are no tokens at all.
        """
        token_count = 0
        unique_bigrams = set()
        for text in texts:
            tokens = text.split()
            token_count += len(tokens)
            unique_bigrams.update(zip(tokens, tokens[1:]))
        return len(unique_bigrams) / token_count if token_count else 0

    def evaluate_racial_bias(self, text):
        """Return the pipeline's score for the ``"hate speech"`` label.

        Args:
            text: The string to classify.

        Returns:
            The score the bias pipeline assigns to ``"hate speech"`` when
            choosing between ``"hate speech"`` and ``"not hate speech"``.
        """
        results = self.bias_pipeline(
            [text], candidate_labels=["hate speech", "not hate speech"]
        )
        first = results[0]
        # Labels come back ordered by score, so look the label up by name.
        bias_score = first['scores'][first['labels'].index('hate speech')]
        return bias_score

    def evaluate_all(self, response, reference):
        """Run every metric on a single response/reference pair.

        Args:
            response: The generated text.
            reference: The reference text to compare against.

        Returns:
            A dict with the keys ``"BLEU"``, ``"ROUGE-1"``, ``"BERT P"``,
            ``"BERT R"``, ``"BERT F1"``, ``"Perplexity"``, ``"Diversity"``
            and ``"Racial Bias"``.
        """
        candidates = [response]
        references = [reference]
        bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
        bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
        perplexity = self.evaluate_perplexity(response)
        diversity = self.evaluate_diversity(candidates)
        racial_bias = self.evaluate_racial_bias(response)
        return {
            "BLEU": bleu,
            "ROUGE-1": rouge1,
            "BERT P": bert_p,
            "BERT R": bert_r,
            "BERT F1": bert_f1,
            "Perplexity": perplexity,
            "Diversity": diversity,
            "Racial Bias": racial_bias,
        }
|