from multiprocessing import Pool
from typing import List

import numpy as np
import torch
from pyscripts.utils.dialog_eval.vert import (
    get_auto_bleu2_geometric,
    get_self_bleu2_geometric,
    run_f,
)
from scipy.stats import gmean
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer


def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
    """
    Compute the perplexity of the given text using a specified model from the
    `evaluate` library (default: GPT-2).

    Args:
        LLM_Output (str):
            The text for which perplexity is to be computed.
        model_id (str, optional):
            The identifier of the model to use for computing
            perplexity. Defaults to "gpt2".

    Returns:
        str:
            A formatted string showing the perplexity of the
            provided text, for example:
            "Perplexity: 45.23\n"

    Raises:
        ImportError:
            If the `evaluate` library is not installed or cannot be imported.

    Example:
        >>> text = "Hello world, this is a test."
        >>> result = perplexity(text, model_id="gpt2")
        >>> print(result)
        "Perplexity: 27.34\n"
    """
    try:
        import evaluate
    except Exception as e:
        print("Error: evaluate is not properly installed.")
        raise e

    perplexity = evaluate.load("perplexity", module_type="metric")
    results = perplexity.compute(model_id=model_id, predictions=[LLM_Output])
    return f"Perplexity: {results['mean_perplexity']:.2f}\n"


def vert(LLM_response_arr: List[str]) -> str:
    """
    Calculate and return Self BLEU-2, Auto BLEU-2 and VERT-2
    metrics for a list of LLM responses.

    Args:
        LLM_response_arr (List[str]):
            A list of responses (strings) generated by the language
            model acting as a text dialog response generator.

    Returns:
        str:
            A formatted string that includes each computed metric and the final
            VERT value, for example:
            "Self-BLEU2-geometric: 42.13
            Auto-BLEU2-geometric: 38.94
            VERT: 40.5
            "

    Example:
        >>> # Suppose we have the following LLM responses:
        >>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
        >>> result = vert(responses)
        >>> print(result)
        "Self-BLEU2-geometric: 42.13
        Auto-BLEU2-geometric: 38.94
        VERT: 40.5
        "
    """
    terms = [x.strip().split() for x in LLM_response_arr]
    tasks = [
        ("Self-BLEU2-geometric", get_self_bleu2_geometric),
        ("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
    ]
    n_processes = min(16, len(tasks))
    with Pool(n_processes) as pool:
        metrics = pool.map(run_f, [(t[1], terms) for t in tasks])

    metric_arr = []
    str1 = ""
    for (metric_name, _), metric in zip(tasks, metrics):
        metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))
        metric, sem = [round(100 * x, 2) for x in [metric, sem]]
        metric_arr.append(metric)
        str1 += f"{metric_name}: {metric}\n"
    str1 += f"VERT: {round(gmean(metric_arr), 2)}\n"
    return str1


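# Worked example of the VERT aggregation above (illustrative numbers, not real output):
# if Self-BLEU2-geometric = 42.13 and Auto-BLEU2-geometric = 38.94, then
# VERT = gmean([42.13, 38.94]) = sqrt(42.13 * 38.94) ≈ 40.5, i.e. the geometric
# mean of the two diversity metrics.

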
def bert_score(
    total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
) -> str:
    """
    Compute a cosine similarity score between the concatenated context
    (all but the last element) and the final response (last element)
    using a BERT-based model. This serves as a simplified measure of how
    closely the response aligns semantically with the preceding context.

    Args:
        total_response_arr (List[str]):
            A list of strings. The last element represents the response,
            while all other elements are treated as the context.
        bert_model_name (str, optional):
            The name or path of the BERT model to use (from the Hugging Face Model Hub).
            Defaults to "bert-base-uncased".

    Returns:
        str:
            A string containing the cosine similarity
            (as a percentage) followed by a newline.
            For example:
            "Cosine Similarity: 85.67\n"

    Example:
        >>> total_responses = [
        ...     "User: Hi, how are you?",
        ...     "Assistant: I'm good! How can I help you today?",
        ...     "User: Can you tell me a joke?",
        ...     "Assistant: Sure! Here's one: Why did the chicken join a band?"
        ... ]
        >>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
        >>> print(result)
        "Cosine Similarity: 75.89\n"
    """

    def cosine_similarity_context_response(context, response, model, tokenizer):
        # Tokenize and encode both context and response
        context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
        response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
        # Move the encoded tensors to the GPU (a CUDA device is required here)
        for k in context_inputs:
            context_inputs[k] = context_inputs[k].cuda()
        for k in response_inputs:
            response_inputs[k] = response_inputs[k].cuda()

        # Get mean-pooled embeddings from the model
        with torch.no_grad():
            context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
            response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)

        # Compute cosine similarity between the two pooled embeddings
        similarity = cosine_similarity(
            context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
        )
        return similarity[0][0]

    bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
    bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
    similarity = cosine_similarity_context_response(
        " ".join(total_response_arr[:-1]),
        total_response_arr[-1],
        bert_model,
        bert_tokenizer,
    )
    return f"Cosine Similarity: {similarity * 100:.2f}\n"


def DialoGPT_perplexity(
    user_utterance: str,
    response: str,
    dialog_model_name: str = "microsoft/DialoGPT-medium",
) -> str:
    """
    Compute the perplexity of a response given a user utterance using a pre-trained
    DialoGPT model. The function loads DialoGPT (medium by default) from the
    Hugging Face Model Hub, then calculates the perplexity for the
    (context + response) sequence.

    Args:
        user_utterance (str):
            The user utterance preceding the model's response.
        response (str):
            The generated response whose perplexity needs to be evaluated.
        dialog_model_name (str, optional):
            The name or path of the dialog model to use (from the Hugging Face
            Model Hub). Defaults to "microsoft/DialoGPT-medium".

    Returns:
        str:
            A formatted string containing the DialoGPT perplexity score. For example:
            "DialoGPT Perplexity: 25.67\n"

    Example:
        >>> user_text = "Hi, how are you today?"
        >>> system_response = "I'm good, thank you! How can I help you?"
        >>> result = DialoGPT_perplexity(user_text, system_response)
        >>> print(result)
        "DialoGPT Perplexity: 31.45\n"
    """

    def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
        """
        Evaluate the appropriateness of a response based on the
        given context using DialoGPT.

        Args:
            context (str): The dialogue context (previous conversation).
            response (str): The generated response to evaluate.
            model: Pre-trained DialoGPT model.
            tokenizer: Corresponding tokenizer for the DialoGPT model.

        Returns:
            float: Perplexity score of the response given the context.
        """
        model.eval()

        # Combine context and response as a single EOS-delimited input
        input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
        inputs["input_ids"] = inputs["input_ids"].cuda()
        inputs["attention_mask"] = inputs["attention_mask"].cuda()

        # Compute model outputs and loss; with labels == input_ids the loss is the
        # mean token-level cross-entropy of the sequence
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        # Perplexity is the exponential of the mean cross-entropy loss
        perplexity = torch.exp(loss)
        return perplexity.cpu().item()

    # Load DialoGPT model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(dialog_model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(dialog_model_name)

    perplexity = evaluate_response_with_dialoGPT(
        user_utterance, response, model, tokenizer
    )
    return f"DialoGPT Perplexity: {perplexity:.2f}\n"

