from multiprocessing import Pool
from typing import List
import numpy as np
import torch
from pyscripts.utils.dialog_eval.vert import (
get_auto_bleu2_geometric,
get_self_bleu2_geometric,
run_f,
)
from scipy.stats import gmean
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer


def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
"""
Compute the perplexity of the given text using a specified model from the
`evaluate` library (default: GPT-2).
Args:
        LLM_Output (str):
The text (string) for which perplexity is to be computed.
model_id (str, optional):
The identifier of the model to use for computing
perplexity. Defaults to "gpt2".
Returns:
str:
            A formatted string showing the perplexity of the
            provided text, for example:
"Perplexity: 45.23\n"
Raises:
ImportError:
If the `evaluate` library is not installed or cannot be imported.
Example:
>>> text = "Hello world, this is a test."
>>> result = perplexity(text, model_id="gpt2")
>>> print(result)
"Perplexity: 27.34\n"
"""
try:
import evaluate
    except ImportError:
        print("Error: the `evaluate` library is not installed or cannot be imported.")
        raise
    perplexity_metric = evaluate.load("perplexity", module_type="metric")
    results = perplexity_metric.compute(model_id=model_id, predictions=[LLM_Output])
return f"Perplexity: {results['mean_perplexity']:.2f}\n"
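
# For reference, a minimal sketch of what the `evaluate` perplexity metric
# computes under the hood: exp of the mean token-level cross-entropy under a
# causal LM. Illustrative only (the text is assumed); not used by this module.
#
#     tok = AutoTokenizer.from_pretrained("gpt2")
#     lm = AutoModelForCausalLM.from_pretrained("gpt2")
#     enc = tok("Hello world, this is a test.", return_tensors="pt")
#     with torch.no_grad():
#         loss = lm(**enc, labels=enc["input_ids"]).loss  # mean cross-entropy
#     print(f"Perplexity: {torch.exp(loss).item():.2f}")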


def vert(LLM_response_arr: List[str]) -> str:
"""
    Calculate and return the Self-BLEU2-geometric, Auto-BLEU2-geometric,
    and VERT metrics for a list of LLM responses.
Args:
LLM_response_arr (List[str]):
            A list of responses (strings) generated by the language
            model acting as a text dialog response generator.
Returns:
str:
A formatted string that includes each computed metric and the final
VERT value, for example:
"Self-BLEU2-geometric: 42.13
Auto-BLEU2-geometric: 38.94
VERT: 40.5
"
Example:
>>> # Suppose we have the following LLM responses:
>>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
>>> result = vert(responses)
>>> print(result)
"Self-BLEU2-geometric: 42.13
Auto-BLEU2-geometric: 38.94
VERT: 40.5
"
"""
terms = [x.strip().split() for x in LLM_response_arr]
tasks = [
("Self-BLEU2-geometric", get_self_bleu2_geometric),
("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
]
n_processes = min(16, len(tasks))
with Pool(n_processes) as pool:
metrics = pool.map(run_f, [(t[1], terms) for t in tasks])
metric_arr = []
str1 = ""
    for (metric_name, _), metric in zip(tasks, metrics):
        value = round(100 * float(np.mean(metric)), 2)
        # Standard error of the mean is computed but not currently reported
        sem = round(100 * float(np.std(metric) / np.sqrt(len(metric))), 2)
        metric_arr.append(value)
        str1 += f"{metric_name}: {value}\n"
str1 += f"VERT: {round(gmean(metric_arr), 2)}\n"
return str1
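
# VERT is the geometric mean of the two diversity metrics above. With the
# illustrative numbers from the docstring:
#
#     from scipy.stats import gmean
#     gmean([42.13, 38.94])  # ≈ 40.50, reported as "VERT: 40.5"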


def bert_score(
total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
) -> str:
"""
    Compute a cosine similarity score between the concatenated context
    (all but the last element) and the final response (last element)
    using a BERT-based model. This serves as a simplified measure of how
    closely the response aligns semantically with the preceding context.
Args:
        total_response_arr (List[str]):
            A list of strings. The last element is treated as the
            response, while all preceding elements are treated as the
            context.
bert_model_name (str, optional):
The name or path of the BERT model to use (from the Hugging Face Model Hub).
Defaults to "bert-base-uncased".
Returns:
str:
A string containing the cosine similarity
(as a percentage) followed by a newline.
For example:
"Cosine Similarity: 85.67\n"
Example:
>>> total_responses = [
... "User: Hi, how are you?",
... "Assistant: I'm good! How can I help you today?",
... "User: Can you tell me a joke?",
... "Assistant: Sure! Here's one: Why did the chicken join a band?"
... ]
>>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
>>> print(result)
"Cosine Similarity: 75.89\n"
"""
def cosine_similarity_context_response(context, response, model, tokenizer):
# Tokenize and encode both context and response
context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
        # Move all encoded tensors to the GPU (this module assumes CUDA is available)
        for k in context_inputs:
            context_inputs[k] = context_inputs[k].cuda()
        for k in response_inputs:
            response_inputs[k] = response_inputs[k].cuda()
# Get embeddings from the model
with torch.no_grad():
context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)
# Compute cosine similarity
similarity = cosine_similarity(
context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
)
return similarity[0][0]
bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
similarity = cosine_similarity_context_response(
" ".join(total_response_arr[:-1]),
total_response_arr[-1],
bert_model,
bert_tokenizer,
)
    return f"Cosine Similarity: {similarity * 100:.2f}\n"
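
# A minimal CPU-only sketch of the same idea (mean-pooled BERT embeddings
# compared via cosine similarity). Names and inputs here are illustrative;
# note that the function above assumes a GPU via the hardcoded `.cuda()` calls.
#
#     model = AutoModel.from_pretrained("bert-base-uncased")
#     tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#     enc = tokenizer(["the context", "the response"], return_tensors="pt",
#                     padding=True, truncation=True)
#     with torch.no_grad():
#         emb = model(**enc).last_hidden_state.mean(dim=1)  # (2, hidden_size);
#         # note: this mean also averages over padding positions
#     sim = cosine_similarity(emb[:1].numpy(), emb[1:].numpy())[0][0]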


def DialoGPT_perplexity(
user_utterance: str,
response: str,
dialog_model_name: str = "microsoft/DialoGPT-medium",
) -> str:
"""
    Compute the perplexity of a response given a user utterance using a
    pre-trained DialoGPT model. The function loads DialoGPT (medium by
    default) from the Hugging Face Model Hub, then calculates the
    perplexity of the (context + response) sequence.
    Args:
        user_utterance (str):
            The user utterance preceding the model's response.
        response (str):
            The generated response whose perplexity is to be evaluated.
        dialog_model_name (str, optional):
            The identifier of the dialog model to use.
            Defaults to "microsoft/DialoGPT-medium".
Returns:
str:
A formatted string containing the DialoGPT perplexity score. For example:
"DialoGPT Perplexity: 25.67\n"
Example:
>>> user_text = "Hi, how are you today?"
>>> system_response = "I'm good, thank you! How can I help you?"
>>> result = DialoGPT_perplexity(user_text, system_response)
>>> print(result)
"DialoGPT Perplexity: 31.45\n"
"""
def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
"""
Evaluate the appropriateness of a response based on the
given context using DialoGPT.
Args:
context (str): The dialogue context (previous conversation).
response (str): The generated response to evaluate.
model: Pre-trained DialoGPT model.
tokenizer: Corresponding tokenizer for the DialoGPT model.
Returns:
float: Perplexity score of the response given the context.
"""
model.eval()
# Combine context and response as input
input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
inputs["input_ids"] = inputs["input_ids"].cuda()
inputs["attention_mask"] = inputs["attention_mask"].cuda()
# Compute model outputs and loss
with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
# Calculate perplexity
perplexity = torch.exp(loss)
return perplexity.cpu().item()
    # Load the DialoGPT model and tokenizer (GPU assumed, as elsewhere in this module)
    model = AutoModelForCausalLM.from_pretrained(dialog_model_name).cuda()
    tokenizer = AutoTokenizer.from_pretrained(dialog_model_name)
perplexity = evaluate_response_with_dialoGPT(
user_utterance, response, model, tokenizer
)
    return f"DialoGPT Perplexity: {perplexity:.2f}\n"
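
# DialoGPT is trained on conversations with turns joined by the EOS token,
# which is why evaluate_response_with_dialoGPT builds its input as
# context + eos + response + eos. An illustrative call (inputs assumed):
#
#     print(DialoGPT_perplexity("Hi, how are you today?",
#                               "I'm good, thank you! How can I help you?"))
#     # -> "DialoGPT Perplexity: <score>\n"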