explainbility_benchmark / evaluator.py
Zekun Wu
update
ea070cc
raw
history blame
6.65 kB
import json
from assistants import GPTAgent
import json_repair
class evaluator:
    """Score an AI-generated explanation against five explainability principles.

    The judging is delegated to a ``GPTAgent``: the model receives a prompt
    containing the question and the explanation and is asked to answer with a
    JSON dictionary holding one score between 0 and 1 per principle.
    """

    def __init__(self, model_name='GPT4-turbo'):
        """Create the evaluator.

        Args:
            model_name: Name handed to ``GPTAgent`` to select the judging model.
        """
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Check that ``scores`` holds a valid score for every principle.

        Args:
            scores: Decoded model answer; expected to be a dict mapping each
                principle name to a number in the closed range [0, 1].

        Returns:
            The same ``scores`` object, unchanged, when it is valid.

        Raises:
            ValueError: If ``scores`` is not a dict, or if any required
                principle is missing, non-numeric, or outside [0, 1].
        """
        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
        # Decoded JSON can just as well be a list, string, number or null.
        # Reject those up front so callers only ever have to handle ValueError.
        if not isinstance(scores, dict):
            raise ValueError(f"Scores must be a JSON object keyed by principle. Received: {scores!r}")
        for key in required_keys:
            value = scores.get(key)
            # bool is a subclass of int, so JSON true/false would otherwise be
            # accepted as the scores 1 and 0.
            if (
                isinstance(value, bool)
                or not isinstance(value, (int, float))
                or not (0 <= value <= 1)
            ):
                raise ValueError(f"Score for '{key}' is missing or out of range. Received: {scores.get(key)}")
        return scores

    def __call__(self, question,explanation):
        """Ask the model to score ``explanation`` as an answer to ``question``.

        Args:
            question: The user's question, inserted verbatim into the prompt.
            explanation: The explanation to evaluate, inserted verbatim into
                the prompt.

        Returns:
            The validated score dict, or ``None`` when the model's answer
            cannot be decoded as JSON even after a repair attempt.

        Raises:
            ValueError: If the answer decodes but fails ``validate_scores``.
        """
        # The prompt body is kept flush left on purpose: indenting it would
        # add leading whitespace to every line of the text sent to the model.
        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
and 1 indicates that the principle is fully satisfied.
Question:
{question}
Provided Explanation:
{explanation}
Evaluation Criteria:
Factually Correct:
Definition: The explanation must be accurate and relevant to the question and the subject matter.
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
Useful:
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
Context Specific:
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
Score: (0-1) How well does the explanation address the specific context or scenario of the question?
User Specific:
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
Provides Pluralism:
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
Score: (0-1) How well does the explanation provide or support multiple perspectives?
After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary.
Example JSON format:
{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
Answer:
"""
        # NOTE(review): assumes GPTAgent.invoke returns a str (it is stripped
        # and parsed as JSON below) - confirm against the assistants module.
        response = self.model.invoke(evaluation_prompt,temperature=0.8, max_tokens=60).strip()
        try:
            scores = json.loads(response)
            print(scores)
        except json.JSONDecodeError:
            # Attempt to repair the JSON if decoding fails
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return None
        return self.validate_scores(scores)
def write_evaluation_commentary(scores):
    """Turn per-principle scores into human-readable commentary.

    Each known principle gets one of three comments depending on its score:
    0.8 or above, 0.5 or above, or anything lower.

    Args:
        scores: Mapping of principle name to numeric score.

    Returns:
        A list with one dict per entry of ``scores``, in iteration order,
        each holding the keys ``'Principle'``, ``'Score'`` and
        ``'Commentary'``. A principle that is not one of the five known names
        receives a generic placeholder commentary.
    """
    # Comments per principle, ordered (score >= 0.8, score >= 0.5, otherwise).
    commentary_by_principle = {
        "Factually Correct": (
            "Excellent accuracy! The information is precise and directly relevant to the question.",
            "Moderately accurate, but some details may not be completely correct or are somewhat irrelevant.",
            "The explanation contains significant inaccuracies or irrelevant information.",
        ),
        "Useful": (
            "Highly useful! The explanation clearly enhances understanding and aids in further reasoning or decision-making.",
            "Somewhat useful, though it could be more insightful or practical in aiding understanding.",
            "The explanation does little to help understand or apply the information provided.",
        ),
        "Context Specific": (
            "Perfectly tailored to the context of the question, addressing the specific scenario effectively.",
            "Generally addresses the context, but may miss specific details or nuances relevant to the question.",
            "Fails to address the context of the question, lacking relevance or specificity.",
        ),
        "User Specific": (
            "The explanation is well-adapted to the user's knowledge level and interests, demonstrating thoughtfulness.",
            "Moderately considerate of the user's knowledge level, but could be more tailored.",
            "Does not consider the user's background or interests, potentially leading to confusion or disinterest.",
        ),
        "Provides Pluralism": (
            "Provides an excellent range of perspectives or interpretations, fostering a comprehensive understanding.",
            "Offers some alternative perspectives, but more could be provided to enrich understanding.",
            "Lacks diversity in viewpoints, limiting the depth of exploration into the topic.",
        ),
    }
    # Used for principles outside the table, so an unexpected key neither
    # crashes nor inherits the commentary of the previously processed one.
    unknown_principle_comment = "No commentary is available for this principle."

    evaluation_details = []
    for principle, score in scores.items():
        tiers = commentary_by_principle.get(principle)
        if tiers is None:
            comment = unknown_principle_comment
        elif score >= 0.8:
            comment = tiers[0]
        elif score >= 0.5:
            comment = tiers[1]
        else:
            comment = tiers[2]
        evaluation_details.append({'Principle': principle, 'Score': score, 'Commentary': comment})
    return evaluation_details
if __name__ == '__main__':
    # Manual smoke test: score one trivial question/explanation pair and print
    # the result. The instance is named `scorer` rather than `eval` so the
    # builtin eval() is not shadowed at module level.
    scorer = evaluator()
    question = "What is the capital of France?"
    explanation = "The capital of France is Paris."
    print(scorer(question, explanation))