File size: 3,985 Bytes
0a026c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import json

from assistants import GPTAgent
import json_repair

class evaluator:
    """LLM-as-judge that scores an AI-generated explanation on five quality
    principles (each in [0, 1]) and returns a validated score dict.

    NOTE: the lowercase class name is kept for backward compatibility with
    existing callers, despite PEP 8 preferring PascalCase.
    """

    def __init__(self, model_name='GPT4-turbo'):
        # GPTAgent is a project-local wrapper around the chat model;
        # presumably it exposes .invoke(prompt, ...) -> str. TODO confirm.
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Ensure every required principle is present with a numeric value in
        [0, 1]; return `scores` unchanged, or raise ValueError on the first
        missing/out-of-range entry."""
        required_keys = ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]
        for key in required_keys:
            # Single lookup via .get(); a missing key yields None, which fails
            # the isinstance check just like the original two-step test did.
            value = scores.get(key)
            if not isinstance(value, (int, float)) or not (0 <= value <= 1):
                raise ValueError(f"Score for '{key}' is missing or out of range. Received: {scores.get(key)}")
        return scores

    def __call__(self, question, explanation):
        """Score `explanation` for `question` with the judge model.

        Returns:
            dict: validated scores for the five principles, or
            None: when the model reply is not parseable JSON even after repair.

        Raises:
            ValueError: if the parsed JSON is missing keys or has values
                outside [0, 1] (propagated from validate_scores).
        """

        evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by 
        an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle 
        should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, 
        and 1 indicates that the principle is fully satisfied.
        
        Question: 
        {question}
        
        Provided Explanation: 
        {explanation}
        
        Evaluation Criteria:

        Factually Correct:
        Definition: The explanation must be accurate and relevant to the question and the subject matter.
        Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
        
        Useful:
        Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
        Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
        
        Context Specific:
        Definition: The explanation should be relevant to the specific context or scenario implied by the question.
        Score: (0-1) How well does the explanation address the specific context or scenario of the question?
        
        User Specific:
        Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
        Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
        
        Provides Pluralism:
        Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
        Score: (0-1) How well does the explanation provide or support multiple perspectives?
    
        After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary. 
        
        Example JSON format:
        
        {{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}
        
        Answer:
        """

        # FIX: the model call had been commented out and replaced by a
        # hardcoded debug stub (whose doubled braces were not even valid
        # JSON), so every evaluation returned the same constant scores.
        # The stub and the stray debug print(scores) are removed.
        # max_tokens=60 is enough for the single JSON object requested.
        response = self.model.invoke(evaluation_prompt, temperature=0.8, max_tokens=60).strip()

        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # LLMs frequently emit slightly malformed JSON; attempt a repair
            # pass before giving up on this item.
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return None

        return self.validate_scores(scores)

if __name__ == '__main__':
    # Smoke test: score one trivial question/explanation pair and print the result.
    # Renamed from `eval`, which shadowed the Python builtin of the same name.
    judge = evaluator()
    question = "What is the capital of France?"
    explanation = "The capital of France is Paris."
    print(judge(question, explanation))