Zekun Wu committed on
Commit
0a026c0
·
1 Parent(s): e6cc5be
__pycache__/assistants.cpython-310.pyc ADDED
Binary file (1.31 kB). View file
 
app.py CHANGED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from evaluator import evaluator
3
+
4
# Streamlit page: collect a question and an explanation, then display the
# per-principle scores produced by the evaluator for the selected model.
st.title('Natural Language Explanation Demo')

model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])

question = st.text_input('Enter question:', '')
explanation = st.text_input('Enter explanation:', '')

if st.button('Evaluate Explanation'):
    # Echo the inputs back so the user can see exactly what is being scored.
    st.write('### Question')
    st.write(question)
    st.write('### Explanation')
    st.write(explanation)

    # Only evaluate when both fields are filled in.
    if question and explanation:
        # Named `scorer` rather than `eval` to avoid shadowing the builtin.
        scorer = evaluator(model_name)
        try:
            scores = scorer(question, explanation)
        except ValueError as err:
            # The evaluator raises ValueError when a score is missing or
            # outside the 0-1 range.
            st.error(f'The model returned invalid scores: {err}')
        else:
            if scores is None:
                # The evaluator returns None when the response could not be
                # decoded as JSON, even after a repair attempt.
                st.error('The model response could not be parsed. Please try again.')
            else:
                st.write('### Scores')
                for principle, score in scores.items():
                    st.write(f"{principle}: {score}")
    else:
        st.write('Please enter question and explanation to evaluate')
assistants.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import AzureOpenAI
2
+ import os
3
class GPTAgent:
    """Minimal chat client for a single Azure OpenAI deployment.

    The API key, API version and endpoint are read from the
    ``AZURE_OPENAI_KEY``, ``AZURE_OPENAI_VERSION`` and
    ``AZURE_OPENAI_ENDPOINT`` environment variables.
    """

    def __init__(self, model_name):
        """Create the client and remember which deployment to call.

        Args:
            model_name: Name of the deployment passed as ``model`` on every
                chat-completion request.
        """
        self.client = AzureOpenAI(
            api_key=os.getenv('AZURE_OPENAI_KEY'),
            api_version=os.getenv('AZURE_OPENAI_VERSION'),
            azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
        )
        self.deployment_name = model_name

    def invoke(self, text, **kwargs):
        """Send ``text`` as a single user message and return the reply text.

        Args:
            text: Content of the user message.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``chat.completions.create`` (e.g. ``temperature``,
                ``max_tokens``).

        Returns:
            The content of the first choice's message, or an empty string
            when that message carries no text content.
        """
        response = self.client.chat.completions.create(
            model=self.deployment_name,
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": text},
            ],
            **kwargs
        )
        content = response.choices[0].message.content
        # The message content can be None; normalise it to "" so callers can
        # always treat the result as a string (e.g. call .strip() on it).
        return content if content is not None else ""
evaluator.py CHANGED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from assistants import GPTAgent
4
+ import json_repair
5
+
6
class evaluator:
    """Score an explanation against five principles using a GPT deployment.

    Calling an instance with a question and an explanation sends an
    evaluation prompt to the model and returns a dict mapping each principle
    name to a score between 0 and 1.
    """

    # Principle names that must be present in the model's JSON answer.
    REQUIRED_KEYS = (
        "Factually Correct",
        "Useful",
        "Context Specific",
        "User Specific",
        "Provides Pluralism",
    )

    def __init__(self, model_name='GPT4-turbo'):
        """Create the underlying chat agent for ``model_name``."""
        self.model = GPTAgent(model_name)

    def validate_scores(self, scores):
        """Check that ``scores`` holds a 0-1 number for every principle.

        Args:
            scores: Decoded JSON answer from the model.

        Returns:
            ``scores`` unchanged when it is valid.

        Raises:
            ValueError: If ``scores`` is not a dict, or a required key is
                missing, non-numeric, or outside the range 0 to 1.
        """
        # Valid JSON can still decode to a list/str/number; reject it with the
        # documented ValueError instead of failing with AttributeError.
        if not isinstance(scores, dict):
            raise ValueError(f"Expected a JSON object of scores. Received: {scores!r}")
        for key in self.REQUIRED_KEYS:
            value = scores.get(key)
            # bool is a subclass of int, so JSON true/false must be excluded
            # explicitly or it would pass as a score of 1/0.
            is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
            if not is_number or not (0 <= value <= 1):
                raise ValueError(f"Score for '{key}' is missing or out of range. Received: {scores.get(key)}")
        return scores

    def _build_prompt(self, question, explanation):
        """Return the evaluation prompt for one question/explanation pair."""
        return f"""You are provided with a user's question and the corresponding explanation generated by
an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all,
and 1 indicates that the principle is fully satisfied.

Question:
{question}

Provided Explanation:
{explanation}

Evaluation Criteria:

Factually Correct:
Definition: The explanation must be accurate and relevant to the question and the subject matter.
Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.

Useful:
Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?

Context Specific:
Definition: The explanation should be relevant to the specific context or scenario implied by the question.
Score: (0-1) How well does the explanation address the specific context or scenario of the question?

User Specific:
Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?

Provides Pluralism:
Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
Score: (0-1) How well does the explanation provide or support multiple perspectives?

After evaluating the provided question and explanation based on the five principles, please format your scores in a JSON dictionary.

Example JSON format:

{{"Factually Correct": 0.9,"Useful": 0.85,"Context Specific": 0.8,"User Specific": 0.75,"Provides Pluralism": 0.7}}

Answer:
"""

    def __call__(self, question, explanation):
        """Ask the model to score ``explanation`` as an answer to ``question``.

        Args:
            question: The user's question.
            explanation: The explanation to be evaluated.

        Returns:
            A dict mapping each principle to a score between 0 and 1, or
            ``None`` when the model response is empty or cannot be decoded as
            JSON even after a repair attempt.

        Raises:
            ValueError: If the decoded scores fail ``validate_scores``.
        """
        evaluation_prompt = self._build_prompt(question, explanation)

        # Query the model; a hard-coded placeholder here would make the scores
        # independent of the question and explanation.
        raw = self.model.invoke(evaluation_prompt, temperature=0.8, max_tokens=60)
        # Guard against a missing reply before stripping whitespace.
        response = (raw or "").strip()
        if not response:
            print("Model returned an empty response. Skipping this batch.")
            return None

        try:
            scores = json.loads(response)
        except json.JSONDecodeError:
            # Attempt to repair the JSON if decoding fails
            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
            try:
                scores = json.loads(repaired_json)
            except json.JSONDecodeError:
                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
                return None

        return self.validate_scores(scores)
77
+
78
if __name__ == '__main__':
    # Manual smoke test: score one fixed question/explanation pair with the
    # default model and print the result.
    # Named `scorer` rather than `eval` to avoid shadowing the builtin.
    scorer = evaluator()
    question = "What is the capital of France?"
    explanation = "The capital of France is Paris."
    print(scorer(question, explanation))
requirements.txt CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ backoff
2
+ openai
3
+ json-repair