Zekun Wu committed
Commit · d3bca1f
Parent(s): 68b50ab
update

Files changed:
- pages/1_Single_Evaluation.py +1 -1
- pages/4_Batch_Evaluation.py +1 -1
- pages/5_Conversation_Evaluation.py +127 -0
- util/evaluator.py +75 -4
pages/1_Single_Evaluation.py
CHANGED
@@ -80,7 +80,7 @@ else:
     if st.button('Evaluate Explanation'):
         if question and explanation:
             eval = evaluator(model_name)
-            scores =
+            scores = evaluate_single(question, explanation)
             st.write('### Scores')
             details = write_evaluation_commentary(scores)
             df = pd.DataFrame(details)
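Note that evaluate_single is introduced in this commit as an instance method on the evaluator class (see util/evaluator.py below), so the call here presumably needs to go through the instance created on the preceding line, as pages/4_Batch_Evaluation.py does. A minimal sketch of that usage, assuming the constructor shown here and a model name from the app's selectbox:

# Sketch only: assumes the evaluator API introduced in this commit and a
# model name from the selectbox used elsewhere in the app ('gpt4-1106').
from util.evaluator import evaluator

eval_instance = evaluator('gpt4-1106')
scores = eval_instance.evaluate_single(
    'What causes rainbows to appear in the sky?',
    'Sunlight is refracted, dispersed, and reflected inside water droplets.',
)
print(scores)  # expected shape: {"Factually Correct": 0.9, "Useful": 0.85, ...}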
pages/4_Batch_Evaluation.py
CHANGED
@@ -32,7 +32,7 @@ def batch_evaluate(uploaded_file):
     for index, row in enumerate(df.itertuples(), start=1):
         question = row.question
         explanation = row.explanation
-        scores = eval_instance(question, explanation)  # Evaluate using the evaluator
+        scores = eval_instance.evaluate_single(question, explanation)  # Evaluate using the evaluator
         commentary_details = write_evaluation_commentary(scores)  # Generate commentary based on scores
         results.append({
             'Question': question,
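The loop reads row.question and row.explanation from df.itertuples(), so the uploaded file needs question and explanation columns. A hypothetical input file illustrating that shape:

# Hypothetical batch input; the column names are required by the loop above,
# the example rows are made up for illustration.
import pandas as pd

rows = [
    {"question": "What causes rainbows to appear in the sky?",
     "explanation": "Sunlight is refracted, dispersed, and reflected inside water droplets."},
    {"question": "Why is the sky blue?",
     "explanation": "Air molecules scatter shorter (blue) wavelengths of sunlight more strongly."},
]
pd.DataFrame(rows).to_csv("batch_input.csv", index=False)  # file a user might upload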
pages/5_Conversation_Evaluation.py
ADDED
@@ -0,0 +1,127 @@
+import json
+
+import pandas as pd
+import streamlit as st
+from util.evaluator import Evaluator, write_evaluation_commentary
+import os
+
+# Predefined examples
+examples = {
+    'good': [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
+        {"role": "assistant",
+         "content": "Rainbows appear when sunlight is refracted, dispersed, and reflected inside water droplets in the atmosphere, resulting in a spectrum of light appearing in the sky."},
+        {"role": "user", "content": "That's interesting! Why does it create so many colors?"}
+    ],
+    'bad': [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "What causes rainbows to appear in the sky?"},
+        {"role": "assistant",
+         "content": "Rainbows happen because light in the sky gets mixed up and sometimes shows colors when it's raining or when there is water around."},
+        {"role": "user", "content": "That doesn't seem very clear."}
+    ]
+}
+
+
+# Function to check password
+def check_password():
+    def password_entered():
+        if password_input == os.getenv('PASSWORD'):
+            st.session_state['password_correct'] = True
+        else:
+            st.error("Incorrect Password, please try again.")
+
+    password_input = st.text_input("Enter Password:", type="password")
+    submit_button = st.button("Submit", on_click=password_entered)
+
+    if submit_button and not st.session_state.get('password_correct', False):
+        st.error("Please enter a valid password to access the demo.")
+
+
+# Title of the application
+st.title('Single Evaluation of Conversations')
+
+# Description of the application
+st.sidebar.write("""
+### Welcome to the Single Evaluation of Conversations Demo
+This application allows you to evaluate the quality of conversations generated for various contexts using different language models. You can either use predefined examples or input your own conversations and contexts.
+""")
+
+# Explanation of principles
+st.sidebar.write("""
+### Explanation Principles
+When evaluating conversations, consider the following principles mapped to user empowerment and regulatory compliance outcomes:
+
+1. **Factually Correct**: The information should be accurate and relevant to empower users and meet external audit requirements.
+2. **Useful**: Explanations should be clear and meaningful, helping users make informed decisions.
+3. **Context Specific**: Explanations should be tailored to the context of use, enhancing their relevance and utility.
+4. **User Specific**: Explanations should address the needs and preferences of the user, enabling better decision-making.
+5. **Provide Pluralism**: Explanations should present diverse perspectives, allowing users to understand different viewpoints and make well-rounded decisions.
+""")
+
+# Check if password has been validated
+if not st.session_state.get('password_correct', False):
+    check_password()
+else:
+    st.sidebar.success("Password Verified. Proceed with the demo.")
+    model_name = st.selectbox('Select a model:', ['gpt4-1106', 'gpt35-1106'])
+
+    # User choice between predefined examples or their own input
+    input_type = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
+
+    if input_type == 'Use predefined example':
+        example_type = st.radio("Select an example type:", ('good', 'bad'))
+        conversation = examples[example_type]
+        context = "Example context"
+    else:
+        conversation_input = st.text_area('Enter your conversation (JSON format):',
+                                          '[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Who won the world series in 2020?"}, {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."}]')
+        context_input = st.text_input('Enter your context:', 'general user')
+
+        try:
+            conversation = json.loads(conversation_input)
+            context = context_input
+        except json.JSONDecodeError:
+            st.error("Invalid JSON format for conversation.")
+            conversation = None
+            context = None
+
+    st.write('### Conversation')
+    if conversation:
+        st.write(conversation)
+    else:
+        st.write('No conversation entered yet.')
+
+    st.write('### Context')
+    if context:
+        st.write(context)
+    else:
+        st.write('No context entered yet.')
+
+    if st.button('Evaluate Conversation'):
+        if conversation and context:
+            eval = Evaluator(model_name)
+            scores = eval.evaluate_conversation(conversation, context)
+            st.write('### Scores')
+            details = write_evaluation_commentary(scores["aggregate_scores"])
+            df = pd.DataFrame(details)
+            st.write(df)
+
+            data = {
+                'Conversation': conversation,
+                'Context': context,
+                **{detail['Principle']: detail['Score'] for detail in details}
+            }
+            df = pd.DataFrame([data])
+
+            # Convert DataFrame to CSV for download
+            csv = df.to_csv(index=False)
+            st.download_button(
+                label="Download evaluation as CSV",
+                data=csv,
+                file_name='evaluation.csv',
+                mime='text/csv',
+            )
+        else:
+            st.error('Please enter both a conversation and a context to evaluate.')
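This page unwraps scores["aggregate_scores"] before calling write_evaluation_commentary, while evaluate_conversation in util/evaluator.py (below) returns the principle-to-score dict directly; either way, the commentary helper iterates over a flat mapping. A sketch of that shape, using the example values from the evaluator's own prompt:

# Flat mapping consumed by write_evaluation_commentary (it loops over
# scores.items()); the values here are the example JSON from the prompt.
scores = {
    "Factually Correct": 0.9,
    "Useful": 0.85,
    "Context Specific": 0.8,
    "User Specific": 0.75,
    "Provides Pluralism": 0.7,
}
# write_evaluation_commentary(scores) returns a list of dicts with
# 'Principle' and 'Score' keys, which the page flattens into one CSV row.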
util/evaluator.py
CHANGED
@@ -15,7 +15,7 @@ class evaluator:
 
         return scores
 
-    def
+    def evaluate_single(self, question, explanation):
 
         evaluation_prompt = f"""You are provided with a user's question and the corresponding explanation generated by
         an AI model. Your task is to evaluate the explanation based on the following five principles. Each principle
@@ -76,6 +76,69 @@ class evaluator:
 
         return self.validate_scores(scores)
 
+    def format_conversation(self, conversation):
+        formatted_conversation = "\n".join(
+            f"{exchange['role'].capitalize()}: {exchange['content']}" for exchange in conversation
+        )
+        return formatted_conversation
+
+    def evaluate_conversation(self, conversation, context):
+        formatted_conversation = self.format_conversation(conversation)
+        evaluation_prompt = f"""
+        You are provided with a conversation between a user and a chatbot and the context about them. Your task is to evaluate the chatbot explanation in the conversation based on the following five principles. Each principle should be scored on a scale from 0 to 1, where 0 indicates that the principle is not met at all, and 1 indicates that the principle is fully satisfied.
+
+        Conversation:
+        {formatted_conversation}
+
+        Context:
+        {context}
+
+        Evaluation Criteria:
+
+        Factually Correct:
+        Definition: The explanation must be accurate and relevant to the question and the subject matter.
+        Score: (0-1) How factually correct is the explanation? Consider the accuracy of the details provided and their relevance to the question.
+
+        Useful:
+        Definition: The explanation should enable the user to understand the answer better and should facilitate further reasoning or decision-making.
+        Score: (0-1) How useful is the explanation in helping the user understand the answer and make informed decisions?
+
+        Context Specific:
+        Definition: The explanation should be relevant to the specific context or scenario implied by the question.
+        Score: (0-1) How well does the explanation address the specific context or scenario of the question?
+
+        User Specific:
+        Definition: The explanation should cater to the knowledge level and interests of the user, assuming typical or specified user characteristics.
+        Score: (0-1) How well does the explanation cater to the needs and knowledge level of the intended user?
+
+        Provides Pluralism:
+        Definition: The explanation should offer or accommodate multiple viewpoints or interpretations, allowing the user to explore various perspectives.
+        Score: (0-1) How well does the explanation provide or support multiple perspectives?
+
+        After evaluating the provided conversation based on the context and five principles, please format your scores in a JSON dictionary. Directly provide me with the json without any additional text.
+
+        Example JSON format:
+
+        Answer: {{"Factually Correct": 0.9, "Useful": 0.85, "Context Specific": 0.8, "User Specific": 0.75, "Provides Pluralism": 0.7}}
+
+        Answer:
+        """
+
+        print(evaluation_prompt)
+
+        response = self.model.invoke(evaluation_prompt, temperature=0, max_tokens=500).strip()
+        try:
+            scores = json.loads(response)
+        except json.JSONDecodeError:
+            repaired_json = json_repair.repair_json(response, skip_json_loads=True, return_objects=False)
+            try:
+                scores = json.loads(repaired_json)
+            except json.JSONDecodeError:
+                print("Failed to decode JSON response even after repair attempt. Skipping this batch.")
+                return {key: -1 for key in ["Factually Correct", "Useful", "Context Specific", "User Specific", "Provides Pluralism"]}
+
+        return self.validate_scores(scores)
+
 def write_evaluation_commentary(scores):
     evaluation_details = []
     for principle, score in scores.items():
@@ -124,7 +187,15 @@ def write_evaluation_commentary(scores):
     return evaluation_details
 
 if __name__ == '__main__':
+
     eval = evaluator()
-
-
-
+    conversation = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": "Who won the world series in 2020?"},
+        {"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
+        {"role": "user", "content": "Where was it played?"}
+    ]
+    context = "general user, user_background is sports enthusiast"
+    results = eval.evaluate_conversation(conversation, context)
+    print(results)
+    print(write_evaluation_commentary(results))
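For reference, format_conversation flattens the message list into role-prefixed lines before it is interpolated into the prompt; a standalone check of that behavior on the __main__ example above:

# Mirrors format_conversation: each message becomes "Role: content"
# (role capitalized), joined with newlines.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who won the world series in 2020?"},
]
print("\n".join(
    f"{m['role'].capitalize()}: {m['content']}" for m in conversation
))
# System: You are a helpful assistant.
# User: Who won the world series in 2020?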
|