Create single-evaluation
Browse files- pages/single-evaluation +80 -0
pages/single-evaluation
ADDED
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import streamlit as st
|
3 |
+
from transformers import pipeline
|
4 |
+
import os
|
5 |
+
|
6 |
+
# Load models
|
7 |
+
@st.cache_resource
def _load_classifier(model_name):
    """Build a text-classification pipeline for ``model_name``.

    Streamlit re-executes this script on every widget interaction, so the
    pipeline is created through ``st.cache_resource``: each model is built
    once per server process and the same object is returned on later reruns.

    Args:
        model_name: Model identifier passed to ``pipeline`` as ``model``.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    return pipeline("text-classification", model=model_name)


model1 = _load_classifier("vectara/hallucination_evaluation_model")
model2 = _load_classifier("sileod/deberta-v3-base-tasksource-nli")
|
9 |
+
|
10 |
+
# Predefined examples
|
11 |
+
# Canned inputs for the demo, keyed by example quality ('good' / 'bad').
# Each entry carries the question, the explanation to be scored, and the
# expected verdict for that explanation.
examples = dict(
    good=dict(
        question="What causes rainbows to appear in the sky?",
        explanation=(
            "Rainbows appear when sunlight is refracted, dispersed, and "
            "reflected inside water droplets in the atmosphere, resulting "
            "in a spectrum of light appearing in the sky."
        ),
        ground_truth="Correct",
    ),
    bad=dict(
        question="What causes rainbows to appear in the sky?",
        explanation=(
            "Rainbows happen because light in the sky gets mixed up and "
            "sometimes shows colors when it's raining or when there is "
            "water around."
        ),
        ground_truth="Incorrect",
    ),
)
|
23 |
+
|
24 |
+
# Function to evaluate explanations using the two models
|
25 |
+
def evaluate_explanation(question, explanation):
    """Score ``explanation`` with both module-level models.

    Args:
        question: Not used by this function; neither model receives it.
        explanation: Text passed to ``model1`` and then to ``model2``.

    Returns:
        A 2-tuple ``(model1(explanation), model2(explanation))``.
    """
    return model1(explanation), model2(explanation)
|
29 |
+
|
30 |
+
# Function to compare vectors (simple difference in scores as example)
|
31 |
+
def compare_vectors(v1, v2):
    """Return the absolute difference between two results' first scores.

    Only the first element of each argument is inspected, and only its
    ``'score'`` entry; any further elements or keys are ignored.

    Args:
        v1: Indexable whose element 0 maps ``'score'`` to a number.
        v2: Indexable whose element 0 maps ``'score'`` to a number.

    Returns:
        ``abs(v1[0]['score'] - v2[0]['score'])``.
    """
    first_score = v1[0]['score']
    second_score = v2[0]['score']
    return abs(first_score - second_score)
|
34 |
+
|
35 |
+
# Title of the application
|
36 |
+
# Rendered unconditionally, i.e. also while the password prompt is showing.
st.title("Dual Model Evaluation of Explanations")
|
37 |
+
|
38 |
+
# Check for password before allowing access
|
39 |
+
def check_password():
    """Render the password prompt and record a successful login.

    Shows a password field and a "Submit" button. When the button is
    clicked, the entered text is compared with the ``PASSWORD`` environment
    variable; on a match ``st.session_state['password_correct']`` is set to
    ``True``, otherwise an error message is displayed.

    Returns:
        None. The outcome is communicated through ``st.session_state``.
    """
    import hmac  # local import: only needed for the comparison below

    def password_entered():
        # Read the widget value from session state instead of a closure
        # variable: the callback runs before the script reruns, so a captured
        # local may still hold the text from the previous run.
        entered = st.session_state.get('password_input', '')
        expected = os.getenv('PASSWORD')
        # An unset PASSWORD never matches. compare_digest is given bytes so
        # that non-ASCII passwords do not raise TypeError, and it avoids
        # leaking the match position through timing.
        if expected is not None and hmac.compare_digest(
            entered.encode('utf-8'), expected.encode('utf-8')
        ):
            st.session_state['password_correct'] = True
        else:
            st.error("Incorrect Password, please try again.")

    st.text_input("Enter Password:", type="password", key='password_input')
    submit_button = st.button("Submit", on_click=password_entered)

    # Still not authenticated after a submit click: prompt again.
    if submit_button and not st.session_state.get('password_correct', False):
        st.error("Please enter a valid password to access the demo.")
|
51 |
+
|
52 |
+
# Password check
|
53 |
+
if st.session_state.get('password_correct', False):
    st.sidebar.success("Password Verified. Proceed with the demo.")

    # Let the user pick between a canned example and free-form input.
    input_mode = st.radio("Choose input type:", ('Use predefined example', 'Enter your own'))
    if input_mode != 'Use predefined example':
        question = st.text_input('Enter your question:', '')
        explanation = st.text_input('Enter your explanation:', '')
        ground_truth = st.text_input('Enter ground truth:', '')
    else:
        example_key = st.radio("Select an example type:", ('good', 'bad'))
        chosen = examples[example_key]
        question, explanation, ground_truth = (
            chosen[field] for field in ('question', 'explanation', 'ground_truth')
        )

    if st.button('Evaluate Explanation'):
        # All three fields must be non-empty before the models are run.
        if not (question and explanation and ground_truth):
            st.error('Please enter a question, explanation, and ground truth to evaluate.')
        else:
            results1, results2 = evaluate_explanation(question, explanation)
            score_gap = compare_vectors(results1, results2)
            for heading, payload in (
                ('### Model 1 Results', results1),
                ('### Model 2 Results', results2),
            ):
                st.write(heading)
                st.write(payload)
            st.write(f'### Score Difference: {score_gap}')
else:
    # Not authenticated yet: show only the password prompt.
    check_password()
|