# sandbox/prompts.py

from judging_dataclasses import Criteria
PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.
Each judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
The responses from the judges are as follows:
{judging_responses}
Please provide a JSON object that includes the model name and, for each criterion, the score and the explanation.
"""
DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Please provide a response that combines the best aspects of the responses above."""
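
# Minimal sketch (not in the original file; the helper name and the response
# rendering format are assumptions): one way the aggregator prompt might be
# assembled from a list of candidate responses.
def _format_aggregator_prompt(user_prompt, responses):
    responses_text = "\n\n".join(
        f"[RESPONSE {i + 1}]\n{response}" for i, response in enumerate(responses)
    )
    return DEFAULT_AGGREGATOR_PROMPT.format(
        user_prompt=user_prompt,
        responses_from_other_llms=responses_text,
    )
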
DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
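
# Minimal sketch (not in the original file; the helper name is hypothetical):
# the direct assessment prompt presumably takes pre-rendered criteria and
# option strings; see the rendering sketches further below.
def _format_direct_assessment_prompt(user_prompt, response, criteria_text, options_text):
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_text,
        options=options_text,
    )
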
DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
Criteria(
name="helpfulness",
description="Provides meaningful information and clear solutions that address the query.",
min_score=1,
max_score=7,
),
Criteria(
name="relevance",
description="Stays on topic and directly relates to the query without unnecessary details.",
min_score=1,
max_score=7,
),
Criteria(
name="conciseness",
description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
min_score=1,
max_score=7,
),
]
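
# Minimal sketch (not in the original file; the helper name and line format are
# assumptions): render a list of Criteria into text for the {criteria_list}
# placeholder, one "name (min-max): description" line per criterion.
def _render_criteria_list(criteria):
    return "\n".join(
        f"- {c.name} ({c.min_score}-{c.max_score}): {c.description}" for c in criteria
    )
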
# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Slightly Disagree",
"Neither Agree Nor Disagree",
"Slightly Agree",
"Agree",
"Strongly Agree",
]
# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Slightly Disagree",
"Slightly Agree",
"Agree",
"Strongly Agree",
]
# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Neither Agree Nor Disagree",
"Agree",
"Strongly Agree",
]
# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Agree",
"Strongly Agree",
]
# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Disagree",
"Neither Agree Nor Disagree",
"Agree",
]
# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
"Disagree",
"Agree",
]
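
# Minimal sketch (not in the original file; the helper name and line format are
# assumptions): render one of the Likert option lists above into text for the
# {options} placeholder, one option per line.
def _render_options(options):
    return "\n".join(f"- {option}" for option in options)
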
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""
DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
(
"helpfulness",
"Provides meaningful information and clear solutions that address the query.",
),
(
"relevance",
"Stays on topic and directly relates to the query without unnecessary details.",
),
(
"conciseness",
"Communicates clearly and efficiently, avoiding excess content while retaining substance.",
),
]
# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
("A>B", "Response A is better than Response B"),
("B<A", "Response B is better than Response A"),
("A=B", "Both responses are equally good"),
]
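
# Minimal sketch (not in the original file; the helper name and the theme/option
# rendering formats are assumptions): assemble the pairwise comparison prompt
# from the defaults above.
def _format_pairwise_comparison_prompt(prompt, first_completion, second_completion):
    themes_text = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    options_text = "\n".join(
        f"{label}: {description}"
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes_text,
        pairwise_comparison_options=options_text,
    )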