# sandbox/prompts.py

from judging_dataclasses import Criteria
PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.
Each judge was asked to give a rating for each of the following criteria, along with an explanation:
{criteria_list}
The possible options for each criterion are as follows:
{options}
The responses from the judges are as follows:
{judging_responses}
Please provide a JSON object that includes the model name and, for each criterion, the score and the explanation.
"""
DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
Responses from other LLMs:
{responses_from_other_llms}
Please provide a response that combines the best aspects of the responses above."""
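
# Minimal sketch (not in the original file; the helper name and the response
# rendering format are assumptions): one way the aggregator prompt might be
# assembled from a list of candidate responses.
def _format_aggregator_prompt(user_prompt, responses):
    responses_text = "\n\n".join(
        f"[RESPONSE {i + 1}]\n{response}" for i, response in enumerate(responses)
    )
    return DEFAULT_AGGREGATOR_PROMPT.format(
        user_prompt=user_prompt,
        responses_from_other_llms=responses_text,
    )
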
DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.
[USER PROMPT START]
{user_prompt}
[USER PROMPT END]
The response is as follows:
[RESPONSE START]
{response}
[RESPONSE END]
Please evaluate the quality of the response based on the following criteria:
{criteria_list}
Options:
{options}
For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""
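
# Minimal sketch (not in the original file; the helper name is hypothetical):
# the direct assessment prompt presumably takes pre-rendered criteria and
# option strings; see the rendering sketches further below.
def _format_direct_assessment_prompt(user_prompt, response, criteria_text, options_text):
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt=user_prompt,
        response=response,
        criteria_list=criteria_text,
        options=options_text,
    )
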
DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
Criteria(
name="helpfulness",
description="Provides meaningful information and clear solutions that address the query.",
min_score=1,
max_score=7,
),
Criteria(
name="relevance",
description="Stays on topic and directly relates to the query without unnecessary details.",
min_score=1,
max_score=7,
),
Criteria(
name="conciseness",
description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
min_score=1,
max_score=7,
),
]
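
# Minimal sketch (not in the original file; the helper name and line format are
# assumptions): render a list of Criteria into text for the {criteria_list}
# placeholder, one "name (min-max): description" line per criterion.
def _render_criteria_list(criteria):
    return "\n".join(
        f"- {c.name} ({c.min_score}-{c.max_score}): {c.description}" for c in criteria
    )
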
# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Slightly Disagree",
"Neither Agree Nor Disagree",
"Slightly Agree",
"Agree",
"Strongly Agree",
]
# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Slightly Disagree",
"Slightly Agree",
"Agree",
"Strongly Agree",
]
# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Neither Agree Nor Disagree",
"Agree",
"Strongly Agree",
]
# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Strongly Disagree",
"Disagree",
"Agree",
"Strongly Agree",
]
# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
"Disagree",
"Neither Agree Nor Disagree",
"Agree",
]
# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
"Disagree",
"Agree",
]
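
# Minimal sketch (not in the original file; the helper name and line format are
# assumptions): render one of the Likert option lists above into text for the
# {options} placeholder, one option per line.
def _render_options(options):
    return "\n".join(f"- {option}" for option in options)
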
DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.
[USER PROMPT START]
{prompt}
[USER PROMPT END]
[RESPONSE A START]
{first_completion}
[RESPONSE A END]
[RESPONSE B START]
{second_completion}
[RESPONSE B END]
Begin your evaluation by comparing the two responses and provide a short explanation. Some themes to consider in your evaluation: {themes_to_consider}.
After providing your explanation, output your final verdict as one of the following options:
{pairwise_comparison_options}
"""
DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
(
"helpfulness",
"Provides meaningful information and clear solutions that address the query.",
),
(
"relevance",
"Stays on topic and directly relates to the query without unnecessary details.",
),
(
"conciseness",
"Communicates clearly and efficiently, avoiding excess content while retaining substance.",
),
]
# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
("A>B", "Response A is better than Response B"),
("B<A", "Response B is better than Response A"),
("A=B", "Both responses are equally good"),
]
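
# Minimal sketch (not in the original file; the helper name and the theme/option
# rendering formats are assumptions): assemble the pairwise comparison prompt
# from the defaults above.
def _format_pairwise_comparison_prompt(prompt, first_completion, second_completion):
    themes_text = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    options_text = "\n".join(
        f"{label}: {description}"
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt=prompt,
        first_completion=first_completion,
        second_completion=second_completion,
        themes_to_consider=themes_text,
        pairwise_comparison_options=options_text,
    )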