from judging_dataclasses import Criteria

PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.

Each judge was asked to give a rating for each of the following criteria, along with an explanation:

{criteria_list}

The possible options for each criterion are as follows:

{options}

The responses from the judges are as follows:

{judging_responses}

Please provide a JSON object that includes the model name and, for each criterion, the score along with the explanation.
"""

DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

Responses from other LLMs:

{responses_from_other_llms}

Please provide a response that combines the best aspects of the responses above."""

DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

The response is as follows:

[RESPONSE START]
{response}
[RESPONSE END]

Please evaluate the quality of the response based on the following criteria:

{criteria_list}

Options:
{options}

For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""

DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]

# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]

# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]

# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]

# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
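
# --- Illustrative sketch only ---
# A minimal example of how the direct assessment constants above might be
# composed into a single prompt. It assumes Criteria exposes `name` and
# `description` attributes (as suggested by the constructor calls above), and
# renders criteria as "- name: description" bullets and options as a
# comma-separated string; the actual rendering used by the judging code may
# differ from this sketch.
def _example_direct_assessment_prompt() -> str:
    criteria_list = "\n".join(
        f"- {criterion.name}: {criterion.description}"
        for criterion in DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
    )
    options = ", ".join(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt="What is the difference between a list and a tuple in Python?",
        response="Lists are mutable, while tuples are immutable.",
        criteria_list=criteria_list,
        options=options,
    )
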

DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.

[USER PROMPT START]
{prompt}
[USER PROMPT END]

[RESPONSE A START]
{first_completion}
[RESPONSE A END]

[RESPONSE B START]
{second_completion}
[RESPONSE B END]

Begin your evaluation by comparing the two responses and provide a short explanation.

Some themes to consider in your evaluation: {themes_to_consider}.

After providing your explanation, output your final verdict as one of the following options:

{pairwise_comparison_options}
"""

DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]

# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B>A", "Response B is better than Response A"),
    ("A=B", "Both responses are of similar quality"),
]
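
# --- Illustrative sketch only ---
# A minimal example of how the pairwise comparison constants above might be
# composed into a single prompt. Rendering themes as "name (description)" and
# verdict options as "label: description" lines is an assumption for
# illustration; the actual formatting used by the judging code may differ.
def _example_pairwise_comparison_prompt() -> str:
    themes_to_consider = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    pairwise_comparison_options = "\n".join(
        f"{label}: {description}"
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt="Summarize the plot of Hamlet in two sentences.",
        first_completion="Prince Hamlet seeks revenge after his father's ghost reveals he was murdered.",
        second_completion="A Danish prince investigates his father's death, and the quest ends in tragedy.",
        themes_to_consider=themes_to_consider,
        pairwise_comparison_options=pairwise_comparison_options,
    )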