from judging_dataclasses import Criteria

PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.

Each judge was asked to give a rating for each of the following criteria, along with an explanation:

{criteria_list}

The possible options for each criterion are as follows:

{options}

The responses from the judges are as follows:

{judging_responses}

Please provide a JSON object that includes the model name and, for each criterion, the score along with the explanation.
"""

DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

Responses from other LLMs:

{responses_from_other_llms}

Please provide a response that combines the best aspects of the responses above."""

DEFAULT_DIRECT_ASSESSMENT_PROMPT = """We are trying to assess the quality of a response to a user query.

[USER PROMPT START]
{user_prompt}
[USER PROMPT END]

The response is as follows:

[RESPONSE START]
{response}
[RESPONSE END]

Please evaluate the quality of the response based on the following criteria:

{criteria_list}

Options:
{options}

For each criterion, provide a short explanation describing how you would evaluate the response based on that criterion. Then, provide your final rating for that criterion."""

DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST = [
    Criteria(
        name="helpfulness",
        description="Provides meaningful information and clear solutions that address the query.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="relevance",
        description="Stays on topic and directly relates to the query without unnecessary details.",
        min_score=1,
        max_score=7,
    ),
    Criteria(
        name="conciseness",
        description="Communicates clearly and efficiently, avoiding excess content while retaining substance.",
        min_score=1,
        max_score=7,
    ),
]

# 7-point Likert scale.
SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Neither Agree Nor Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 6-point Likert scale.
SIX_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Slightly Disagree",
    "Slightly Agree",
    "Agree",
    "Strongly Agree",
]

# 5-point Likert scale.
FIVE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
    "Strongly Agree",
]

# 4-point Likert scale.
FOUR_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Strongly Disagree",
    "Disagree",
    "Agree",
    "Strongly Agree",
]

# 3-point Likert scale.
THREE_POINT_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Neither Agree Nor Disagree",
    "Agree",
]

# 2-point Likert scale.
BINARY_DIRECT_ASSESSMENT_OPTIONS = [
    "Disagree",
    "Agree",
]
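
# --- Illustrative sketch only ---
# A minimal example of how the direct assessment constants above might be
# composed into a single prompt. It assumes Criteria exposes `name` and
# `description` attributes (as suggested by the constructor calls above), and
# renders criteria as "- name: description" bullets and options as a
# comma-separated string; the actual rendering used by the judging code may
# differ from this sketch.
def _example_direct_assessment_prompt() -> str:
    criteria_list = "\n".join(
        f"- {criterion.name}: {criterion.description}"
        for criterion in DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
    )
    options = ", ".join(SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS)
    return DEFAULT_DIRECT_ASSESSMENT_PROMPT.format(
        user_prompt="What is the difference between a list and a tuple in Python?",
        response="Lists are mutable, while tuples are immutable.",
        criteria_list=criteria_list,
        options=options,
    )
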

DEFAULT_PAIRWISE_COMPARISON_PROMPT = """We are trying to compare the quality of two responses to a user query.

[USER PROMPT START]
{prompt}
[USER PROMPT END]

[RESPONSE A START]
{first_completion}
[RESPONSE A END]

[RESPONSE B START]
{second_completion}
[RESPONSE B END]

Begin your evaluation by comparing the two responses and provide a short explanation.

Some themes to consider in your evaluation: {themes_to_consider}.

After providing your explanation, output your final verdict as one of the following options:

{pairwise_comparison_options}
"""

DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER = [
    (
        "helpfulness",
        "Provides meaningful information and clear solutions that address the query.",
    ),
    (
        "relevance",
        "Stays on topic and directly relates to the query without unnecessary details.",
    ),
    (
        "conciseness",
        "Communicates clearly and efficiently, avoiding excess content while retaining substance.",
    ),
]

# COARSE WITH TIE.
DEFAULT_PAIRWISE_COMPARISON_OPTIONS = [
    ("A>B", "Response A is better than Response B"),
    ("B>A", "Response B is better than Response A"),
    ("A=B", "Both responses are of similar quality"),
]
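
# --- Illustrative sketch only ---
# A minimal example of how the pairwise comparison constants above might be
# composed into a single prompt. Rendering themes as "name (description)" and
# verdict options as "label: description" lines is an assumption for
# illustration; the actual formatting used by the judging code may differ.
def _example_pairwise_comparison_prompt() -> str:
    themes_to_consider = "; ".join(
        f"{name} ({description})"
        for name, description in DEFAULT_PAIRWISE_COMPARISON_THEMES_TO_CONSIDER
    )
    pairwise_comparison_options = "\n".join(
        f"{label}: {description}"
        for label, description in DEFAULT_PAIRWISE_COMPARISON_OPTIONS
    )
    return DEFAULT_PAIRWISE_COMPARISON_PROMPT.format(
        prompt="Summarize the plot of Hamlet in two sentences.",
        first_completion="Prince Hamlet seeks revenge after his father's ghost reveals he was murdered.",
        second_completion="A Danish prince investigates his father's death, and the quest ends in tragedy.",
        themes_to_consider=themes_to_consider,
        pairwise_comparison_options=pairwise_comparison_options,
    )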