justinxzhao committed
Commit 3e0f8f8 · 1 Parent(s): 577870e

Added per-response plots.

Files changed (4):
  1. app.py +215 -49
  2. constants.py +10 -0
  3. judging_dataclasses.py +15 -0
  4. prompts.py +15 -0
app.py CHANGED
@@ -7,15 +7,18 @@ import anthropic
 from together import Together
 import google.generativeai as genai
 import time
-from typing import List, Optional, Literal, Union
+from typing import List, Optional, Literal, Union, Dict
 from constants import (
     LLM_COUNCIL_MEMBERS,
     PROVIDER_TO_AVATAR_MAP,
     AGGREGATORS,
+    LLM_TO_UI_NAME_MAP,
 )
 from prompts import *
-from judging_dataclasses import *
-
+from judging_dataclasses import DirectAssessmentJudgingResponse
+import pandas as pd
+import seaborn as sns
+import matplotlib.pyplot as plt
 
 dotenv.load_dotenv()
 
@@ -40,6 +43,8 @@ openai_client = OpenAI(
 # anthropic_client = anthropic.Client(api_key=ANTHROPIC_API_KEY)
 anthropic_client = anthropic.Anthropic()
 
+client = OpenAI()
+
 
 def anthropic_streamlit_streamer(stream):
     """
@@ -142,19 +147,43 @@ def get_llm_response_stream(model_identifier, prompt):
 
 
 def get_response_key(model):
-    return model + ".response"
+    return model + "__response"
 
 
 def get_model_from_response_key(response_key):
-    return response_key.split(".")[0]
+    return response_key.split("__")[0]
 
 
-def get_judging_key(judge_model, response_model):
-    return "judge." + judge_model + "." + response_model
+def get_direct_assessment_judging_key(judge_model, response_model):
+    return "direct_assessment_judge__" + judge_model + "__" + response_model
 
 
 def get_aggregator_response_key(model):
-    return model + ".aggregator_response"
+    return model + "__aggregator_response"
+
+
+def create_dataframe_for_direct_assessment_judging_response(
+    response: DirectAssessmentJudgingResponse,
+):
+    # Initialize empty list to collect data
+    data = []
+
+    # Loop through models
+    for judging_model in response.judging_models:
+        model_name = judging_model.model
+        # Loop through criteria_scores
+        for criteria_score in judging_model.criteria_scores:
+            data.append(
+                {
+                    "llm_judge_model": model_name,
+                    "criteria": criteria_score.criterion,
+                    "score": criteria_score.score,
+                    "explanation": criteria_score.explanation,
+                }
+            )
+
+    # Create DataFrame
+    return pd.DataFrame(data)
 
 
 # Streamlit form UI
@@ -177,12 +206,14 @@ def render_criteria_form(criteria_num):
 def get_response_mapping():
     # Inspect the session state for all the responses.
     # This is a dictionary mapping model names to their responses.
-    # The aggregator response is also included in this mapping under the key "<model>.aggregator_response".
+    # The aggregator response is also included in this mapping under the key "<model>__aggregator_response".
     response_mapping = {}
     for key in st.session_state.keys():
-        if key.endswith(".response"):
+        if "judge" in key:
+            continue
+        if key.endswith("__response"):
             response_mapping[get_model_from_response_key(key)] = st.session_state[key]
-        if key.endswith(".aggregator_response"):
+        if key.endswith("__aggregator_response"):
             response_mapping[key] = st.session_state[key]
     return response_mapping
 
@@ -210,9 +241,9 @@ def get_direct_assessment_prompt(
 
 def get_default_direct_assessment_prompt(user_prompt):
     return get_direct_assessment_prompt(
-        DEFAULT_DIRECT_ASSESSMENT_PROMPT,
+        direct_assessment_prompt=DEFAULT_DIRECT_ASSESSMENT_PROMPT,
         user_prompt=user_prompt,
-        response="{{response}}",
+        response="{response}",
         criteria_list=DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST,
         options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
     )
@@ -220,7 +251,10 @@ def get_default_direct_assessment_prompt(user_prompt):
 
 def get_aggregator_prompt(aggregator_prompt, user_prompt, llms):
     responses_from_other_llms = "\n\n".join(
-        [f"{model}: {st.session_state.get(get_response_key(model))}" for model in llms]
+        [
+            f"{get_ui_friendly_name(model)} START\n{st.session_state.get(get_response_key(model))}\n\n{get_ui_friendly_name(model)} END\n\n\n"
+            for model in llms
+        ]
     )
     return aggregator_prompt.format(
         user_prompt=user_prompt,
@@ -236,6 +270,100 @@ def get_default_aggregator_prompt(user_prompt, llms):
     )
 
 
+def get_ui_friendly_name(llm):
+    return LLM_TO_UI_NAME_MAP.get(llm, llm)
+
+
+def get_parse_judging_response_for_direct_assessment_prompt(
+    judging_responses: dict[str, str],
+    criteria_list,
+    options,
+):
+    formatted_judging_responses = "\n\n".join(
+        [
+            f"{get_ui_friendly_name(model)} START\n{judging_responses[model]}\n\n{get_ui_friendly_name(model)} END\n\n\n"
+            for model in judging_responses.keys()
+        ]
+    )
+    return PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
+        judging_responses=formatted_judging_responses,
+        criteria_list=format_criteria_list(criteria_list),
+        options=format_likert_comparison_options(options),
+    )
+
+
+def get_model_from_direct_assessment_judging_key(judging_key):
+    return judging_key.split("__")[1]
+
+
+def get_direct_assessment_judging_responses():
+    # Get the judging responses from the session state.
+    judging_responses = {}
+    for key in st.session_state.keys():
+        if key.startswith("direct_assessment_judge__"):
+            judging_responses[get_model_from_direct_assessment_judging_key(key)] = (
+                st.session_state[key]
+            )
+    return judging_responses
+
+
+def parse_judging_responses(prompt: str) -> DirectAssessmentJudgingResponse:
+    completion = client.beta.chat.completions.parse(
+        model="gpt-4o-mini",
+        messages=[
+            {
+                "role": "system",
+                "content": "Parse the judging responses into structured data.",
+            },
+            {"role": "user", "content": prompt},
+        ],
+        response_format=DirectAssessmentJudgingResponse,
+    )
+    return completion.choices[0].message.parsed
+
+
+def plot_criteria_scores(df):
+    # Group by criteria and calculate mean and std over all judges.
+    grouped = df.groupby(["criteria"]).agg({"score": ["mean", "std"]}).reset_index()
+
+    # Flatten the MultiIndex columns
+    grouped.columns = ["criteria", "mean_score", "std_score"]
+
+    # Fill NaN std with zeros (in case there's only one score per group)
+    grouped["std_score"] = grouped["std_score"].fillna(0)
+
+    # Set up the plot
+    plt.figure(figsize=(8, 5))
+
+    # Create a horizontal bar plot
+    ax = sns.barplot(
+        data=grouped,
+        x="mean_score",
+        y="criteria",
+        hue="criteria",
+        errorbar=None,  # Updated parameter
+        orient="h",
+    )
+
+    # Add error bars manually
+    # Iterate over the bars and add error bars
+    for i, (mean, std) in enumerate(zip(grouped["mean_score"], grouped["std_score"])):
+        # Get the current bar
+        bar = ax.patches[i]
+        # Calculate the center of the bar
+        center = bar.get_y() + bar.get_height() / 2
+        # Add the error bar
+        ax.errorbar(x=mean, y=center, xerr=std, ecolor="black", capsize=3, fmt="none")
+
+    # Set labels and title
+    ax.set_xlabel("")
+    ax.set_ylabel("")
+    plt.tight_layout()
+
+    # Display the plot in Streamlit
+    st.pyplot(plt.gcf())
+
+
 # Main Streamlit App
 def main():
     st.set_page_config(
@@ -291,7 +419,6 @@ def main():
     selected_models = llm_council_selector()
     st.write("Selected Models:", selected_models)
     selected_aggregator = aggregator_selector()
-    # st.write("Selected Aggregator:", selected_aggregator)
 
     # Prompt input
     user_prompt = st.text_area("Enter your prompt:")
@@ -299,19 +426,26 @@ def main():
     if st.button("Submit"):
         st.write("Responses:")
 
+        response_columns = st.columns(3)
+
+        selected_models_to_streamlit_column_map = {
+            model: response_columns[i] for i, model in enumerate(selected_models)
+        }
+
         # Fetching and streaming responses from each selected model
-        # TODO: Make this asynchronous?
-        for model in selected_models:
-            with st.chat_message(
-                model,
-                avatar=PROVIDER_TO_AVATAR_MAP[model],
-            ):
-                message_placeholder = st.empty()
-                stream = get_llm_response_stream(model, user_prompt)
-                if stream:
-                    st.session_state[get_response_key(model)] = (
-                        message_placeholder.write_stream(stream)
-                    )
+        for selected_model in selected_models:
+            with selected_models_to_streamlit_column_map[selected_model]:
+                st.write(get_ui_friendly_name(selected_model))
+                with st.chat_message(
+                    selected_model,
+                    avatar=PROVIDER_TO_AVATAR_MAP[selected_model],
+                ):
+                    message_placeholder = st.empty()
+                    stream = get_llm_response_stream(selected_model, user_prompt)
+                    if stream:
+                        st.session_state[get_response_key(selected_model)] = (
+                            message_placeholder.write_stream(stream)
+                        )
 
         # Get the aggregator prompt.
         aggregator_prompt = get_default_aggregator_prompt(
@@ -319,10 +453,12 @@ def main():
         )
 
         with st.expander("Aggregator Prompt"):
-            st.write(aggregator_prompt)
+            st.code(aggregator_prompt)
 
         # Fetching and streaming response from the aggregator
-        st.write(f"Mixture-of-Agents response from {selected_aggregator}:")
+        st.write(
+            f"Mixture-of-Agents response from {get_ui_friendly_name(selected_aggregator)}"
+        )
         with st.chat_message(
             selected_aggregator,
             avatar=PROVIDER_TO_AVATAR_MAP[selected_aggregator],
@@ -348,11 +484,12 @@ def main():
 
         # Depending on the assessment type, render different forms
         if assessment_type == "Direct Assessment":
-            direct_assessment_prompt = st.text_area(
-                "Prompt for the Direct Assessment",
-                value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
-                height=500,
-            )
+            with st.expander("Direct Assessment Prompt"):
+                direct_assessment_prompt = st.text_area(
+                    "Prompt for the Direct Assessment",
+                    value=get_default_direct_assessment_prompt(user_prompt=user_prompt),
+                    height=500,
+                )
 
             # TODO: Add option to edit criteria list with a basic text field.
             criteria_list = DEFAULT_DIRECT_ASSESSMENT_CRITERIA_LIST
@@ -365,7 +502,7 @@ def main():
 
             response_judging_columns = st.columns(3)
 
-            responses_for_judging_to_streamlit_column_index_map = {
+            responses_for_judging_to_streamlit_column_map = {
                 model: response_judging_columns[i % 3]
                 for i, model in enumerate(responses_for_judging.keys())
            }
@@ -373,37 +510,42 @@ def main():
             # Get judging responses.
             for response_model, response in responses_for_judging.items():
 
-                st_column = response_judging_columns[
-                    responses_for_judging_to_streamlit_column_index_map[
-                        response_model
-                    ]
+                st_column = responses_for_judging_to_streamlit_column_map[
+                    response_model
                 ]
 
                 with st_column:
-
-                    st.write(f"Judging {response_model}")
+                    if "aggregator_response" in response_model:
+                        judging_model_header = "Mixture-of-Agents Response"
+                    else:
+                        judging_model_header = get_ui_friendly_name(response_model)
+                    st.write(f"Judging for {judging_model_header}")
                     judging_prompt = get_direct_assessment_prompt(
-                        direct_assessment_prompt,
-                        user_prompt,
-                        response,
-                        criteria_list,
-                        SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                        direct_assessment_prompt=direct_assessment_prompt,
+                        user_prompt=user_prompt,
+                        response=response,
+                        criteria_list=criteria_list,
+                        options=SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
                     )
 
+                    with st.expander("Final Judging Prompt"):
+                        st.code(judging_prompt)
+
                    for judging_model in selected_models:
-                        with st.expander("Detailed assessments", expanded=True):
+                        with st.expander(
+                            get_ui_friendly_name(judging_model), expanded=False
+                        ):
                             with st.chat_message(
                                 judging_model,
                                 avatar=PROVIDER_TO_AVATAR_MAP[judging_model],
                             ):
-                                st.write(f"Judge: {judging_model}")
                                 message_placeholder = st.empty()
                                 judging_stream = get_llm_response_stream(
                                     judging_model, judging_prompt
                                 )
                                 if judging_stream:
                                     st.session_state[
-                                        get_judging_key(
+                                        get_direct_assessment_judging_key(
                                             judging_model, response_model
                                         )
                                     ] = message_placeholder.write_stream(
@@ -412,6 +554,30 @@ def main():
                     # When all of the judging is finished for the given response, get the actual
                     # values, parsed (use gpt-4o-mini for now) with json mode.
                     # TODO.
+                    judging_responses = get_direct_assessment_judging_responses()
+                    parse_judging_response_prompt = (
+                        get_parse_judging_response_for_direct_assessment_prompt(
+                            judging_responses,
+                            criteria_list,
+                            SEVEN_POINT_DIRECT_ASSESSMENT_OPTIONS,
+                        )
+                    )
+                    # Issue the prompt to openai mini with structured outputs
+                    parsed_judging_responses = parse_judging_responses(
+                        parse_judging_response_prompt
+                    )
+
+                    df = create_dataframe_for_direct_assessment_judging_response(
+                        parsed_judging_responses
+                    )
+                    st.write(df)
+
+                    # Log the output using st.write() under an st.expander
+                    # with st.expander("Parsed Judging Responses", expanded=True):
+                    #     st.write(parsed_judging_responses)
+                    plot_criteria_scores(df)
+
+                    # TODO: Use parsed_judging_responses for further processing or display
 
         elif assessment_type == "Pairwise Comparison":
             pairwise_comparison_prompt = st.text_area(
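Note (not part of the commit): a minimal sketch of the renamed session-state key helpers, assuming the move from "." to "__" separators is meant to keep the keys reversible for model identifiers that themselves contain dots (e.g. the Together Llama 3.1 identifiers in constants.py).

# Sketch of the "__" key scheme; names mirror the helpers in app.py.
def get_response_key(model):
    return model + "__response"

def get_model_from_response_key(response_key):
    return response_key.split("__")[0]

model = "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
key = get_response_key(model)
# Round-trips cleanly; the old split(".")[0] would have truncated at "Meta-Llama-3".
assert get_model_from_response_key(key) == model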
 
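Note (not part of the commit): a self-contained sketch of the new parse-then-plot path, using hand-built objects in place of the gpt-4o-mini structured-output call; the criterion names and scores below are placeholders, not values from the app.

import pandas as pd
from judging_dataclasses import (
    DirectAssessmentCriteriaScores,
    DirectAssessmentCriterionScore,
    DirectAssessmentJudgingResponse,
)

# Stand-in for parse_judging_responses(); criterion names are illustrative only.
parsed = DirectAssessmentJudgingResponse(
    judging_models=[
        DirectAssessmentCriteriaScores(
            model="openai://gpt-4o-mini",
            criteria_scores=[
                DirectAssessmentCriterionScore(
                    criterion="helpfulness", score=6, explanation="Addresses the prompt."
                ),
                DirectAssessmentCriterionScore(
                    criterion="conciseness", score=5, explanation="Slightly verbose."
                ),
            ],
        ),
    ]
)

# Same flattening that create_dataframe_for_direct_assessment_judging_response() performs.
df = pd.DataFrame(
    [
        {
            "llm_judge_model": jm.model,
            "criteria": cs.criterion,
            "score": cs.score,
            "explanation": cs.explanation,
        }
        for jm in parsed.judging_models
        for cs in jm.criteria_scores
    ]
)

# plot_criteria_scores() aggregates mean/std per criterion before drawing the bar plot.
print(df.groupby("criteria")["score"].agg(["mean", "std"]))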
constants.py CHANGED
@@ -24,6 +24,16 @@ PROVIDER_TO_AVATAR_MAP = {
     "anthropic://claude-3-haiku-20240307": "data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxZW0iIGhlaWdodD0iMWVtIiB2aWV3Qm94PSIwIDAgMjQgMjQiPjxwYXRoIGZpbGw9ImN1cnJlbnRDb2xvciIgZD0iTTE3LjMwNCAzLjU0MWgtMy42NzJsNi42OTYgMTYuOTE4SDI0Wm0tMTAuNjA4IDBMMCAyMC40NTloMy43NDRsMS4zNy0zLjU1M2g3LjAwNWwxLjM2OSAzLjU1M2gzLjc0NEwxMC41MzYgMy41NDFabS0uMzcxIDEwLjIyM0w4LjYxNiA3LjgybDIuMjkxIDUuOTQ1WiIvPjwvc3ZnPg==",
 }
 
+LLM_TO_UI_NAME_MAP = {
+    "openai://gpt-4o-mini": "GPT-4 Turbo Mini",
+    "anthropic://claude-3-5-sonnet": "Claude 3 Sonnet",
+    "vertex://gemini-1.5-flash-001": "Gemini 1.5 Flash",
+    "together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo": "Llama 3.1 8B Instruct",
+    "together://meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo": "Llama 3.1 70B Instruct",
+    "together://meta-llama/Llama-3.2-3B-Instruct-Turbo": "Llama 3.2 3B Instruct",
+    "anthropic://claude-3-haiku-20240307": "Claude 3 Haiku",
+}
+
 # AGGREGATORS = ["openai://gpt-4o-mini", "openai://gpt-4o"]
 AGGREGATORS = ["together://meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"]
 
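Note (not part of the commit): LLM_TO_UI_NAME_MAP is consumed by app.py's get_ui_friendly_name(), which falls back to the raw identifier when a model is missing from the map; a quick sketch of that behavior (the unknown identifier below is made up).

LLM_TO_UI_NAME_MAP = {
    "anthropic://claude-3-haiku-20240307": "Claude 3 Haiku",
}

def get_ui_friendly_name(llm):
    # Mirrors the helper added in app.py: unknown models display their raw identifier.
    return LLM_TO_UI_NAME_MAP.get(llm, llm)

print(get_ui_friendly_name("anthropic://claude-3-haiku-20240307"))  # "Claude 3 Haiku"
print(get_ui_friendly_name("openai://some-new-model"))              # falls back to the identifier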
judging_dataclasses.py CHANGED
@@ -26,3 +26,18 @@ class PairwiseComparison(BaseModel):
 
 class JudgingConfig(BaseModel):
     assessment: Union[DirectAssessment, PairwiseComparison]
+
+
+class DirectAssessmentCriterionScore(BaseModel):
+    criterion: str
+    score: int
+    explanation: str
+
+
+class DirectAssessmentCriteriaScores(BaseModel):
+    model: str
+    criteria_scores: List[DirectAssessmentCriterionScore]
+
+
+class DirectAssessmentJudgingResponse(BaseModel):
+    judging_models: List[DirectAssessmentCriteriaScores]
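Note (not part of the commit): these Pydantic models also serve as the response_format schema for the gpt-4o-mini parsing call in app.py. A small offline sketch validating a plain dict; the field values are illustrative, and model_validate assumes Pydantic v2 (v1 would use parse_obj).

from judging_dataclasses import DirectAssessmentJudgingResponse

raw = {
    "judging_models": [
        {
            "model": "anthropic://claude-3-haiku-20240307",
            "criteria_scores": [
                {"criterion": "accuracy", "score": 7, "explanation": "No factual errors."},
            ],
        },
    ],
}

parsed = DirectAssessmentJudgingResponse.model_validate(raw)  # Pydantic v2
print(parsed.judging_models[0].criteria_scores[0].score)  # 7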
prompts.py CHANGED
@@ -1,6 +1,21 @@
 from judging_dataclasses import Criteria
 
 
+PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT = """We are trying to parse the responses from the judges for a direct assessment.
+
+Each judge was asked to give a rating for each of the following criteria, along with an explanation:
+{criteria_list}
+
+The possible options for each criterion are as follows:
+{options}
+
+The responses from the judges are as follows:
+{judging_responses}
+
+Please provide a JSON object with the following structure that includes the model name and the scores for each of the criteria, along with the explanation.
+"""
+
+
 DEFAULT_AGGREGATOR_PROMPT = """We are trying to come up with the best response to a user query based on an aggregation of other responses.
 
 [USER PROMPT START]
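Note (not part of the commit): the new template is filled via str.format() in app.py's get_parse_judging_response_for_direct_assessment_prompt(). The sketch below substitutes pre-rendered strings for format_criteria_list() and format_likert_comparison_options(), which are not shown in this diff.

from prompts import PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT

prompt = PARSE_JUDGING_RESPONSE_FOR_DIRECT_ASSESSMENT_PROMPT.format(
    criteria_list="- accuracy\n- conciseness",       # placeholder criteria rendering
    options="1 (lowest) through 7 (highest)",        # placeholder 7-point scale rendering
    judging_responses="Claude 3 Haiku START\n...\nClaude 3 Haiku END",
)
print(prompt.splitlines()[0])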