Spaces:

hallucinations-leaderboard
/

leaderboard

Running on CPU Upgrade

pminervini committed on Dec 2, 2023

Commit

90dff75

•

1 Parent(s): 669da77

update

Files changed (3) hide show

src/backend/envs.py CHANGED Viewed

@@ -20,8 +20,9 @@ class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
-    task0 = Task("nq_open", "em", "NQ Open", 64) # 64 ?
-    task1 = Task("triviaqa", "em", "TriviaQA", 64) # 64 ?
 # NUM_FEWSHOT = 64  # Change with your few shot

     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     # task0 = Task("anli_r1", "acc", "ANLI")
     # task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
+    task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper
+    task2 = Task("truthfulqa:mc", "mc2", "TruthfulQA", 0)  # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
 # NUM_FEWSHOT = 64  # Change with your few shot

src/display/utils.py CHANGED Viewed

@@ -73,18 +73,6 @@ class EvalQueueColumn:  # Queue column
     status = ColumnContent("status", "str", True)
-# Define the human baselines
-human_baseline_row = {
-    AutoEvalColumn.model.name: "<p>Human performance</p>",
-    AutoEvalColumn.revision.name: "N/A",
-    AutoEvalColumn.precision.name: None,
-    AutoEvalColumn.average.name: 100.0,
-    AutoEvalColumn.nqopen.name: 100.0,
-    AutoEvalColumn.triviaqa.name: 100.0,
-    AutoEvalColumn.dummy.name: "human_baseline",
-    AutoEvalColumn.model_type.name: "",
-}
 @dataclass
 class ModelDetails:
     name: str

     status = ColumnContent("status", "str", True)
 @dataclass
 class ModelDetails:
     name: str

src/tools/plots.py CHANGED Viewed

@@ -92,10 +92,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
     # Filter the DataFrame based on the specified metrics
     df = df[df["task"].isin(metrics)]
-    # Filter the human baselines based on the specified metrics
-    from src.display.utils import human_baseline_row as HUMAN_BASELINE
-    filtered_human_baselines = {k: v for k, v in HUMAN_BASELINE.items() if k in metrics}
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
@@ -129,21 +125,6 @@ def create_metric_plot_obj(df: pd.DataFrame, metrics: list[str], title: str) ->
     for trace in fig.data:
         metric_color_mapping[trace.name] = trace.line.color
-    # Iterate over filtered human baselines and add horizontal lines to the figure
-    for metric, value in filtered_human_baselines.items():
-        color = metric_color_mapping.get(metric, "blue")  # Retrieve color from mapping; default to blue if not found
-        location = "top left" if metric == "HellaSwag" else "bottom left"  # Set annotation position
-        # Add horizontal line with matched color and positioned annotation
-        fig.add_hline(
-            y=value,
-            line_dash="dot",
-            annotation_text=f"{metric} human baseline",
-            annotation_position=location,
-            annotation_font_size=10,
-            annotation_font_color=color,
-            line_color=color,
-        )
     return fig

     # Filter the DataFrame based on the specified metrics
     df = df[df["task"].isin(metrics)]
     # Create a line figure using plotly express with specified markers and custom data
     fig = px.line(
         df,
     for trace in fig.data:
         metric_color_mapping[trace.name] = trace.line.color
     return fig