Spaces:

yale-nlp
/

InstruSumEval

Runtime error

App Files Community

henryL7 committed on Jul 12

Commit

9bd8edd

•

1 Parent(s): 18902c1

update layout

Browse files

Files changed (5) hide show

.gitignore +2 -0
app.py +4 -0
data/models.yaml +49 -8
src/about.py +6 -4
src/populate.py +7 -2

.gitignore CHANGED Viewed

@@ -14,3 +14,5 @@ logs/
 envs/
 tmp.py

 envs/
 tmp.py
+print.py
+leaderboard.tex

app.py CHANGED Viewed

@@ -45,6 +45,10 @@ with demo:
             )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             with gr.Row():

             )
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+            # with gr.Column(scale=2):
+            # gr.Markdown("""
+            #             ![](src/logo.png)
+            #             """)
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
             with gr.Row():

data/models.yaml CHANGED Viewed

@@ -1,82 +1,123 @@
 - name: 'tulu-2-dpo-70b'
   fdir: 'tulu-2-dpo-70b'
 - name: 'tulu-2-70b'
   fdir: 'tulu-2-70b'
 - name: 'llama-2-70b'
   fdir: 'llama-2-70b'
 - name: 'tulu-2-dpo-13b'
   fdir: 'tulu-2-dpo-13b'
 - name: 'tulu-2-13b'
   fdir: 'tulu-2-13b'
 - name: 'llama-2-13b'
   fdir: 'llama-2-13b'
 - name: 'tulu-2-dpo-7b'
   fdir: 'tulu-2-dpo-7b'
 - name: 'tulu-2-7b'
   fdir: 'tulu-2-7b'
 - name: 'llama-2-7b'
   fdir: 'llama-2-7b'
 - name: 'gemini-1.0-pro'
   fdir: 'gemini-1.0-pro'
 - name: 'gemini-1.5-pro'
   fdir: 'gemini-1.5-pro'
 - name: 'gemini-1.5-flash'
   fdir: 'gemini-1.5-flash'
 - name: 'llama-3-8b'
   fdir: 'llama-3-8b'
 - name: 'gpt-3.5-turbo-0125'
   fdir: 'gpt-3.5-turbo-0125'
 - name: 'gpt-4-0314'
   fdir: 'gpt-4-0314'
 - name: 'gpt-4-0613'
   fdir: 'gpt-4-0613'
 - name: 'gpt-4-1106-preview'
   fdir: 'gpt-4-1106-preview'
 - name: 'gpt-4-0125-preview'
   fdir: 'gpt-4-0125-preview'
 - name: 'gpt-4-turbo-2024-04-09'
   fdir: 'gpt-4-turbo-2024-04-09'
 - name: 'gpt-4o'
   fdir: 'gpt-4o'
 - name: 'claude-3-opus'
   fdir: 'claude-3-opus-20240229'
 - name: 'claude-3-haiku'
   fdir: 'claude-3-haiku-20240307'
 - name: 'claude-3-sonnet'
   fdir: 'claude-3-sonnet-20240229'
 - name: 'claude-2.1'
   fdir: 'claude-2.1'
 - name: 'claude-instant-1.2'
   fdir: 'claude-instant-1.2'
 - name: 'command-r-plus'
   fdir: 'command-r-plus'
-- name: 'llama-3-70b'
-  fdir: 'llama-3-70b'
 - name: 'mistral-7b-v0.2'
   fdir: 'mistral-7b-v0.2'
 - name: 'mistral-7b-v0.1'
   fdir: 'mistral-7b-v0.1'
 - name: 'mixtral-8x7b'
   fdir: 'mixtral-8x7b'
 - name: 'yi-1.5-34b'
   fdir: 'yi-1.5-34b'
 - name: 'yi-1.5-9b'
   fdir: 'yi-1.5-9b'
 - name: 'qwen-1.5-72b'
   fdir: 'qwen-1.5-72b'
 - name: 'qwen-1.5-32b'
   fdir: 'qwen-1.5-32b'
 - name: 'qwen-2-72b'
   fdir: 'qwen-2-72b'
 - name: 'gemma-7b'
   fdir: 'gemma-7b'
 - name: 'gemma-2b'
   fdir: 'gemma-2b'
-- name: 'mistral-7b-v0.3'
-  fdir: 'mistral-7b-v0.3'
 - name: 'glm-4-9b'
   fdir: 'glm-4-9b'
-- name: 'mistral-large'
-  fdir: 'mistral-large'
-- name: 'claude-3.5-sonnet'
-  fdir: 'claude3.5-sonnet'

 - name: 'tulu-2-dpo-70b'
   fdir: 'tulu-2-dpo-70b'
+  url: 'https://huggingface.co/allenai/tulu-2-dpo-70b'
 - name: 'tulu-2-70b'
   fdir: 'tulu-2-70b'
+  url: 'https://huggingface.co/allenai/tulu-2-70b'
 - name: 'llama-2-70b'
   fdir: 'llama-2-70b'
+  url: 'https://huggingface.co/meta-llama/Llama-2-70b-chat-hf'
 - name: 'tulu-2-dpo-13b'
   fdir: 'tulu-2-dpo-13b'
+  url: 'https://huggingface.co/allenai/tulu-2-dpo-13b'
 - name: 'tulu-2-13b'
   fdir: 'tulu-2-13b'
+  url: 'https://huggingface.co/allenai/tulu-2-13b'
 - name: 'llama-2-13b'
   fdir: 'llama-2-13b'
+  url: 'https://huggingface.co/meta-llama/Llama-2-13b-chat-hf'
 - name: 'tulu-2-dpo-7b'
   fdir: 'tulu-2-dpo-7b'
+  url: 'https://huggingface.co/allenai/tulu-2-dpo-7b'
 - name: 'tulu-2-7b'
   fdir: 'tulu-2-7b'
+  url: 'https://huggingface.co/allenai/tulu-2-7b'
 - name: 'llama-2-7b'
   fdir: 'llama-2-7b'
+  url: 'https://huggingface.co/meta-llama/Llama-2-7b-chat-hf'
 - name: 'gemini-1.0-pro'
   fdir: 'gemini-1.0-pro'
+  url: 'https://deepmind.google/technologies/gemini/pro/'
 - name: 'gemini-1.5-pro'
   fdir: 'gemini-1.5-pro'
+  url: 'https://deepmind.google/technologies/gemini/pro/'
 - name: 'gemini-1.5-flash'
   fdir: 'gemini-1.5-flash'
+  url: 'https://deepmind.google/technologies/gemini/flash/'
 - name: 'llama-3-8b'
   fdir: 'llama-3-8b'
+  url: 'https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct'
+- name: 'llama-3-70b'
+  fdir: 'llama-3-70b'
+  url: 'https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct'
 - name: 'gpt-3.5-turbo-0125'
   fdir: 'gpt-3.5-turbo-0125'
+  url: 'https://platform.openai.com/docs/models/gpt-3-5-turbo'
 - name: 'gpt-4-0314'
   fdir: 'gpt-4-0314'
+  url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
 - name: 'gpt-4-0613'
   fdir: 'gpt-4-0613'
+  url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
 - name: 'gpt-4-1106-preview'
   fdir: 'gpt-4-1106-preview'
+  url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
 - name: 'gpt-4-0125-preview'
   fdir: 'gpt-4-0125-preview'
+  url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
 - name: 'gpt-4-turbo-2024-04-09'
   fdir: 'gpt-4-turbo-2024-04-09'
+  url: 'https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4'
 - name: 'gpt-4o'
   fdir: 'gpt-4o'
+  url: 'https://platform.openai.com/docs/models/gpt-4o'
+- name: 'claude-3.5-sonnet'
+  fdir: 'claude3.5-sonnet'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'claude-3-opus'
   fdir: 'claude-3-opus-20240229'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'claude-3-haiku'
   fdir: 'claude-3-haiku-20240307'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'claude-3-sonnet'
   fdir: 'claude-3-sonnet-20240229'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'claude-2.1'
   fdir: 'claude-2.1'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'claude-instant-1.2'
   fdir: 'claude-instant-1.2'
+  url: 'https://docs.anthropic.com/en/docs/about-claude/models'
 - name: 'command-r-plus'
   fdir: 'command-r-plus'
+  url: 'https://huggingface.co/CohereForAI/c4ai-command-r-plus'
+- name: 'mistral-7b-v0.3'
+  fdir: 'mistral-7b-v0.3'
+  url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.3'
 - name: 'mistral-7b-v0.2'
   fdir: 'mistral-7b-v0.2'
+  url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2'
 - name: 'mistral-7b-v0.1'
   fdir: 'mistral-7b-v0.1'
+  url: 'https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1'
 - name: 'mixtral-8x7b'
   fdir: 'mixtral-8x7b'
+  url: 'https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1'
+- name: 'mistral-large'
+  fdir: 'mistral-large'
+  url: 'https://mistral.ai/news/mistral-large/'
 - name: 'yi-1.5-34b'
   fdir: 'yi-1.5-34b'
+  url: 'https://huggingface.co/01-ai/Yi-1.5-34B-Chat'
 - name: 'yi-1.5-9b'
   fdir: 'yi-1.5-9b'
+  url: 'https://huggingface.co/01-ai/Yi-1.5-9B-Chat'
 - name: 'qwen-1.5-72b'
   fdir: 'qwen-1.5-72b'
+  url: 'https://huggingface.co/Qwen/Qwen1.5-72B-Chat'
 - name: 'qwen-1.5-32b'
   fdir: 'qwen-1.5-32b'
+  url: 'https://huggingface.co/Qwen/Qwen1.5-32B-Chat'
 - name: 'qwen-2-72b'
   fdir: 'qwen-2-72b'
+  url: 'https://huggingface.co/Qwen/Qwen2-72B-Instruct'
 - name: 'gemma-7b'
   fdir: 'gemma-7b'
+  url: 'https://huggingface.co/google/gemma-7b-it'
 - name: 'gemma-2b'
   fdir: 'gemma-2b'
+  url: 'https://huggingface.co/google/gemma-2b-it'
 - name: 'glm-4-9b'
   fdir: 'glm-4-9b'
+  url: 'https://huggingface.co/THUDM/glm-4-9b-chat'

src/about.py CHANGED Viewed

@@ -20,6 +20,8 @@ INTRODUCTION_TEXT = """
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
 ### Task
 The LLMs are evaluated as judges in a pairwise comparison task.
 Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
@@ -47,9 +49,9 @@ The [prompt](https://github.com/princeton-nlp/LLMBar/blob/main/LLMEvaluator/eval
 """
 CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
-CITATION_BUTTON_TEXT = r"""@article{liu2023benchmarking,
-  title={Benchmarking generation and evaluation capabilities of large language models for instruction controllable summarization},
   author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
-  journal={arXiv preprint arXiv:2311.09184},
-  year={2023}
 }"""

 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
+![](src/logo.png)
 ### Task
 The LLMs are evaluated as judges in a pairwise comparison task.
 Each judge is presented with two **instruction-controllable** summaries and asked to select the better one.
 """
 CITATION_BUTTON_LABEL = "Please cite our paper if you use InstruSum in your work."
+CITATION_BUTTON_TEXT = r"""@inproceedings{liu2024benchmarking,
+  title={Benchmarking Generation and Evaluation Capabilities of Large Language Models for Instruction Controllable Summarization},
   author={Liu, Yixin and Fabbri, Alexander R and Chen, Jiawen and Zhao, Yilun and Han, Simeng and Joty, Shafiq and Liu, Pengfei and Radev, Dragomir and Wu, Chien-Sheng and Cohan, Arman},
+booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
+    year = "2024",
 }"""

src/populate.py CHANGED Viewed

@@ -8,7 +8,7 @@ import numpy as np
 from datasets import load_dataset
 from .envs import TOKEN
-TYPES = ["str", "number", "number", "number", "number", "number"]
 def read_json(file_path: str) -> list[dict]:
@@ -95,7 +95,12 @@ def load_leaderboard() -> pd.DataFrame:
         acc, agr, models_acc, models_agr = pairwise_meta_eval(
             human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
         )
-        predictions["Model"].append(model["name"])
         predictions["Accuracy"].append(acc)
         predictions["Agreement"].append(agr)
         predictions["Self-Accuracy"].append(models_acc)

 from datasets import load_dataset
 from .envs import TOKEN
+TYPES = ["number", "html", "number", "number", "number", "number"]
 def read_json(file_path: str) -> list[dict]:
         acc, agr, models_acc, models_agr = pairwise_meta_eval(
             human_responses, f"./predictions/{fdir}.jsonl", f"./predictions/{fdir}_swap.jsonl"
         )
+        # predictions["Model"].append(model["name"])
+        # predictions["Model"].append(f"[{model['name']}]({model['url']})")
+        link = model['url']
+        model_name = model['name']
+        output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+        predictions["Model"].append(output)
         predictions["Accuracy"].append(acc)
         predictions["Agreement"].append(agr)
         predictions["Self-Accuracy"].append(models_acc)