lixuejing committed on
Commit
05d96a1
·
1 Parent(s): 89e1f12
src/display/utils.py CHANGED
@@ -19,6 +19,7 @@ class ColumnContent:
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
 
22
 
23
  ## Leaderboard columns
24
  auto_eval_column_dict = []
 
19
  displayed_by_default: bool
20
  hidden: bool = False
21
  never_hidden: bool = False
22
+ dummy: bool = False
23
 
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
src/leaderboard/filter_models.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from src.display.formatting import model_hyperlink
2
+ from src.display.utils import AutoEvalColumn
3
+
4
# Models which have been flagged by users as being problematic for one reason or another.
# Maps the model name (the value stored under "model_name_for_query") to the
# discussion link explaining the flag. The special "merged" key is the entry
# used for rows that arrive already flagged (merges and MoEs).
#
# Keys are compared verbatim, so they must not contain stray whitespace:
# a key such as "org / model" or "org/model " would never match a model name.
FLAGGED_MODELS = {
    "merged": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
    "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
    "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
    "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
    "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
    "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
    "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
    "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
    "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
    "fblgit/una-xaberius-34b-v1beta": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/444",
    "jan-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "rwitz2/go-bruins-v2.1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "rwitz2/go-bruins-v2.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "GreenNode/GreenNodeLM-v3olet-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "GreenNode/GreenNodeLM-7B-v4leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "GreenNode/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "viethq188/LeoScorpius-7B-Chat-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "GreenNode/GreenNodeLM-7B-v2leo": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "janai-hq/trinity-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "ignos/LeoScorpius-GreenNode-Alpaca-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "fblgit/una-cybertron-7b-v3-OMA": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "mncai/mistral-7b-dpo-merge-v1.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "mncai/mistral-7b-dpo-v6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "Toten5/LeoScorpius-GreenNode-7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "GreenNode/GreenNodeLM-7B-v1olet": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "quantumaikr/quantum-dpo-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "quantumaikr/quantum-v0.01": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "quantumaikr/quantum-trinity-v0.1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "mncai/mistral-7b-dpo-v5": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "cookinai/BruinHermes": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "jan-ai/Pandora-10.7B-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "v1olet/v1olet_marcoroni-go-bruins-merge-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "v1olet/v1olet_merged_dpo_7B_v3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "rwitz2/pee": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    # Key previously read "zyh3826 / GML-Mistral-merged-v1" and could never match.
    "zyh3826/GML-Mistral-merged-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/503",
    "dillfrescott/trinity-medium": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/474",
    "udkai/Garrulus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/526",
    "dfurman/GarrulusMarcoro-7B-v0.1": "https://huggingface.co/dfurman/GarrulusMarcoro-7B-v0.1/discussions/1",
    "udkai/Turdus": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "eren23/slerp-test-turdus-beagle": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "abideen/NexoNimbus-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "alnrg2arg/test2_3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "nfaheem/Marcoroni-7b-DPO-Merge": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "CultriX/MergeTrix-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    "liminerity/Blur-7b-v1.21": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/548",
    # Merges not indicated
    # NOTE: "gagan3012/MetaModelv2" was also listed a second time with a trailing
    # space (pointing at discussion 511). That copy was unreachable, so the entry
    # below (discussion 510) is the one that has always been in effect.
    "gagan3012/MetaModelv2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "gagan3012/MetaModelv3": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "kyujinpy/Sakura-SOLAR-Instruct-DPO-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "kyujinpy/Sakura-SOLRCA-Math-Instruct-DPO-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "kyujinpy/Sakura-SOLRCA-Instruct-DPO": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "fblgit/LUNA-SOLARkrautLM-Instruct": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "perlthoughts/Marcoroni-8x7B-v3-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "rwitz/go-bruins-v2": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "rwitz/go-bruins": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Walmart-the-bag/Solar-10.7B-Cato": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "aqweteddy/mistral_tv-neural-marconroni": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "NExtNewChattingAI/shark_tank_ai_7_b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Q-bert/MetaMath-Cybertron": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "OpenPipe/mistral-ft-optimized-1227": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "perlthoughts/Falkor-7b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "v1olet/v1olet_merged_dpo_7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Ba2han/BruinsV2-OpHermesNeu-11B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "DopeorNope/You_can_cry_Snowman-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "PistachioAlt/Synatra-MCS-7B-v0.3-RP-Slerp": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/MetaMath-una-cybertron-v2-bf16-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/OpenHermes-2.5-neural-chat-7b-v3-2-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "perlthoughts/Falkor-8x7B-MoE": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "elinas/chronos007-70b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Linear": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/MetaMath-neural-chat-7b-v3-2-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "diffnamehard/Mistral-CatMacaroni-slerp-uncensored-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/neural-chat-7b-v3-1-OpenHermes-2.5-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/MetaMath-NeuralHermes-2.5-Mistral-7B-Ties": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Walmart-the-bag/Misted-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "garage-bAInd/Camel-Platypus2-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "Weyaxi/OpenOrca-Zephyr-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "uukuguy/speechless-mistral-7b-dare-0.85": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/510",
    "DopeorNope/SOLARC-M-10.7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
    "cloudyu/Mixtral_11Bx2_MoE_19B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
    # Key previously carried a trailing space and could never match.
    "DopeorNope/SOLARC-MOE-10.7Bx6": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
    "DopeorNope/SOLARC-MOE-10.7Bx4": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/511",
}

# Models which have been requested by orgs to not be submitted on the leaderboard
DO_NOT_SUBMIT_MODELS = [
    "Voicelab/trurl-2-13b",  # trained on MMLU
    "TigerResearch/tigerbot-70b-chat",  # per authors request
    "TigerResearch/tigerbot-70b-chat-v2",  # per authors request
    "TigerResearch/tigerbot-70b-chat-v4-4k",  # per authors request
]
101
+
102
+
103
def flag_models(leaderboard_data: list[dict]) -> None:
    """Mark flagged models in place and append a discussion link to their display name.

    For every row, a lookup key into ``FLAGGED_MODELS`` is chosen: rows whose
    flagged column is already truthy use the shared ``"merged"`` entry, all
    other rows use their ``"model_name_for_query"`` value. When the key is
    found, the model column gets a "has been flagged!" notice with a link to
    the discussion and the flagged column is set to ``True``; otherwise the
    flagged column is set to ``False``.

    Args:
        leaderboard_data: Row dicts keyed by display column names. Each row
            must contain the flagged column, the model column and
            ``"model_name_for_query"``. Rows are modified in place.

    Note:
        The notice is appended on every call, so running this twice over the
        same rows appends a second notice to rows flagged by the first call.
    """
    for model_data in leaderboard_data:
        # Merges and moes are flagged automatically: they arrive with the
        # flagged column already set and all share the "merged" entry.
        if model_data[AutoEvalColumn.flagged.name]:
            flag_key = "merged"
        else:
            flag_key = model_data["model_name_for_query"]

        # Every value in FLAGGED_MODELS is a URL string, so None means "not listed".
        discussion_url = FLAGGED_MODELS.get(flag_key)
        if discussion_url is None:
            model_data[AutoEvalColumn.flagged.name] = False
            continue

        # The discussion number is the last path segment of the URL.
        issue_num = discussion_url.rsplit("/", 1)[-1]
        issue_link = model_hyperlink(discussion_url, f"See discussion #{issue_num}")
        model_name = model_data[AutoEvalColumn.model.name]
        model_data[AutoEvalColumn.model.name] = f"{model_name} has been flagged! {issue_link}"
        model_data[AutoEvalColumn.flagged.name] = True
123
+
124
+
125
def remove_forbidden_models(leaderboard_data: list[dict]):
    """Remove, in place, every row whose model is listed in ``DO_NOT_SUBMIT_MODELS``.

    Rows are matched on their ``"model_name_for_query"`` value. The list that
    was passed in is the one that is emptied of forbidden rows, and that same
    list object is returned.
    """
    allowed_rows = [
        row
        for row in leaderboard_data
        if row["model_name_for_query"] not in DO_NOT_SUBMIT_MODELS
    ]
    # Slice assignment keeps the caller's list object while replacing its contents.
    leaderboard_data[:] = allowed_rows
    return leaderboard_data
134
+
135
+
136
def filter_models_flags(leaderboard_data: list[dict]) -> None:
    """Strip forbidden models from the rows, then flag what remains.

    Both steps operate on the given list in place; nothing is returned.
    """
    flag_models(remove_forbidden_models(leaderboard_data))
src/leaderboard/read_evals.py CHANGED
@@ -31,6 +31,10 @@ class EvalResult:
31
  num_params: int = 0
32
  date: str = "" # submission date of request file
33
  still_on_hub: bool = False
 
 
 
 
34
 
35
  @classmethod
36
  def init_from_json_file(self, json_filepath):
@@ -104,12 +108,22 @@ class EvalResult:
104
  self.likes = request.get("likes", 0)
105
  self.num_params = request.get("params", 0)
106
  self.date = request.get("submitted_time", "")
 
 
107
  except Exception:
 
108
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
 
 
 
 
 
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -118,20 +132,30 @@ class EvalResult:
118
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
119
  AutoEvalColumn.architecture.name: self.architecture,
120
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
 
121
  AutoEvalColumn.revision.name: self.revision,
122
  AutoEvalColumn.average.name: average,
123
  AutoEvalColumn.license.name: self.license,
124
  AutoEvalColumn.likes.name: self.likes,
125
  AutoEvalColumn.params.name: self.num_params,
126
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
 
 
 
127
  }
128
 
129
  for task in Tasks:
130
- data_dict[task.value.col_name] = self.results[task.value.benchmark]
 
 
 
 
 
 
 
131
 
132
  return data_dict
133
 
134
-
135
  def get_request_file_for_model(requests_path, model_name, precision):
136
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
  request_files = os.path.join(
 
31
  num_params: int = 0
32
  date: str = "" # submission date of request file
33
  still_on_hub: bool = False
34
+ is_merge: bool = False
35
+ flagged: bool = False
36
+ status: str = "FINISHED"
37
+ tags: list = None
38
 
39
  @classmethod
40
  def init_from_json_file(self, json_filepath):
 
108
  self.likes = request.get("likes", 0)
109
  self.num_params = request.get("params", 0)
110
  self.date = request.get("submitted_time", "")
111
+ self.architecture = request.get("architectures", "Unknown")
112
+ self.status = request.get("status", "FAILED")
113
  except Exception:
114
+ self.status = "FAILED"
115
  print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
116
 
117
  def to_dict(self):
118
  """Converts the Eval Result to a dict compatible with our dataframe display"""
119
+ average = 0
120
+ nums = 0
121
+ for v in self.results.values():
122
+ if v is not None and v != 0:
123
+ average += v
124
+ nums += 1
125
+ average = average/nums
126
+
127
  data_dict = {
128
  "eval_name": self.eval_name, # not a column, just a save name,
129
  AutoEvalColumn.precision.name: self.precision.value.name,
 
132
  AutoEvalColumn.weight_type.name: self.weight_type.value.name,
133
  AutoEvalColumn.architecture.name: self.architecture,
134
  AutoEvalColumn.model.name: make_clickable_model(self.full_model),
135
+ AutoEvalColumn.dummy.name: self.full_model,
136
  AutoEvalColumn.revision.name: self.revision,
137
  AutoEvalColumn.average.name: average,
138
  AutoEvalColumn.license.name: self.license,
139
  AutoEvalColumn.likes.name: self.likes,
140
  AutoEvalColumn.params.name: self.num_params,
141
  AutoEvalColumn.still_on_hub.name: self.still_on_hub,
142
+ AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
143
+ AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
144
+ AutoEvalColumn.flagged.name: self.flagged
145
  }
146
 
147
  for task in Tasks:
148
+ #data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
149
+ if task.value.col_name != "CLCC-H":
150
+ data_dict[task.value.col_name] = self.results.get(task.value.benchmark, 0)
151
+ else:
152
+ if self.results.get(task.value.benchmark, 0) == 0:
153
+ data_dict[task.value.col_name] = "-"
154
+ else:
155
+ data_dict[task.value.col_name] = "%.2f" % self.results.get(task.value.benchmark, 0)
156
 
157
  return data_dict
158
 
 
159
  def get_request_file_for_model(requests_path, model_name, precision):
160
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
161
  request_files = os.path.join(