Spaces:
Runtime error
Runtime error
Lakoc
commited on
Commit
·
b66f230
1
Parent(s):
49d6897
v0.0.1
Browse files- app.py +84 -345
- compare_significance.py +112 -75
- content.py +15 -43
- model_compare.py +17 -45
- server.py +144 -0
- tasks_metadata.json +204 -0
app.py
CHANGED
@@ -1,390 +1,129 @@
|
|
1 |
-
import glob
|
2 |
import os
|
3 |
-
import logging
|
4 |
|
5 |
-
import pandas as pd
|
6 |
import gradio as gr
|
|
|
7 |
from gradio.themes.utils.sizes import text_md
|
8 |
|
9 |
-
from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN
|
10 |
-
|
11 |
-
|
12 |
-
import json
|
13 |
-
from datetime import datetime
|
14 |
-
from pathlib import Path
|
15 |
-
from uuid import uuid4
|
16 |
-
import time
|
17 |
-
import gradio as gr
|
18 |
-
|
19 |
-
from huggingface_hub import HfApi, snapshot_download
|
20 |
-
|
21 |
-
from compare_significance import check_significance, SUPPORTED_METRICS
|
22 |
-
from model_compare import ModelCompare
|
23 |
-
|
24 |
-
JSON_DATASET_DIR = Path("../json_dataset")
|
25 |
-
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
|
26 |
-
|
27 |
-
JSON_DATASET_PATH = JSON_DATASET_DIR / f"train-{uuid4()}.json"
|
28 |
-
|
29 |
-
api = HfApi()
|
30 |
-
|
31 |
-
ORG= "CZLC"
|
32 |
-
REPO = f"{ORG}/LLM_benchmark_data"
|
33 |
-
|
34 |
-
def greet(name: str) -> str:
|
35 |
-
return "Hello " + name + "!"
|
36 |
-
|
37 |
-
|
38 |
-
DATASET_VERSIONS = ['dev-set-1', 'dev-set-2']
|
39 |
|
40 |
-
|
41 |
|
42 |
-
class LeaderboardServer:
|
43 |
-
def __init__(self, server_address):
|
44 |
-
self.server_address = server_address
|
45 |
-
self.repo_type = "dataset"
|
46 |
-
self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
|
47 |
-
self.submisssion_id_to_file = {} # Map submission ids to file paths
|
48 |
-
|
49 |
-
def on_submit(self):
|
50 |
-
self.local_leaderboard = snapshot_download(self.server_address,repo_type=self.repo_type, token=HF_TOKEN,local_dir = "./")
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
new_results = []
|
56 |
-
submission_ids = set()
|
57 |
-
|
58 |
-
# pre-computed ranks
|
59 |
-
with open(os.path.join(self.local_leaderboard, "metadata", "ranks.json")) as ranks_file:
|
60 |
-
ranks = json.load(ranks_file)
|
61 |
-
model_compare = ModelCompare()
|
62 |
-
ranks = model_compare.get_tasks_ranks(ranks)
|
63 |
-
|
64 |
-
# Models data
|
65 |
-
for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
|
66 |
-
data = json.load(open(submission))
|
67 |
-
submission_id = data["metadata"]["model_description"]
|
68 |
-
|
69 |
-
if submission_id in submission_ids:
|
70 |
-
continue
|
71 |
-
submission_ids.add(submission_id)
|
72 |
-
|
73 |
-
self.submisssion_id_to_file[submission_id] = submission
|
74 |
-
|
75 |
-
|
76 |
-
local_results = {task: list(task_ranks).index(submission_id)+1 for task, task_ranks in ranks.items()}
|
77 |
-
local_results["submission_id"] = submission_id
|
78 |
-
results.append(local_results)
|
79 |
-
dataframe = pd.DataFrame.from_records(results)
|
80 |
-
# Reorder to have the id (model description) first
|
81 |
-
df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
|
82 |
-
dataframe = dataframe[df_order]
|
83 |
-
return dataframe
|
84 |
-
|
85 |
-
def compute_ranks(self):
|
86 |
-
''' Compute rankings on every submit '''
|
87 |
-
|
88 |
-
self.get_leaderboard()
|
89 |
|
90 |
-
ids = list(self.submisssion_id_to_file.keys())
|
91 |
-
rankings = {id: {} for id in ids}
|
92 |
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
rankings[modelA_id][modelB_id] = {
|
99 |
-
task: data["significant"] for task,data in res.items()
|
100 |
-
}
|
101 |
-
rankings[modelB_id][modelA_id] = {
|
102 |
-
task: not data["significant"] for task,data in res.items()
|
103 |
-
}
|
104 |
-
|
105 |
-
return rankings
|
106 |
-
|
107 |
-
|
108 |
-
def compare_models(self, modelA, modelB):
|
109 |
-
modelA_path = self.submisssion_id_to_file.get(modelA)
|
110 |
-
modelB_path = self.submisssion_id_to_file.get(modelB)
|
111 |
-
return check_significance(modelA_path, modelB_path)
|
112 |
-
|
113 |
-
|
114 |
-
def get_rankings(self):
|
115 |
-
# TODO retrieve saved rankings for models on tasks
|
116 |
-
pass
|
117 |
|
118 |
-
def save_json(self,file, submission_name) -> None:
|
119 |
-
filename = os.path.basename(file)
|
120 |
-
api.upload_file(
|
121 |
-
path_or_fileobj=file,
|
122 |
-
path_in_repo=f"data/{submission_name}_{filename}",
|
123 |
-
repo_id=self.server_address,
|
124 |
-
repo_type=self.repo_type,
|
125 |
-
token=HF_TOKEN,
|
126 |
-
)
|
127 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
-
leaderboard_server
|
130 |
-
|
|
|
|
|
|
|
|
|
|
|
131 |
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
|
|
|
|
|
|
|
|
137 |
|
138 |
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
|
141 |
-
# if __name__ == '__main__':
|
142 |
with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
|
143 |
app_state = gr.State({})
|
144 |
-
# with gr.Row():
|
145 |
-
# greet_name = gr.Textbox(label="Name")
|
146 |
-
# greet_output = gr.Textbox(label="Greetings")
|
147 |
-
# greet_btn = gr.Button("Greet")
|
148 |
-
# greet_btn.click(fn=greet, inputs=greet_name, outputs=greet_output).success(
|
149 |
-
# fn=save_json,
|
150 |
-
# inputs=[greet_name, greet_output],
|
151 |
-
# outputs=None,
|
152 |
-
# )
|
153 |
|
154 |
with gr.Row():
|
155 |
with gr.Row():
|
156 |
gr.Markdown(HEADER_MARKDOWN)
|
157 |
|
158 |
with gr.Row():
|
159 |
-
|
160 |
-
# Leaderboards Tab #
|
161 |
-
####################
|
162 |
-
def populate_leaderboard(leaderboard_type, dataset_version):
|
163 |
-
gr.Info('Loading leaderboard...')
|
164 |
-
time.sleep(1)
|
165 |
-
leaderboard_df = leaderboard_server.get_leaderboard()
|
166 |
-
# leaderboard_df = lb_server.get_leaderboard(
|
167 |
-
# submission_type=leaderboard_type, dataset_version=dataset_version)
|
168 |
-
# if leaderboard_df.empty:
|
169 |
-
return leaderboard_df
|
170 |
-
# return leaderboard_df
|
171 |
-
|
172 |
-
|
173 |
-
def create_leaderboard_tab(tab_name: str, idx: int, dataset_version_dropdown: gr.Dropdown):
|
174 |
-
# dataset_version = dataset_version_dropdown.value
|
175 |
-
print(f'Creating tab for {tab_name}, idx={idx}, dataset_version={dataset_version_dropdown}')
|
176 |
-
with gr.Tab(id=tab_name, label=tab_name) as leaderboard_tab:
|
177 |
-
leaderboard_table = gr.DataFrame(populate_leaderboard(tab_name, None)) if idx == 0 \
|
178 |
-
else gr.DataFrame(pd.DataFrame(columns=['No submissions yet']))
|
179 |
-
leaderboard_tab.select(fn=populate_leaderboard,
|
180 |
-
inputs=[gr.Text(tab_name, visible=False)],
|
181 |
-
outputs=[leaderboard_table])
|
182 |
-
return leaderboard_table
|
183 |
-
|
184 |
-
def on_dropdown_change():
|
185 |
-
first_tab_name = LEADERBOARD_TYPES[0]
|
186 |
-
leaderboard_server.on_submit()
|
187 |
-
|
188 |
-
return gr.Tabs(selected=first_tab_name), populate_leaderboard(first_tab_name, None)
|
189 |
-
|
190 |
-
|
191 |
with gr.Tab('Leaderboard') as leaderboards_tab:
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
# with gr.Column():
|
196 |
-
# dataset_version_drop = gr.Dropdown(choices=DATASET_VERSIONS, multiselect=False,
|
197 |
-
# value=DATASET_VERSIONS[-1], label="Dataset",
|
198 |
-
# interactive=True)
|
199 |
-
# with gr.Column():
|
200 |
-
# gr.Markdown('') # Empty column for spacing
|
201 |
-
# with gr.Column():
|
202 |
-
# gr.Markdown('') # Empty column for spacing
|
203 |
-
# with gr.Column():
|
204 |
-
# gr.Markdown('') # Empty column for spacing
|
205 |
-
# with gr.Row():
|
206 |
-
# with gr.Tabs() as leaderboards_tabs:
|
207 |
-
# leaderboard_tables_list = []
|
208 |
-
# for leaderboard_idx, leaderboard_type in enumerate(LEADERBOARD_TYPES):
|
209 |
-
# l_tab = create_leaderboard_tab(leaderboard_type, leaderboard_idx, None)
|
210 |
-
# leaderboard_tables_list.append(l_tab)
|
211 |
-
|
212 |
-
# change the table based on the selected model
|
213 |
-
def on_dropdown_change(model_detail):
|
214 |
-
leaderboard = leaderboard_server.get_leaderboard()
|
215 |
-
return leaderboard[leaderboard["submission_id"] == model_detail]
|
216 |
|
217 |
-
results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None, visible=True)
|
218 |
-
model_detail = gr.Dropdown(choices=list(leaderboard_server.get_leaderboard()["submission_id"]), label="Select model", interactive=True)
|
219 |
-
model_detail_button = gr.Button("Show model detail", interactive=True)
|
220 |
-
model_detail_button.click(
|
221 |
-
fn=on_dropdown_change,
|
222 |
-
inputs=[model_detail],
|
223 |
-
outputs=[results_table]
|
224 |
-
)
|
225 |
-
|
226 |
-
# results_table.select(fn=on_dropdown_change, inputs=[model_detail], outputs=[results_table])
|
227 |
-
|
228 |
-
# dataset_version_drop.select(fn=on_dropdown_change, inputs=[dataset_version_drop],
|
229 |
-
# outputs=[leaderboards_tabs, leaderboard_tables_list[0]])
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
##################
|
234 |
-
# Submission Tab #
|
235 |
-
##################
|
236 |
with gr.Tab('Submission'):
|
237 |
with gr.Column():
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
if not team_name or not submission_zip or not submission_type:
|
243 |
-
raise ValueError('Please fill in all fields')
|
244 |
-
if not os.path.exists(submission_zip):
|
245 |
-
raise ValueError('File does not exist')
|
246 |
-
# if not submission_zip.endswith('.zip'):
|
247 |
-
# raise ValueError('File must be a zip')
|
248 |
-
# if not token:
|
249 |
-
# raise ValueError('Please insert a valid Hugging Face token')
|
250 |
-
|
251 |
-
def process_submission(team_name, submission, submission_type, description,
|
252 |
-
app_state, request: gr.Request):
|
253 |
-
logging.info(f'{team_name}: new submission for track: {submission_type}')
|
254 |
-
try:
|
255 |
-
token = app_state.get('hf_token')
|
256 |
-
validate_submission_inputs(team_name, submission, submission_type, token)
|
257 |
-
except ValueError as err:
|
258 |
-
gr.Warning(str(err))
|
259 |
-
return
|
260 |
-
|
261 |
-
|
262 |
-
# metadata = {'challenge_name': CHALLENGE_NAME,
|
263 |
-
# "dataset_version": DATASET_VERSIONS[-1],
|
264 |
-
# 'team_name': team_name,
|
265 |
-
# 'submission_type': submission_type,
|
266 |
-
# 'description': description,
|
267 |
-
# 'token': token,
|
268 |
-
# 'file_name': os.path.basename(submission_zip),
|
269 |
-
# 'file_size_mb': os.path.getsize(submission_zip) / 1024 / 1024,
|
270 |
-
# 'ip': request.client.host}
|
271 |
-
leaderboard_server.save_json(submission,team_name)
|
272 |
-
|
273 |
-
try:
|
274 |
-
gr.Info('Processing submission...')
|
275 |
-
# response = lb_server.add_submission(token=token, file_path=submission_zip, metadata=metadata)
|
276 |
-
# if 'error' in response:
|
277 |
-
# gr.Warning(f'Failed to process submission - {response["error"]}')
|
278 |
-
# else:
|
279 |
-
gr.Info('Done processing submission')
|
280 |
-
except Exception as e:
|
281 |
-
gr.Warning(f'Submission failed to upload - {e}')
|
282 |
-
|
283 |
-
def on_submit_done():
|
284 |
-
on_dropdown_change()
|
285 |
-
leaderboard_server.on_submit()
|
286 |
-
# leaderboard_tab.children[0] = gr.DataFrame(populate_leaderboard(None, None))
|
287 |
-
# leaderboard_tab.render()
|
288 |
-
return gr.update(value='Submit', interactive=True)
|
289 |
-
|
290 |
-
def show_leaderboard():
|
291 |
-
gr.Info("Loding leaderboard...")
|
292 |
-
return leaderboard_server.get_leaderboard()
|
293 |
|
294 |
-
gr.Markdown(
|
295 |
-
"""
|
296 |
-
# Model submission
|
297 |
-
Model can be compared with other models and submitted\n
|
298 |
-
Click **Compare results** to compare your model with other models in the leaderboard\n
|
299 |
-
Click **Submit results** to submit your model to the leaderboard
|
300 |
-
(Comparison by itself is not a submission)
|
301 |
-
"""
|
302 |
-
)
|
303 |
-
|
304 |
-
submission_team_name_tb = gr.Textbox(label='Team Name')
|
305 |
-
# submission_type_radio = gr.Radio(label='Submission Track', choices=LEADERBOARD_TYPES)
|
306 |
with gr.Row():
|
307 |
description_tb = gr.Textbox(label='Description', type='text')
|
308 |
link_to_model_tb = gr.Textbox(label='Link to model', type='text')
|
309 |
|
310 |
-
with gr.Row():
|
311 |
-
hf_token_tb = gr.Textbox(label='Token', type='password')
|
312 |
-
submissions_24h_txt = gr.Textbox(label='Submissions 24h', value='')
|
313 |
-
|
314 |
submission_file_path = gr.File(label='Upload your results', type='filepath')
|
315 |
-
|
316 |
-
|
317 |
-
# Button that triggers shows the current leaderboard
|
318 |
-
show_results_button = gr.Button("Compare results", interactive=True)
|
319 |
-
show_results_button.click(
|
320 |
-
fn=show_leaderboard,
|
321 |
-
outputs=[compare_results_button]
|
322 |
-
)
|
323 |
-
|
324 |
-
submission_btn = gr.Button(value='Submit results', interactive=True)
|
325 |
-
submission_btn.click(
|
326 |
-
fn=on_submit_pressed,
|
327 |
-
outputs=[submission_btn]
|
328 |
-
).then(
|
329 |
-
fn=process_submission,
|
330 |
-
inputs=[submission_team_name_tb, submission_file_path, description_tb, app_state]
|
331 |
-
).then(
|
332 |
-
fn=on_submit_done,
|
333 |
-
outputs=[submission_btn]
|
334 |
-
)
|
335 |
-
|
336 |
-
# .then(
|
337 |
-
# fn=on_dropdown_change,
|
338 |
-
# outputs=[leaderboards_tabs, leaderboard_tables_list[0]]
|
339 |
-
# )
|
340 |
-
|
341 |
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
# return pd.DataFrame(columns=['Please insert your Hugging Face token'])
|
349 |
-
# # submissions = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
|
350 |
-
# # if submissions.empty:
|
351 |
-
# # submissions = pd.DataFrame(columns=['No submissions yet'])
|
352 |
-
# # return submissions
|
353 |
-
#
|
354 |
-
# gr.Markdown(MY_SUBMISSIONS_TAB_TITLE_MARKDOWN)
|
355 |
-
# my_submissions_table = gr.DataFrame()
|
356 |
-
#
|
357 |
-
# my_submissions_tab.select(fn=on_my_submissions_tab_select, inputs=[app_state],
|
358 |
-
# outputs=[my_submissions_table])
|
359 |
-
# my_submissions_token_tb = gr.Textbox(label='Token', type='password')
|
360 |
-
|
361 |
-
def on_token_insert(hf_token, app_state):
|
362 |
-
gr.Info(f'Verifying token...')
|
363 |
|
364 |
-
|
365 |
-
|
366 |
-
|
|
|
|
|
|
|
367 |
|
368 |
-
|
369 |
-
# Invalid token
|
370 |
-
app_state['hf_token'] = None
|
371 |
-
submissions_24h_str = ''
|
372 |
-
team_submissions_df = pd.DataFrame(columns=['Invalid Token'])
|
373 |
-
gr.Warning('Invalid token')
|
374 |
|
375 |
-
|
376 |
-
|
377 |
-
# submissions_24h_str = f'{submission_count}/{MAX_SUBMISSIONS_PER_24H}'
|
378 |
-
# team_submissions_df = lb_server.get_submissions_by_hf_token(hf_token=hf_token)
|
379 |
-
# if team_submissions_df.empty:
|
380 |
-
# team_submissions_df = pd.DataFrame(columns=['No submissions yet'])
|
381 |
-
# gr.Info('Token verified!')
|
382 |
|
383 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
384 |
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
389 |
|
390 |
main.launch()
|
|
|
|
|
1 |
import os
|
|
|
2 |
|
|
|
3 |
import gradio as gr
|
4 |
+
import pandas as pd
|
5 |
from gradio.themes.utils.sizes import text_md
|
6 |
|
7 |
+
from content import (HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN)
|
8 |
+
from server import LeaderboardServer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
+
leaderboard_server = LeaderboardServer()
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
+
def on_submit_pressed():
|
14 |
+
return gr.update(value='Processing submission...', interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
|
|
|
|
16 |
|
17 |
+
def validate_submission_inputs(team_name, submission_id, link_to_model, submission_file):
|
18 |
+
if not team_name or not submission_id or not link_to_model or not submission_file:
|
19 |
+
raise ValueError('Please fill in all fields')
|
20 |
+
if not os.path.exists(submission_file):
|
21 |
+
raise ValueError('File does not exist')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
+
def process_submission(team_name, submission_id, description, link_to_model, submission_file):
|
25 |
+
try:
|
26 |
+
validate_submission_inputs(team_name, submission_id, link_to_model, submission_file)
|
27 |
+
metadata = {
|
28 |
+
"team_name": team_name,
|
29 |
+
"submission_id": submission_id,
|
30 |
+
"description": description,
|
31 |
+
"link_to_model": link_to_model,
|
32 |
+
}
|
33 |
+
gr.Info('Submission valid, running local tournament...')
|
34 |
|
35 |
+
leaderboard_server.prepare_model_for_submission(submission_file, metadata)
|
36 |
+
except ValueError as err:
|
37 |
+
gr.Warning(str(err))
|
38 |
+
return gr.update(visible=False), gr.update(visible=True), gr.update(interactive=True,
|
39 |
+
visible=True), gr.update(
|
40 |
+
interactive=True, visible=True), gr.update(visible=True), gr.update(
|
41 |
+
value=leaderboard_server.get_leaderboard(leaderboard_server.pre_submit[0]), visible=True)
|
42 |
|
43 |
|
44 |
+
def submit_results():
|
45 |
+
leaderboard_server.save_pre_submit()
|
46 |
+
leaderboard_server.update_leaderboard()
|
47 |
+
gr.Info('Submission successful!')
|
48 |
+
return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
|
49 |
+
visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
|
50 |
+
visible=False), gr.update(visible=False), gr.DataFrame(
|
51 |
+
value=leaderboard_server.get_leaderboard(), visible=True)
|
52 |
|
53 |
|
54 |
+
def erase_presubmit():
|
55 |
+
leaderboard_server.pre_submit = None
|
56 |
+
return gr.update(value='Pre-submit model', visible=True, interactive=True), gr.update(
|
57 |
+
visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(
|
58 |
+
visible=False), gr.update(visible=False)
|
59 |
|
60 |
|
|
|
61 |
with (gr.Blocks(theme=gr.themes.Soft(text_size=text_md), css="footer {visibility: hidden}") as main):
|
62 |
app_state = gr.State({})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
with gr.Row():
|
65 |
with gr.Row():
|
66 |
gr.Markdown(HEADER_MARKDOWN)
|
67 |
|
68 |
with gr.Row():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
with gr.Tab('Leaderboard') as leaderboards_tab:
|
70 |
+
gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
|
71 |
+
results_table = gr.DataFrame(leaderboard_server.get_leaderboard(), interactive=False, label=None,
|
72 |
+
visible=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
with gr.Tab('Submission'):
|
75 |
with gr.Column():
|
76 |
+
gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
|
77 |
+
with gr.Row():
|
78 |
+
submission_team_name_tb = gr.Textbox(label='Team Name')
|
79 |
+
submission_id_tb = gr.Textbox(label='Submission ID')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
with gr.Row():
|
82 |
description_tb = gr.Textbox(label='Description', type='text')
|
83 |
link_to_model_tb = gr.Textbox(label='Link to model', type='text')
|
84 |
|
|
|
|
|
|
|
|
|
85 |
submission_file_path = gr.File(label='Upload your results', type='filepath')
|
86 |
+
pre_submission_btn = gr.Button(value='Pre-submit model', interactive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
|
88 |
+
submit_prompt = gr.Markdown(
|
89 |
+
"""
|
90 |
+
Do you really want to submit a model? This action is irreversible.
|
91 |
+
""",
|
92 |
+
visible=False
|
93 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
|
95 |
+
pre_submit_info = gr.Markdown(
|
96 |
+
"""
|
97 |
+
This is how will ranking look like after your submission:
|
98 |
+
""",
|
99 |
+
visible=False
|
100 |
+
)
|
101 |
|
102 |
+
pre_submit_table = gr.DataFrame(pd.DataFrame(), interactive=False, label=None, visible=False)
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
+
submission_btn_yes = gr.Button(value='Submit model', interactive=False, visible=False)
|
105 |
+
submission_btn_no = gr.Button(value='Reverse process', interactive=False, visible=False)
|
|
|
|
|
|
|
|
|
|
|
106 |
|
107 |
+
pre_submission_btn.click(
|
108 |
+
fn=on_submit_pressed,
|
109 |
+
outputs=[pre_submission_btn]
|
110 |
+
).then(
|
111 |
+
fn=process_submission,
|
112 |
+
inputs=[submission_team_name_tb, submission_id_tb, description_tb, link_to_model_tb,
|
113 |
+
submission_file_path],
|
114 |
+
outputs=[pre_submission_btn, submit_prompt, submission_btn_yes, submission_btn_no, pre_submit_info,
|
115 |
+
pre_submit_table]
|
116 |
+
)
|
117 |
|
118 |
+
submission_btn_yes.click(
|
119 |
+
fn=submit_results,
|
120 |
+
outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
|
121 |
+
pre_submit_table, results_table]
|
122 |
+
)
|
123 |
+
submission_btn_no.click(
|
124 |
+
fn=erase_presubmit,
|
125 |
+
outputs=[pre_submission_btn, submission_btn_yes, submission_btn_no, submit_prompt, pre_submit_info,
|
126 |
+
pre_submit_table]
|
127 |
+
)
|
128 |
|
129 |
main.launch()
|
compare_significance.py
CHANGED
@@ -3,20 +3,18 @@ import json
|
|
3 |
from collections import defaultdict
|
4 |
from typing import Sequence
|
5 |
|
6 |
-
import numpy
|
7 |
import numpy as np
|
8 |
-
from
|
|
|
9 |
from sklearn.metrics import roc_curve, auc
|
10 |
from tqdm import tqdm
|
11 |
|
12 |
-
# from leaderboard import SUPPORTED_METRICS
|
13 |
-
|
14 |
SUPPORTED_METRICS = [
|
15 |
"avg_mcauroc", # for classification tasks
|
16 |
-
"
|
17 |
"acc", # for multichoice tasks
|
18 |
-
"
|
19 |
-
"
|
20 |
]
|
21 |
|
22 |
|
@@ -44,43 +42,70 @@ def _get_CMs(i, probabilities, references, thresholds):
|
|
44 |
return confusion_matrices
|
45 |
|
46 |
|
47 |
-
def
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
scores_B = [1 if pred == ref else 0 for pred, ref in zip(predsB, referencesB)]
|
52 |
t, p = ttest_rel(scores_A, scores_B)
|
53 |
# correct for one-tailed test
|
54 |
p_value = p / 2
|
55 |
-
delta = np.mean(scores_A) - np.mean(scores_B)
|
56 |
return p_value, delta
|
57 |
|
58 |
-
|
59 |
-
|
60 |
-
def
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
70 |
def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
|
71 |
probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
|
72 |
# compute MC-AUC for model A
|
73 |
-
model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=
|
74 |
-
model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=
|
|
|
75 |
|
76 |
# one-tailed test
|
77 |
p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
|
78 |
/ (len(model_A_scores) * len(model_B_scores)))
|
79 |
|
80 |
-
delta = np.mean(model_A_scores) - np.mean(model_B_scores)
|
81 |
return p_value, delta
|
82 |
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
|
85 |
n_classes = list(range(len(probs[0])))
|
86 |
fpr = dict()
|
@@ -93,23 +118,24 @@ def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
|
|
93 |
y_score=[prob[i] for prob in probs])
|
94 |
|
95 |
confusion_matrices = _get_CMs(i, probs, references, thresholds)
|
|
|
96 |
|
97 |
λ = 1.0 # <- Flat prior
|
98 |
# λ = 0.5 # <- Jeffrey's prior
|
99 |
|
100 |
# sample variates for every threshold
|
101 |
-
tpr_variates_for_each_fpr = []
|
102 |
-
for k in range(len(thresholds[i])):
|
103 |
-
|
104 |
-
|
|
|
105 |
|
106 |
# fprs x tpr_variates
|
107 |
-
tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
|
108 |
|
109 |
# now pick 1 variate for each fpr, and compute AUC
|
110 |
auc_scores = []
|
111 |
-
for tpr_variates in
|
112 |
-
desc=f"Computing AUCs for class {i + 1}/{len(n_classes)}"):
|
113 |
auc_score = auc(fpr[i], tpr_variates)
|
114 |
# if numpy.isnan(auc_score):
|
115 |
# auc_score = 0
|
@@ -141,18 +167,27 @@ def read_json(file_path):
|
|
141 |
golds = unzipped_list[0]
|
142 |
probs = unzipped_list[1]
|
143 |
data[task] = (golds, probs), metric
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
156 |
metricA = dataA[task][1]
|
157 |
metricB = dataB[task][1]
|
158 |
assert metricA == metricB
|
@@ -160,32 +195,33 @@ def check_significance_task(fileA, fileB, task, significance_level=0.05):
|
|
160 |
|
161 |
if metricA == "avg_mcauroc":
|
162 |
p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
raise NotImplementedError("Exact match is not supported yet.")
|
170 |
-
elif metricA == "rouge":
|
171 |
-
raise NotImplementedError("Rouge is not supported yet.")
|
172 |
-
elif metricA == "ppl":
|
173 |
-
raise NotImplementedError("Perplexity is not supported yet.")
|
174 |
else:
|
175 |
raise ValueError(f"Unsupported metric {metricA}")
|
176 |
-
|
|
|
|
|
|
|
|
|
177 |
"significant": not (p_value > significance_level),
|
178 |
"p_value": p_value,
|
179 |
"delta": delta,
|
180 |
}
|
181 |
-
|
182 |
|
183 |
def check_significance(fileA, fileB, significance_level=0.05):
|
184 |
-
dataA
|
185 |
-
dataB
|
186 |
-
|
187 |
decisions = dict()
|
188 |
-
|
|
|
|
|
189 |
metricA = dataA[task][1]
|
190 |
metricB = dataB[task][1]
|
191 |
assert metricA == metricB
|
@@ -195,37 +231,38 @@ def check_significance(fileA, fileB, significance_level=0.05):
|
|
195 |
p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
196 |
probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
|
197 |
|
198 |
-
elif metricA
|
199 |
-
p_value, delta =
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
elif metricA == "rouge":
|
204 |
-
raise NotImplementedError("Rouge is not supported yet.")
|
205 |
-
elif metricA == "ppl":
|
206 |
-
raise NotImplementedError("Perplexity is not supported yet.")
|
207 |
else:
|
208 |
raise ValueError(f"Unsupported metric {metricA}")
|
|
|
|
|
209 |
decisions[task] = {
|
210 |
"significant": not (p_value > significance_level),
|
211 |
"p_value": p_value,
|
212 |
"delta": delta,
|
213 |
}
|
|
|
214 |
return decisions
|
215 |
|
216 |
|
217 |
def main():
|
218 |
parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
|
219 |
-
parser.add_argument("--modelA", help="ModelA
|
220 |
-
parser.add_argument("--modelB", help="ModelB
|
221 |
parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
|
222 |
args = parser.parse_args()
|
223 |
|
224 |
result = check_significance(args.modelA, args.modelB, args.significance_level)
|
225 |
print(json.dumps(result, indent=2))
|
226 |
|
|
|
227 |
# harness already returns stderr estimate for sampling distribution
|
228 |
# see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
|
229 |
|
230 |
if __name__ == "__main__":
|
|
|
231 |
main()
|
|
|
3 |
from collections import defaultdict
|
4 |
from typing import Sequence
|
5 |
|
|
|
6 |
import numpy as np
|
7 |
+
from numba import njit, prange
|
8 |
+
from scipy.stats import ttest_rel
|
9 |
from sklearn.metrics import roc_curve, auc
|
10 |
from tqdm import tqdm
|
11 |
|
|
|
|
|
12 |
SUPPORTED_METRICS = [
|
13 |
"avg_mcauroc", # for classification tasks
|
14 |
+
"exact_match", # for QA tasks
|
15 |
"acc", # for multichoice tasks
|
16 |
+
"rouge_raw_r2_mid_f", # for summarization tasks
|
17 |
+
"word_perplexity", # for language modeling tasks
|
18 |
]
|
19 |
|
20 |
|
|
|
42 |
return confusion_matrices
|
43 |
|
44 |
|
45 |
+
def compute_significance_ttest(scores_A, scores_B):
|
46 |
+
delta = np.mean(scores_A) - np.mean(scores_B)
|
47 |
+
if delta <= 0:
|
48 |
+
return 1.0, delta
|
|
|
49 |
t, p = ttest_rel(scores_A, scores_B)
|
50 |
# correct for one-tailed test
|
51 |
p_value = p / 2
|
|
|
52 |
return p_value, delta
|
53 |
|
54 |
+
|
55 |
+
@njit(parallel=True)
|
56 |
+
def compute_significance_bootstrap(scores_A, scores_B):
|
57 |
+
n = len(scores_A)
|
58 |
+
R = 1_000
|
59 |
+
delta_orig = np.mean(scores_A) - np.mean(scores_B)
|
60 |
+
|
61 |
+
if delta_orig <= 0:
|
62 |
+
return 1.0, delta_orig
|
63 |
+
r = 0
|
64 |
+
for _ in prange(R):
|
65 |
+
samples = np.random.choice(n, n, replace=True)
|
66 |
+
temp_A = scores_A[samples]
|
67 |
+
temp_B = scores_B[samples]
|
68 |
+
delta = np.mean(temp_A) - np.mean(temp_B)
|
69 |
+
if delta > 2 * delta_orig:
|
70 |
+
r += 1
|
71 |
+
|
72 |
+
pval = r / R
|
73 |
+
return pval, delta_orig
|
74 |
+
|
75 |
+
|
76 |
def compute_significance_avg_mcauroc(probsA: Sequence[Sequence[float]], referencesA: Sequence[int],
|
77 |
probsB: Sequence[Sequence[float]], referencesB: Sequence[int]):
|
78 |
# compute MC-AUC for model A
|
79 |
+
model_A_scores = get_mc_auc_samples(probsA, referencesA, Nsamples=100)
|
80 |
+
model_B_scores = get_mc_auc_samples(probsB, referencesB, Nsamples=100)
|
81 |
+
delta = np.mean(model_A_scores) - np.mean(model_B_scores)
|
82 |
|
83 |
# one-tailed test
|
84 |
p_value = ((model_A_scores[:, np.newaxis] <= model_B_scores[np.newaxis, :]).sum()
|
85 |
/ (len(model_A_scores) * len(model_B_scores)))
|
86 |
|
|
|
87 |
return p_value, delta
|
88 |
|
89 |
|
90 |
+
# Helper function to convert confusion matrices to numba-compatible arrays
|
91 |
+
def convert_confusion_matrices(confusion_matrices):
|
92 |
+
num_thresholds = len(confusion_matrices)
|
93 |
+
tp = np.empty(num_thresholds)
|
94 |
+
fn = np.empty(num_thresholds)
|
95 |
+
for k in range(num_thresholds):
|
96 |
+
tp[k] = confusion_matrices[k]["TP"]
|
97 |
+
fn[k] = confusion_matrices[k]["FN"]
|
98 |
+
return tp, fn
|
99 |
+
|
100 |
+
|
101 |
+
@njit(parallel=True)
|
102 |
+
def compute_tpr_variates(tp, fn, λ, Nsamples, num_thresholds):
|
103 |
+
tpr_variates_for_each_fpr = np.empty((num_thresholds, Nsamples))
|
104 |
+
for k in prange(num_thresholds):
|
105 |
+
tpr_variates_for_each_fpr[k, :] = np.random.beta(tp[k] + λ, fn[k] + λ, Nsamples)
|
106 |
+
return tpr_variates_for_each_fpr
|
107 |
+
|
108 |
+
|
109 |
def get_mc_auc_samples(probs, references, Nsamples=1_000_000):
|
110 |
n_classes = list(range(len(probs[0])))
|
111 |
fpr = dict()
|
|
|
118 |
y_score=[prob[i] for prob in probs])
|
119 |
|
120 |
confusion_matrices = _get_CMs(i, probs, references, thresholds)
|
121 |
+
tp, fn = convert_confusion_matrices(confusion_matrices)
|
122 |
|
123 |
λ = 1.0 # <- Flat prior
|
124 |
# λ = 0.5 # <- Jeffrey's prior
|
125 |
|
126 |
# sample variates for every threshold
|
127 |
+
# tpr_variates_for_each_fpr = []
|
128 |
+
# for k in range(len(thresholds[i])):
|
129 |
+
# tpr_variates_for_each_fpr.append(
|
130 |
+
# numpy.random.beta(confusion_matrices[k]["TP"] + λ, confusion_matrices[k]["FN"] + λ, Nsamples))
|
131 |
+
tpr_variates_for_each_fpr = compute_tpr_variates(tp, fn, λ, Nsamples, len(thresholds[i]))
|
132 |
|
133 |
# fprs x tpr_variates
|
134 |
+
# tpr_variates_for_each_fpr = np.array(tpr_variates_for_each_fpr)
|
135 |
|
136 |
# now pick 1 variate for each fpr, and compute AUC
|
137 |
auc_scores = []
|
138 |
+
for tpr_variates in tpr_variates_for_each_fpr.T:
|
|
|
139 |
auc_score = auc(fpr[i], tpr_variates)
|
140 |
# if numpy.isnan(auc_score):
|
141 |
# auc_score = 0
|
|
|
167 |
golds = unzipped_list[0]
|
168 |
probs = unzipped_list[1]
|
169 |
data[task] = (golds, probs), metric
|
170 |
+
else:
|
171 |
+
scores = [line[metric] for line in fc["predictions"][task]]
|
172 |
+
data[task] = scores, metric
|
173 |
+
|
174 |
+
# make sure all tasks are submitted
|
175 |
+
METADATA_FILE = "tasks_metadata.json"
|
176 |
+
with open(METADATA_FILE, "r") as f:
|
177 |
+
metadata = json.load(f)
|
178 |
+
|
179 |
+
all_tasks = list(metadata["tasks"].keys())
|
180 |
+
all_missing_tasks = []
|
181 |
+
for task in all_tasks:
|
182 |
+
if task not in data:
|
183 |
+
all_missing_tasks.append(task)
|
184 |
+
if len(all_missing_tasks) > 0:
|
185 |
+
EOLN = "\n"
|
186 |
+
raise ValueError(f"Missing tasks in {file_path}: {EOLN.join(all_missing_tasks)}")
|
187 |
+
return data
|
188 |
+
|
189 |
+
|
190 |
+
def process_task(task, dataA, dataB, significance_level):
|
191 |
metricA = dataA[task][1]
|
192 |
metricB = dataB[task][1]
|
193 |
assert metricA == metricB
|
|
|
195 |
|
196 |
if metricA == "avg_mcauroc":
|
197 |
p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
198 |
+
probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
|
199 |
+
elif metricA in ["acc", "exact_match"]:
|
200 |
+
p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
|
201 |
+
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
202 |
+
p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
|
203 |
+
scores_B=np.array(dataB[task][0]))
|
|
|
|
|
|
|
|
|
|
|
204 |
else:
|
205 |
raise ValueError(f"Unsupported metric {metricA}")
|
206 |
+
|
207 |
+
if delta <= 0:
|
208 |
+
p_value = 1.0
|
209 |
+
|
210 |
+
return task, {
|
211 |
"significant": not (p_value > significance_level),
|
212 |
"p_value": p_value,
|
213 |
"delta": delta,
|
214 |
}
|
215 |
+
|
216 |
|
217 |
def check_significance(fileA, fileB, significance_level=0.05):
|
218 |
+
dataA = read_json(fileA)
|
219 |
+
dataB = read_json(fileB)
|
220 |
+
|
221 |
decisions = dict()
|
222 |
+
_iter = tqdm(list(dataA.keys()))
|
223 |
+
for task in _iter:
|
224 |
+
_iter.set_description(f"Processing task: {task}")
|
225 |
metricA = dataA[task][1]
|
226 |
metricB = dataB[task][1]
|
227 |
assert metricA == metricB
|
|
|
231 |
p_value, delta = compute_significance_avg_mcauroc(probsA=dataA[task][0][1], referencesA=dataA[task][0][0],
|
232 |
probsB=dataB[task][0][1], referencesB=dataB[task][0][0])
|
233 |
|
234 |
+
elif metricA in ["acc", "exact_match"]:
|
235 |
+
p_value, delta = compute_significance_ttest(scores_A=dataA[task][0], scores_B=dataB[task][0])
|
236 |
+
elif metricA in ["rouge_raw_r2_mid_f", "word_perplexity"]:
|
237 |
+
p_value, delta = compute_significance_bootstrap(scores_A=np.array(dataA[task][0]),
|
238 |
+
scores_B=np.array(dataB[task][0]))
|
|
|
|
|
|
|
|
|
239 |
else:
|
240 |
raise ValueError(f"Unsupported metric {metricA}")
|
241 |
+
if delta <= 0:
|
242 |
+
p_value = 1.0
|
243 |
decisions[task] = {
|
244 |
"significant": not (p_value > significance_level),
|
245 |
"p_value": p_value,
|
246 |
"delta": delta,
|
247 |
}
|
248 |
+
|
249 |
return decisions
|
250 |
|
251 |
|
252 |
def main():
    """Command-line entry point: compare two model result files and print the decisions.

    Parses ``--modelA``, ``--modelB`` and ``--significance_level``, passes them to
    ``check_significance`` and prints the returned dictionary as indented JSON.

    Both model files are mandatory; argparse exits with a usage error when one
    is missing instead of handing ``None`` to the file-reading code.
    """
    parser = argparse.ArgumentParser(description="One-tailed test if model A improves over model B.")
    parser.add_argument("--modelA", required=True, help="ModelA JSON file from lm harness.")
    parser.add_argument("--modelB", required=True, help="ModelB JSON file from lm harness.")
    parser.add_argument("--significance_level", type=float, default=0.05, help="Significance level (e.g., 0.05)")
    args = parser.parse_args()

    result = check_significance(args.modelA, args.modelB, args.significance_level)
    print(json.dumps(result, indent=2))
|
261 |
|
262 |
+
|
263 |
# harness already returns stderr estimate for sampling distribution
|
264 |
# see https://github.com/EleutherAI/lm-evaluation-harness/blob/6433bd3fe3033d302b22cdcd53af237e9039ef29/lm_eval/api/metrics.py#L213
|
265 |
|
266 |
if __name__ == "__main__":
    # The CLI is the only script entry point; the files to compare are taken
    # from the command-line arguments rather than from hard-coded paths.
    main()
|
content.py
CHANGED
@@ -2,55 +2,27 @@
|
|
2 |
This file contains the text content for the leaderboard client.
|
3 |
"""
|
4 |
|
5 |
-
|
6 |
-
#
|
7 |
-
|
8 |
-
|
9 |
-
# For details, visit:
|
10 |
-
# 1. [DASR](https://www.chimechallenge.org/current/task1/index)
|
11 |
-
# 2. [NOTSOFAR](https://www.chimechallenge.org/current/task2/index)
|
12 |
-
# 3. [MMCSG](https://www.chimechallenge.org/current/task3/index)
|
13 |
-
#
|
14 |
-
#
|
15 |
-
# ### DASR and NOTSOFAR - the scientific story
|
16 |
-
# Both tasks focus on distant automatic speech recognition and speaker diarization, offering a fundamental comparison
|
17 |
-
# among different system designs:
|
18 |
-
# - Single-channel (SC), 1 device (NOTSOFAR-SC)
|
19 |
-
# - Multi-channel (MC), known-geometry, 1 device (NOTSOFAR-MC)
|
20 |
-
# - Multi-channel (MC), geometry-agnostic, multiple devices (DASR-Constrained-LM and DASR-Unconstrained-LM)
|
21 |
-
#
|
22 |
-
# Featured in both tasks, the NOTSOFAR recorded meeting dataset is leveraged as a common benchmark:
|
23 |
-
# each geometry-agnostic MC system submitted to DASR tracks (constrained or not) will also be **automatically submitted**
|
24 |
-
# to the known-geometry single-device NOTSOFAR-MC track. These entries will be marked with "DASR" to denote their origin.
|
25 |
-
# """
|
26 |
-
HEADER_MARKDOWN = """ """
|
27 |
|
28 |
LEADERBOARD_TAB_TITLE_MARKDOWN = """
|
29 |
-
|
30 |
-
|
31 |
-
|
|
|
32 |
|
33 |
SUBMISSION_TAB_TITLE_MARKDOWN = """
|
34 |
## Submission
|
|
|
35 |
|
36 |
-
|
37 |
-
|
38 |
-
- *Team Name:* The name of your team, as it will appear on the leaderboard'
|
39 |
-
- *Results:* Results zip file to submit
|
40 |
-
- *Submission track:* The track to submit results to
|
41 |
-
- *Token:* Your Hugging Face token
|
42 |
- *Description:* Short description of your submission (optional)
|
|
|
43 |
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
**Team creation:** Upon the first submission, your team name is associated with your Hugging Face user account.
|
48 |
-
Any token generated by your account can be used. All team members should use this specific user's token for
|
49 |
-
future submissions.
|
50 |
-
|
51 |
-
**Submission limit:** 5 submissions per team every 24 hours. Each participant should only belong to one team.
|
52 |
-
Changing team names is allowed, but it is not intended to bypass the daily submission limit.
|
53 |
-
"""
|
54 |
-
|
55 |
-
SUBMISSION_TAB_TITLE_MARKDOWN = """
|
56 |
"""
|
|
|
2 |
This file contains the text content for the leaderboard client.
|
3 |
"""
|
4 |
|
5 |
+
# Markdown shown at the top of the leaderboard page.
HEADER_MARKDOWN = """
# BenCzechMark
Welcome to the leaderboard! Here you can submit your model and compare it with the existing models.
"""

# Markdown heading of the leaderboard tab.
LEADERBOARD_TAB_TITLE_MARKDOWN = """
## Leaderboard
The leaderboard below shows the current ranking of the models...

"""

# Markdown heading of the submission tab, describing the form fields.
SUBMISSION_TAB_TITLE_MARKDOWN = """
## Submission
To submit your model, please fill in the form below.

- *Team name:* The name of your team, as it will appear on the leaderboard
- *Submission ID:* Results json file to submit
- *Description:* Short description of your submission (optional)
- *Link to model:* Link to the model's repository or documentation

After filling in the form, click the **Pre-submit model** button.
This will run a comparison of your model with the existing leaderboard models.
After the tournament is complete, you will be able to submit your model to the leaderboard.
"""
|
model_compare.py
CHANGED
@@ -1,62 +1,34 @@
|
|
1 |
-
|
2 |
from functools import cmp_to_key
|
3 |
-
|
4 |
-
|
5 |
-
class ModelCompare
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
"propaganda_relativizace",
|
10 |
-
"propaganda_argumentace",
|
11 |
-
"propaganda_lokace",
|
12 |
-
"propaganda_nazor",
|
13 |
-
"propaganda_emoce",
|
14 |
-
"propaganda_fabulace",
|
15 |
-
"propaganda_nalepkovani",
|
16 |
-
"propaganda_zamereni",
|
17 |
-
"propaganda_zanr",
|
18 |
-
"propaganda_rusko",
|
19 |
-
"propaganda_strach",
|
20 |
-
"benczechmark_sentiment"]
|
21 |
-
|
22 |
-
def __init__(self, ranks:dict=None):
|
23 |
self.ranks = ranks
|
|
|
24 |
|
25 |
-
def compare_models(self,
|
26 |
if not self.ranks:
|
27 |
raise Exception("Missing model rankings")
|
28 |
-
|
29 |
-
res = self.ranks[
|
30 |
-
if res
|
31 |
return 1
|
32 |
-
elif res
|
33 |
return -1
|
34 |
else:
|
35 |
return -1
|
36 |
|
|
|
|
|
37 |
|
38 |
-
def get_tasks_ranks(self, ranks:dict) -> dict:
|
39 |
-
'''Order models based on the significance improvement'''
|
40 |
-
|
41 |
self.ranks = ranks
|
42 |
-
|
43 |
tasks_ranks = {}
|
44 |
-
|
45 |
models = ranks.keys()
|
46 |
-
for task in self.
|
47 |
self.current_task = task
|
48 |
tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
|
49 |
return tasks_ranks
|
50 |
-
|
51 |
-
|
52 |
-
# models = {
|
53 |
-
# model1 : {
|
54 |
-
# task1 : order_idx
|
55 |
-
# task2 : order_idx
|
56 |
-
# task3 : order_idx
|
57 |
-
# }
|
58 |
-
# }
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
1 |
from functools import cmp_to_key
|
2 |
+
|
3 |
+
|
4 |
+
class ModelCompare:
    """Order models per task from pairwise significance results.

    ``ranks`` is a nested mapping ``ranks[model_a][model_b][task]`` whose value
    is truthy when ``model_a`` is significantly better than ``model_b`` on
    ``task``.
    """

    def __init__(self, tasks, ranks: dict = None):
        # Task currently being sorted; set by get_tasks_ranks before each sort.
        self.current_task = None
        self.ranks = ranks
        self.tasks = tasks

    def _is_better(self, model_a, model_b):
        """Return True when ``model_a`` significantly beats ``model_b`` on the current task."""
        return bool(self.ranks.get(model_a, {}).get(model_b, {}).get(self.current_task, False))

    def compare_models(self, model_a, model_b):
        """Three-way comparison of two models on ``self.current_task``.

        Returns 1 when ``model_a`` is significantly better than ``model_b``,
        -1 when only ``model_b`` is significantly better, and 0 when neither
        beats the other (or both names are the same model). Returning 0 for
        ties keeps the comparator consistent in both directions, so the stable
        sort leaves tied models in their input order.

        Raises:
            ValueError: if no rankings have been provided.
        """
        if not self.ranks:
            raise ValueError("Missing model rankings")

        if model_a == model_b:
            return 0
        # A win for model_a must be recorded in the table, as before.
        if self.ranks[model_a][model_b][self.current_task]:
            return 1
        if self._is_better(model_b, model_a):
            return -1
        return 0

    def get_tasks_ranks(self, ranks: dict) -> dict:
        """Order models based on the significance improvement.

        Stores ``ranks`` on the instance and returns a mapping from each task
        in ``self.tasks`` to the list of model names sorted with
        ``compare_models`` (significantly better models sort later).
        """
        self.ranks = ranks

        tasks_ranks = {}
        models = ranks.keys()
        for task in self.tasks:
            self.current_task = task
            tasks_ranks[task] = sorted(models, key=cmp_to_key(self.compare_models))
        return tasks_ranks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
server.py
ADDED
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import copy
|
2 |
+
import glob
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import pandas as pd
|
8 |
+
from huggingface_hub import HfApi, snapshot_download
|
9 |
+
|
10 |
+
from compare_significance import check_significance
|
11 |
+
from model_compare import ModelCompare
|
12 |
+
|
13 |
+
api = HfApi()
|
14 |
+
|
15 |
+
ORG = "CZLC"
|
16 |
+
REPO = f"{ORG}/LLM_benchmark_data"
|
17 |
+
HF_TOKEN = os.environ.get("HF_TOKEN")
|
18 |
+
TASKS_METADATA_PATH = "./tasks_metadata.json"
|
19 |
+
|
20 |
+
|
21 |
+
class LeaderboardServer:
    """Local view of the leaderboard dataset repository and its pairwise tournament.

    The constructor downloads the dataset repository ``REPO`` with
    ``snapshot_download``. Submissions are read from ``data/*.json`` and the
    pairwise significance results from ``tournament.json`` inside that
    snapshot. A new model is first evaluated with ``prepare_model_for_submission``
    and only uploaded by ``save_pre_submit``.
    """

    def __init__(self):
        self.server_address = REPO
        self.repo_type = "dataset"
        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
                                                   local_dir="./")
        self.submisssion_id_to_file = {}  # Map submission ids to file paths
        self.tasks_metadata = self._load_json(TASKS_METADATA_PATH)['tasks']
        self.submission_ids = set()
        self.comparer = ModelCompare(self.tasks_metadata.keys())
        self.fetch_existing_models()
        self.tournament_results = self.load_tournament_results()
        # Pending submission as (tournament_results, model_id, file), or None.
        self.pre_submit = None

    @staticmethod
    def _load_json(path):
        """Read a JSON file, closing the handle and decoding it as UTF-8."""
        with open(path, encoding="utf-8") as json_file:
            return json.load(json_file)

    def update_leaderboard(self):
        """Re-download the repository snapshot and reload submissions and tournament results."""
        self.local_leaderboard = snapshot_download(self.server_address, repo_type=self.repo_type, token=HF_TOKEN,
                                                   local_dir="./")
        self.fetch_existing_models()
        self.tournament_results = self.load_tournament_results()

    def load_tournament_results(self):
        """Return the content of ``tournament.json`` from the snapshot, or ``{}`` if it is absent."""
        metadata_rank_paths = os.path.join(self.local_leaderboard, "tournament.json")
        if not os.path.exists(metadata_rank_paths):
            return {}
        return self._load_json(metadata_rank_paths)

    def fetch_existing_models(self):
        """Register every ``data/*.json`` submission that carries a ``metadata`` section.

        The submission id is ``<team_name>_<submission_id>``; files without
        metadata are skipped.
        """
        # Models data
        for submission in glob.glob(os.path.join(self.local_leaderboard, "data") + "/*.json"):
            metadata = self._load_json(submission).get('metadata')
            if metadata is None:
                continue
            submission_id = metadata["team_name"] + "_" + metadata["submission_id"]
            self.submission_ids.add(submission_id)

            self.submisssion_id_to_file[submission_id] = submission

    def get_leaderboard(self, tournament_results=None):
        """Build the leaderboard table from tournament results.

        Args:
            tournament_results: results to rank by; when empty or ``None`` the
                stored ``self.tournament_results`` are used.

        Returns:
            A DataFrame with a ``submission_id`` column followed by one column
            per task holding the 1-based position of the submission in that
            task's ordering. Task columns are renamed to their display names.
            A pending pre-submission is placed in the first row. When there are
            no results, a DataFrame with the single column
            ``'No submissions yet'`` is returned.

        Raises:
            gr.Error: if a ranked submission has no known result file.
        """
        rank_based_on = tournament_results if tournament_results else self.tournament_results

        if len(rank_based_on) == 0:
            return pd.DataFrame(columns=['No submissions yet'])

        ranks = self.comparer.get_tasks_ranks(rank_based_on)
        results = []
        for submission in rank_based_on.keys():
            is_pre_submit = bool(self.pre_submit) and submission == self.pre_submit[1]
            path = self.submisssion_id_to_file.get(submission)
            if path is None:
                # Not uploaded yet: only the pending pre-submission is acceptable here.
                if not is_pre_submit:
                    raise gr.Error(f"Internal error: Submission [{submission}] not found")
                path = self.pre_submit[2]
            elif not path:
                raise gr.Error(f"Submission [{submission}] not found")
            data = self._load_json(path)
            submission_id = data["metadata"]["team_name"] + "_" + data["metadata"]["submission_id"]

            local_results = {task: list(task_ranks).index(submission_id) + 1 for task, task_ranks in ranks.items()}
            local_results["submission_id"] = submission_id
            if is_pre_submit:
                results.insert(0, local_results)
            else:
                results.append(local_results)
        dataframe = pd.DataFrame.from_records(results)
        df_order = ["submission_id"] + [col for col in dataframe.columns if col != "submission_id"]
        dataframe = dataframe[df_order]
        dataframe = dataframe.rename(columns={key: value["name"] for key, value in self.tasks_metadata.items()})
        return dataframe

    def start_tournament(self, new_model_id, new_model_file):
        """Compare a new model against every registered submission.

        Works on a deep copy of the stored tournament results, so the stored
        results are not modified. For each registered model the significance
        check is run in both directions and the per-task ``"significant"``
        flags are recorded.

        Returns:
            The extended tournament mapping including ``new_model_id``.
        """
        new_tournament = copy.deepcopy(self.tournament_results)
        new_tournament[new_model_id] = {}
        new_tournament[new_model_id][new_model_id] = {task: False for task in self.tasks_metadata.keys()}

        for model in self.submission_ids:
            res = check_significance(new_model_file, self.submisssion_id_to_file[model])
            res_inverse = check_significance(self.submisssion_id_to_file[model], new_model_file)
            new_tournament[new_model_id][model] = {
                task: data["significant"] for task, data in res.items()
            }
            new_tournament[model][new_model_id] = {
                task: data["significant"] for task, data in res_inverse.items()
            }
        return new_tournament

    def prepare_model_for_submission(self, file, metadata) -> None:
        """Attach ``metadata`` to the result file and run the tournament for it.

        The file is rewritten in place with a ``"metadata"`` entry, then the
        tournament outcome is stored in ``self.pre_submit`` as
        ``(tournament_results, model_id, file)``.
        """
        data = self._load_json(file)
        data["metadata"] = metadata
        with open(file, "w", encoding="utf-8") as f:
            json.dump(data, f)

        model_id = metadata["team_name"] + "_" + metadata["submission_id"]
        tournament_results = self.start_tournament(model_id, file)
        self.pre_submit = tournament_results, model_id, file

    def save_pre_submit(self):
        """Upload the pending submission and its tournament results; no-op without a pending one."""
        if self.pre_submit:
            tournament_results, model_id, file = self.pre_submit
            filename = os.path.basename(file)
            api.upload_file(
                path_or_fileobj=file,
                path_in_repo=f"data/{model_id}_{filename}",
                repo_id=self.server_address,
                repo_type=self.repo_type,
                token=HF_TOKEN,
            )

            # Temporary save tournament results
            tournament_results_path = os.path.join(self.local_leaderboard, "tournament.json")
            with open(tournament_results_path, "w", encoding="utf-8") as f:
                json.dump(tournament_results, f)

            api.upload_file(
                path_or_fileobj=tournament_results_path,
                path_in_repo="tournament.json",
                repo_id=self.server_address,
                repo_type=self.repo_type,
                token=HF_TOKEN,
            )
|
tasks_metadata.json
ADDED
@@ -0,0 +1,204 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"tasks": {
|
3 |
+
"benczechmark_propaganda_argumentace": {
|
4 |
+
"task": "benczechmark_propaganda_argumentace",
|
5 |
+
"name": "P-Argumentace",
|
6 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_argumentace"
|
7 |
+
},
|
8 |
+
"benczechmark_propaganda_fabulace": {
|
9 |
+
"task": "benczechmark_propaganda_fabulace",
|
10 |
+
"name": "P-Fabulace",
|
11 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_fabulace"
|
12 |
+
},
|
13 |
+
"benczechmark_propaganda_nazor": {
|
14 |
+
"task": "benczechmark_propaganda_nazor",
|
15 |
+
"name": "P-Názor",
|
16 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_nazor"
|
17 |
+
},
|
18 |
+
"benczechmark_propaganda_strach": {
|
19 |
+
"task": "benczechmark_propaganda_strach",
|
20 |
+
"name": "P-Strach",
|
21 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_strach"
|
22 |
+
},
|
23 |
+
"benczechmark_propaganda_zamereni": {
|
24 |
+
"task": "benczechmark_propaganda_zamereni",
|
25 |
+
"name": "P-Zaměření",
|
26 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_zamereni"
|
27 |
+
},
|
28 |
+
"benczechmark_propaganda_demonizace": {
|
29 |
+
"task": "benczechmark_propaganda_demonizace",
|
30 |
+
"name": "P-Demonizace",
|
31 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_demonizace"
|
32 |
+
},
|
33 |
+
"benczechmark_propaganda_lokace": {
|
34 |
+
"task": "benczechmark_propaganda_lokace",
|
35 |
+
"name": "P-Lokace",
|
36 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_lokace"
|
37 |
+
},
|
38 |
+
"benczechmark_propaganda_relativizace": {
|
39 |
+
"task": "benczechmark_propaganda_relativizace",
|
40 |
+
"name": "P-Relativizace",
|
41 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_relativizace"
|
42 |
+
},
|
43 |
+
"benczechmark_propaganda_vina": {
|
44 |
+
"task": "benczechmark_propaganda_vina",
|
45 |
+
"name": "P-Vina",
|
46 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_vina"
|
47 |
+
},
|
48 |
+
"benczechmark_propaganda_zanr": {
|
49 |
+
"task": "benczechmark_propaganda_zanr",
|
50 |
+
"name": "P-Žánr",
|
51 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_zanr"
|
52 |
+
},
|
53 |
+
"benczechmark_propaganda_emoce": {
|
54 |
+
"task": "benczechmark_propaganda_emoce",
|
55 |
+
"name": "P-Emoce",
|
56 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_emoce"
|
57 |
+
},
|
58 |
+
"benczechmark_propaganda_nalepkovani": {
|
59 |
+
"task": "benczechmark_propaganda_nalepkovani",
|
60 |
+
"name": "P-Nalepkování",
|
61 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_nalepkovani"
|
62 |
+
},
|
63 |
+
"benczechmark_propaganda_rusko": {
|
64 |
+
"task": "benczechmark_propaganda_rusko",
|
65 |
+
"name": "P-Rusko",
|
66 |
+
"source_url": "https://huggingface.co/datasets/CZLC/propaganda_rusko"
|
67 |
+
},
|
68 |
+
"benczechmark_sentiment_mall": {
|
69 |
+
"task": "benczechmark_sentiment_mall",
|
70 |
+
"name": "S-Mall",
|
71 |
+
"source_url": "https://huggingface.co/datasets/CZLC/mall_sentiment_balanced"
|
72 |
+
},
|
73 |
+
"benczechmark_sentiment_fb": {
|
74 |
+
"task": "benczechmark_sentiment_fb",
|
75 |
+
"name": "S-FB",
|
76 |
+
"source_url": "https://huggingface.co/datasets/CZLC/fb_sentiment_balanced"
|
77 |
+
},
|
78 |
+
"benczechmark_sentiment_csfd": {
|
79 |
+
"task": "benczechmark_sentiment_csfd",
|
80 |
+
"name": "S-CSFD",
|
81 |
+
"source_url": "https://huggingface.co/datasets/CZLC/csfd_sentiment_balanced"
|
82 |
+
},
|
83 |
+
"benczechmark_summarization": {
|
84 |
+
"task": "benczechmark_summarization",
|
85 |
+
"name": "Summarization",
|
86 |
+
"source_url": "https://huggingface.co/datasets/CZLC/sumeczech_downsampled"
|
87 |
+
},
|
88 |
+
"benczechmark_grammarerrorcorrection": {
|
89 |
+
"task": "benczechmark_grammarerrorcorrection",
|
90 |
+
"name": "Grammar Error Correction",
|
91 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cs_gec"
|
92 |
+
},
|
93 |
+
"benczechmark_cs_naturalquestions": {
|
94 |
+
"task": "benczechmark_cs_naturalquestions",
|
95 |
+
"name": "CS Natural Questions",
|
96 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cs_naturalquestions"
|
97 |
+
},
|
98 |
+
"benczechmark_cs_sqad32": {
|
99 |
+
"task": "benczechmark_cs_sqad32",
|
100 |
+
"name": "CS SQAD 3.2",
|
101 |
+
"source_url": "https://huggingface.co/datasets/CZLC/SQAD_3.2"
|
102 |
+
},
|
103 |
+
"benczechmark_cs_triviaQA": {
|
104 |
+
"task": "benczechmark_cs_triviaQA",
|
105 |
+
"name": "CS TriviaQA",
|
106 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cs_triviaqa"
|
107 |
+
},
|
108 |
+
"benczechmark_csfever_nli": {
|
109 |
+
"task": "benczechmark_csfever_nli",
|
110 |
+
"name": "CSFever NLI",
|
111 |
+
"source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/csfever_nli"
|
112 |
+
},
|
113 |
+
"benczechmark_ctkfacts_nli": {
|
114 |
+
"task": "benczechmark_ctkfacts_nli",
|
115 |
+
"name": "CTKFacts NLI",
|
116 |
+
"source_url": "https://huggingface.co/datasets/CZLC/ctu-aic/ctkfacts_nli"
|
117 |
+
},
|
118 |
+
"benczechmark_cs_ner": {
|
119 |
+
"task": "benczechmark_cs_ner",
|
120 |
+
"name": "CS NER",
|
121 |
+
"source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-named-entity-corpus_2.0"
|
122 |
+
},
|
123 |
+
"benczechmark_hellaswag": {
|
124 |
+
"task": "benczechmark_hellaswag",
|
125 |
+
"name": "HellaSwag",
|
126 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cs_hellaswag"
|
127 |
+
},
|
128 |
+
"benczechmark_histcorpus": {
|
129 |
+
"task": "benczechmark_histcorpus",
|
130 |
+
"name": "HistCorpus",
|
131 |
+
"source_url": "https://huggingface.co/datasets/CZLC/benczechmark_histcorpus"
|
132 |
+
},
|
133 |
+
"benczechmark_klokan_qa": {
|
134 |
+
"task": "benczechmark_klokan_qa",
|
135 |
+
"name": "Klokan QA",
|
136 |
+
"source_url": "https://huggingface.co/datasets/hynky/klokan-qa"
|
137 |
+
},
|
138 |
+
"benczechmark_cs_court_decisions_ner": {
|
139 |
+
"task": "benczechmark_cs_court_decisions_ner",
|
140 |
+
"name": "CS Court Decisions NER",
|
141 |
+
"source_url": "https://huggingface.co/datasets/CZLC/fewshot-goes-multilingual/cs_czech-court-decisions-ner"
|
142 |
+
},
|
143 |
+
"benczechmark_umimeto_biology": {
|
144 |
+
"task": "benczechmark_umimeto_biology",
|
145 |
+
"name": "Umimeto.cz - Biology",
|
146 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-biology"
|
147 |
+
},
|
148 |
+
"benczechmark_umimeto_chemistry": {
|
149 |
+
"task": "benczechmark_umimeto_chemistry",
|
150 |
+
"name": "Umimeto.cz - Chemistry",
|
151 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-chemistry"
|
152 |
+
},
|
153 |
+
"benczechmark_umimeto_czech": {
|
154 |
+
"task": "benczechmark_umimeto_czech",
|
155 |
+
"name": "Umimeto.cz - Czech",
|
156 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-czech"
|
157 |
+
},
|
158 |
+
"benczechmark_umimeto_history": {
|
159 |
+
"task": "benczechmark_umimeto_history",
|
160 |
+
"name": "Umimeto.cz - History",
|
161 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-history"
|
162 |
+
},
|
163 |
+
"benczechmark_umimeto_informatics": {
|
164 |
+
"task": "benczechmark_umimeto_informatics",
|
165 |
+
"name": "Umimeto.cz - Informatics",
|
166 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-informatics"
|
167 |
+
},
|
168 |
+
"benczechmark_umimeto_math": {
|
169 |
+
"task": "benczechmark_umimeto_math",
|
170 |
+
"name": "Umimeto.cz - Math",
|
171 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-math"
|
172 |
+
},
|
173 |
+
"benczechmark_umimeto_physics": {
|
174 |
+
"task": "benczechmark_umimeto_physics",
|
175 |
+
"name": "Umimeto.cz - Physics",
|
176 |
+
"source_url": "https://huggingface.co/datasets/CZLC/umimeto-physics"
|
177 |
+
},
|
178 |
+
"benczechmark_cermat_czmath_mc": {
|
179 |
+
"task": "benczechmark_cermat_czmath_mc",
|
180 |
+
"name": "Cermat Czech Math MC",
|
181 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cermat_math_mc"
|
182 |
+
},
|
183 |
+
"benczechmark_cermat_czmath_open": {
|
184 |
+
"task": "benczechmark_cermat_czmath_open",
|
185 |
+
"name": "Cermat Czech Math Open",
|
186 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cermat_math_open"
|
187 |
+
},
|
188 |
+
"benczechmark_cermat_czech_tf": {
|
189 |
+
"task": "benczechmark_cermat_czech_tf",
|
190 |
+
"name": "Cermat Czech Language TF",
|
191 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_tf"
|
192 |
+
},
|
193 |
+
"benczechmark_cermat_czech_mc": {
|
194 |
+
"task": "benczechmark_cermat_czech_mc",
|
195 |
+
"name": "Cermat Czech Language MC",
|
196 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_mc"
|
197 |
+
},
|
198 |
+
"benczechmark_cermat_czech_open": {
|
199 |
+
"task": "benczechmark_cermat_czech_open",
|
200 |
+
"name": "Cermat Czech Language Open",
|
201 |
+
"source_url": "https://huggingface.co/datasets/CZLC/cermat_czech_open"
|
202 |
+
}
|
203 |
+
}
|
204 |
+
}
|