CultriX committed
Commit 7d1b966 · verified · 1 Parent(s): c270341

Update app.py

Files changed (1): app.py (+29 -224)
app.py CHANGED
@@ -10,14 +10,9 @@ from yall import create_yall
import plotly.graph_objs as go
from huggingface_hub import ModelCard

-
-
def calculate_pages(df, items_per_page):
return -(-len(df) // items_per_page) # Equivalent to math.ceil(len(df) / items_per_page)

-
-
- # Function to get model info from Hugging Face API using caching
@st.cache_data
def cached_model_info(api, model):
try:
@@ -25,7 +20,6 @@ def cached_model_info(api, model):
except (RepositoryNotFoundError, RevisionNotFoundError):
return None

- # Function to get model info from DataFrame and update it with likes and tags
@st.cache_data
def get_model_info(df):
api = HfApi()
@@ -40,143 +34,47 @@ def get_model_info(df):
df.loc[index, 'Tags'] = ''
return df

- # Function to convert markdown table to DataFrame and extract Hugging Face URLs
def convert_markdown_table_to_dataframe(md_content):
- """
- Converts markdown table to Pandas DataFrame, handling special characters and links,
- extracts Hugging Face URLs, and adds them to a new column.
- """
- # Remove leading and trailing | characters
cleaned_content = re.sub(r'\|\s*$', '', re.sub(r'^\|\s*', '', md_content, flags=re.MULTILINE), flags=re.MULTILINE)
-
- # Create DataFrame from cleaned content
df = pd.read_csv(StringIO(cleaned_content), sep="\|", engine='python')
-
- # Remove the first row after the header
df = df.drop(0, axis=0)
-
- # Strip whitespace from column names
df.columns = df.columns.str.strip()
-
- # Extract Hugging Face URLs and add them to a new column
model_link_pattern = r'\[(.*?)\]\((.*?)\)\s*\[.*?\]\(.*?\)'
df['URL'] = df['Model'].apply(lambda x: re.search(model_link_pattern, x).group(2) if re.search(model_link_pattern, x) else None)
-
- # Clean Model column to have only the model link text
df['Model'] = df['Model'].apply(lambda x: re.sub(model_link_pattern, r'\1', x))
-
return df

- @st.cache_data
- def get_model_info(df):
- api = HfApi()
-
- # Initialize new columns for likes and tags
- df['Likes'] = None
- df['Tags'] = None
-
- # Iterate through DataFrame rows
- for index, row in df.iterrows():
- model = row['Model'].strip()
- try:
- model_info = api.model_info(repo_id=str(model))
- df.loc[index, 'Likes'] = model_info.likes
- df.loc[index, 'Tags'] = ', '.join(model_info.tags)
-
- except (RepositoryNotFoundError, RevisionNotFoundError):
- df.loc[index, 'Likes'] = -1
- df.loc[index, 'Tags'] = ''
-
- return df
-
- #def calculate_highest_combined_score(data, column):
- # score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
- # # Ensure the column exists and has numeric data
- # if column not in data.columns or not pd.api.types.is_numeric_dtype(data[column]):
- # return column, {}
- # scores = data[column].dropna().tolist()
- # models = data['Model'].tolist()
- # top_combinations = {r: [] for r in range(2, 5)}
- # for r in range(2, 5):
- # for combination in combinations(zip(scores, models), r):
- # combined_score = sum(score for score, _ in combination)
- # top_combinations[r].append((combined_score, tuple(model for _, model in combination)))
- # top_combinations[r].sort(key=lambda x: x[0], reverse=True)
- # top_combinations[r] = top_combinations[r][:5]
- # return column, top_combinations
-
- ## Modified function to display the results of the highest combined scores using st.dataframe
- #def display_highest_combined_scores(data):
- # score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
- # with st.spinner('Calculating highest combined scores...'):
- # results = [calculate_highest_combined_score(data, col) for col in score_columns]
- # for column, top_combinations in results:
- # st.subheader(f"Top Combinations for {column}")
- # for r, combinations in top_combinations.items():
- # # Prepare data for DataFrame
- # rows = [{'Score': score, 'Models': ', '.join(combination)} for score, combination in combinations]
- # df = pd.DataFrame(rows)
- #
- # # Display using st.dataframe
- # st.markdown(f"**Number of Models: {r}**")
- # st.dataframe(df, height=150) # Adjust height as necessary
-
-
-
-
- # Function to create bar chart for a given category
def create_bar_chart(df, category):
- """Create and display a bar chart for a given category."""
st.write(f"### {category} Scores")
-
- # Sort the DataFrame based on the category score
sorted_df = df[['Model', category]].sort_values(by=category, ascending=True)
-
- # Create the bar chart with a color gradient (using 'Viridis' color scale as an example)
fig = go.Figure(go.Bar(
x=sorted_df[category],
y=sorted_df['Model'],
orientation='h',
- marker=dict(color=sorted_df[category], colorscale='Picnic') # You can change 'Viridis' to another color scale
+ marker=dict(color=sorted_df[category], colorscale='Agsunset')
))
-
- # Update layout for better readability
fig.update_layout(
margin=dict(l=20, r=20, t=20, b=20)
)
-
- # Adjust the height of the chart based on the number of rows in the DataFrame
st.plotly_chart(fig, use_container_width=True, height=len(df) * 35)

def fetch_merge_configs(df):
- # Sort the data by the second column (assuming the column name is 'Average')
df_sorted = df.sort_values(by='Average', ascending=False)
-
- # Open the file in append mode
with open('/tmp/configurations.txt', 'a') as file:
- # Get model cards for the top 20 entries
for index, row in df_sorted.head(20).iterrows():
model_name = row['Model'].rstrip()
card = ModelCard.load(model_name)
file.write(f'Model Name: {model_name}\n')
- file.write(f'Scores: {row["Average"]}\n') # Assuming 'Average' is the benchmark score
+ file.write(f'Scores: {row["Average"]}\n')
file.write(f'AGIEval: {row["AGIEval"]}\n')
file.write(f'GPT4All: {row["GPT4All"]}\n')
file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
file.write(f'Bigbench: {row["Bigbench"]}\n')
file.write(f'Model Card: {card}\n')
-
- # Open the second file in read mode
with open('/tmp/configurations.txt', 'r') as file:
- # Read the content
content = file.read()
-
- # Find all text between 'yaml' and '```'
matches = re.findall(r'yaml(.*?)```', content, re.DOTALL)
-
- # Open the file 'configurations2.txt' in write mode
with open('/tmp/configurations2.txt', 'w') as file:
- # Write the matches to the file
for row, match in zip(df_sorted[['Model', 'Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']].head(20).values, matches):
file.write(f'Model Name: {row[0]}\n')
file.write(f'Scores: {row[1]}\n')
@@ -185,45 +83,32 @@ def fetch_merge_configs(df):
file.write(f'TruthfulQA: {row[4]}\n')
file.write(f'Bigbench: {row[5]}\n')
file.write('yaml' + match + '```\n')
-

- # Main function to run the Streamlit app
def main():
- # Set page configuration and title
st.set_page_config(page_title="YALL - Yet Another LLM Leaderboard", layout="wide")
-
st.title("🏆 YALL - Yet Another LLM Leaderboard")
st.markdown("Leaderboard made with 🧐 [LLM AutoEval](https://github.com/mlabonne/llm-autoeval) using [Nous](https://huggingface.co/NousResearch) benchmark suite.")
-
- # Create tabs for leaderboard and about section
content = create_yall()
tab1, tab2 = st.tabs(["🏆 Leaderboard", "📝 About"])

- # Leaderboard tab
with tab1:
if content:
try:
score_columns = ['Average', 'AGIEval', 'GPT4All', 'TruthfulQA', 'Bigbench']
-
- # Display dataframe
full_df = convert_markdown_table_to_dataframe(content)

for col in score_columns:
- # Corrected use of pd.to_numeric
full_df[col] = pd.to_numeric(full_df[col].str.strip(), errors='coerce')

full_df = get_model_info(full_df)
full_df['Tags'] = full_df['Tags'].fillna('')
df = pd.DataFrame(columns=full_df.columns)

- # Toggles for filtering by tags
show_phi = st.checkbox("Phi (2.8B)", value=True)
show_mistral = st.checkbox("Mistral (7B)", value=True)
show_other = st.checkbox("Other", value=True)

- # Create a DataFrame based on selected filters
dfs_to_concat = []
-
if show_phi:
dfs_to_concat.append(full_df[full_df['Tags'].str.lower().str.contains('phi,|phi-msft,')])
if show_mistral:
@@ -232,31 +117,22 @@ def main():
other_df = full_df[~full_df['Tags'].str.lower().str.contains('phi,|phi-msft,|mistral,')]
dfs_to_concat.append(other_df)

- # Concatenate the DataFrames
if dfs_to_concat:
df = pd.concat(dfs_to_concat, ignore_index=True)

- # Add a search bar
search_query = st.text_input("Search models", "")
-
- # Filter the DataFrame based on the search query
if search_query:
df = df[df['Model'].str.contains(search_query, case=False)]

- # Add a selectbox for page selection
items_per_page = 50
pages = calculate_pages(df, items_per_page)
page = st.selectbox("Page", list(range(1, pages + 1)))

- # Sort the DataFrame by 'Average' column in descending order
df = df.sort_values(by='Average', ascending=False)
-
- # Slice the DataFrame based on the selected page
start = (page - 1) * items_per_page
end = start + items_per_page
df = df[start:end]
-
- # Display the filtered DataFrame or the entire leaderboard
+
st.dataframe(
df[['Model'] + score_columns + ['Likes', 'URL']],
use_container_width=True,
@@ -274,12 +150,29 @@ def main():
selected_models = st.multiselect('Select models to compare', df['Model'].unique())
comparison_df = df[df['Model'].isin(selected_models)]
st.dataframe(comparison_df)
- # Add a button to export data to CSV
+
+ selected_benchmarks = st.multiselect('Select benchmarks to include in the average', score_columns, default=score_columns)
+
+ if selected_benchmarks:
+ df['Filtered Average'] = df[selected_benchmarks].mean(axis=1)
+ df = df.sort_values(by='Filtered Average', ascending=False)
+ st.dataframe(
+ df[['Model'] + selected_benchmarks + ['Filtered Average', 'Likes', 'URL']],
+ use_container_width=True,
+ column_config={
+ "Likes": st.column_config.NumberColumn(
+ "Likes",
+ help="Number of likes on Hugging Face",
+ format="%d ❤️",
+ ),
+ "URL": st.column_config.LinkColumn("URL"),
+ },
+ hide_index=True,
+ height=len(df) * 37,
+ )
+
if st.button("Export to CSV"):
- # Export the DataFrame to CSV
csv_data = df.to_csv(index=False)
-
- # Create a link to download the CSV file
st.download_button(
label="Download CSV",
data=csv_data,
@@ -288,19 +181,11 @@ def main():
help="Click to download the CSV file",
)
if st.button("Fetch Merge-Configs"):
- # Call the function with the current DataFrame
- configurations, matches, csv_data = fetch_merge_configs(full_df) # Assuming full_df is your DataFrame
- # You can then display the configurations or matches as needed, or write them to a file
- # For example, displaying the configurations:
+ configurations, matches, csv_data = fetch_merge_configs(full_df)
for config in configurations:
st.text(f"Model Name: {config['Model Name']}\nScores: {config['Scores']}\nAGIEval: {config['AGIEval']}\nGPT4All: {config['GPT4All']}\nTruthfulQA: {config['TruthfulQA']}\nBigbench: {config['Bigbench']}\nModel Card: {config['Model Card']}\n\n")
-
- # Convert the list of dictionaries to a DataFrame
configurations_df = pd.DataFrame(configurations)
-
- # Convert the DataFrame to a CSV string
configurations_csv = configurations_df.to_csv(index=False)
-
st.download_button(
label="Download Configurations",
data=configurations_csv,
@@ -309,33 +194,28 @@ def main():
help="Click to download the CSV file",
)

-
- # Full-width plot for the first category
- create_bar_chart(df, score_columns[0])
+ create_bar_chart(df, 'Filtered Average')

- # Next two plots in two columns
col1, col2 = st.columns(2)
with col1:
create_bar_chart(df, score_columns[1])
with col2:
create_bar_chart(df, score_columns[2])

- # Last two plots in two columns
col3, col4 = st.columns(2)
with col3:
create_bar_chart(df, score_columns[3])
with col4:
create_bar_chart(df, score_columns[4])

- # display_highest_combined_scores(full_df) # Call to display the calculated scores
except Exception as e:
st.error("An error occurred while processing the markdown table.")
st.error(str(e))
else:
st.error("Failed to download the content from the URL provided.")
- # About tab
+
with tab2:
- st.markdown('''markdown
+ st.markdown('''
### Nous benchmark suite
Popularized by [Teknium](https://huggingface.co/teknium) and [NousResearch](https://huggingface.co/NousResearch), this benchmark suite aggregates four benchmarks:
* [**AGIEval**](https://arxiv.org/abs/2304.06364) (0-shot): `agieval_aqua_rat,agieval_logiqa_en,agieval_lsat_ar,agieval_lsat_lr,agieval_lsat_rc,agieval_sat_en,agieval_sat_en_without_passage,agieval_sat_math`
@@ -349,82 +229,7 @@ def main():
* Change the `gist_id` in [yall.py](https://huggingface.co/spaces/mlabonne/Yet_Another_LLM_Leaderboard/blob/main/yall.py#L126).
* Create "New Secret" in Settings > Variables and secrets (name: "github", value: [your GitHub token](https://github.com/settings/tokens))
A special thanks to [gblazex](https://huggingface.co/gblazex) for providing many evaluations.
-
-
- # Bonus: Workflow for Automating Model Evaluation and Selection
-
- ## Step 1. Export CSV Data from Another-LLM-LeaderBoards
- Go to our [Another-LLM-LeaderBoards](https://leaderboards.example.com) and click the export csv data button. Save it to `/tmp/models.csv`.
-
- ## Step 2: Examine CSV Data
- Run a script for extracting model names, benchmark scores, and model page link from the CSV data.
-
- ```python
- import re
- from huggingface_hub import ModelCard
- import pandas as pd
-
- # Load the CSV data
- df = pd.read_csv('/tmp/models.csv')
-
- # Sort the data by the second column (assuming the column name is 'Average')
- df_sorted = df.sort_values(by='Average', ascending=False)
-
- # Open the file in append mode
- with open('/tmp/configurations.txt', 'a') as file:
- # Get model cards for the top 20 entries
- for index, row in df_sorted.head(20).iterrows():
- model_name = row['Model'].rstrip()
- card = ModelCard.load(model_name)
- file.write(f'Model Name: {model_name}\n')
- file.write(f'Scores: {row["Average"]}\n') # Assuming 'Average' is the benchmark score
- file.write(f'AGIEval: {row["AGIEval"]}\n')
- file.write(f'GPT4All: {row["GPT4All"]}\n')
- file.write(f'TruthfulQA: {row["TruthfulQA"]}\n')
- file.write(f'Bigbench: {row["Bigbench"]}\n')
- file.write(f'Model Card: {card}\n')
- ```
-
- ## Step 3: Feed the Discovered Models, Scores and Configurations to LLM-client (shell-gpt)
- Run your local LLM-client by feeding it all the discovered merged models, their benchmark scores and if found the configurations used to merge them. Provide it with an instruction similar to this:
-
- ```bash
- cat /tmp/configurations2.txt | sgpt --chat config "Based on the merged models that are provided here, along with their respective benchmark achievements and the configurations used in merging them, your task is to come up with a new configuration for a new merged model that will outperform all others. In your thought process, argue and reflect on your own choices to improve your thinking process and outcome"
- ```
-
- ## Step 4: (Optional) Reflect on Initial Configuration Suggested by Chat-GPT
- If you wanted to get particularly naughty, you could add a step like this where you make Chat-GPT rethink and reflect on the configuration it initially comes up with based on the information you gave it.
-
- ```bash
- for i in $(seq 1 3); do echo "$i" && sgpt --chat config "Repeat the process from before and again reflect and improve on your suggested configuration"; sleep 20; done
- ```
-
- ## Step 5: Wait for Chat-GPT to give you a LeaderBoard-topping merge configuration
- Wait for Chat-GPT to provide a new merge configuration.
-
- ## Step 6: Enter the Configuration in Automergekit NoteBook
- Fire up your automergekit NoteBook and enter in the configuration that was just so generously provided to you by Chat-GPT.
-
- ## Step 7: Evaluate the New Merge using auto-llm-eval notebook
- Fire up your auto-llm-eval notebook to see if the merge that Chat-GPT came up with is actually making any sense and performing well.
-
- ## Step 8: Repeat the Process
- Repeat this process for a few times every day, learning from each new model created.
-
- ## Step 9: Rank the New Number One Model
- Rank the new number one model and top your own LeaderBoard: (Model: CultriX/MergeCeption-7B-v3)
- ![image.png](https://cdn-uploads.huggingface.co/production/uploads/6495d5a915d8ef6f01bc75eb/mFV3Ou469fk6ivj1XrD9d.png)
-
- ## Step 10: Automate the Process with Cronjob
- Create a cronjob that automates this process 5 times every day, only to then learn from the models that it has created in order to create even better ones and I'm telling you that you better prepare yourself for some non-neglectable increases in benchmark scores for the near future.
-
- Cheers,
- CultriX
''')
-
-
-

- # Run the main function if this script is run directly
if __name__ == "__main__":
- main()
+ main()
 