binwang committed · verified
Commit 62dd38d · 1 Parent(s): a4d1edd

Upload folder using huggingface_hub

Files changed (4)
  1. app/content.py +87 -75
  2. app/draw_diagram.py +1 -1
  3. app/pages.py +166 -117
  4. app/summarization.py +1 -1
app/content.py CHANGED
@@ -1,3 +1,54 @@
1
  asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
2
  'LibriSpeech-Test-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
3
  'Common-Voice-15-En-Test': 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
@@ -19,31 +70,39 @@ singlish_asr_datasets = {
19
  }
20
 
21
  sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
22
- 'DREAM-TTS-MCQ-Test': 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
23
- 'SLUE-P2-SQA5-Test': 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
24
- 'Public-SG-Speech-QA-Test': 'Public dataset for speech-based question answering, gathered from Singapore.',
25
- 'Spoken-Squad-Test': 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.'
26
  }
27
 
28
- si_datasets = {'OpenHermes-Audio-Test': 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
29
- 'ALPACA-Audio-Test': 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.'
30
  }
31
 
32
  ac_datasets = {
33
- 'WavCaps-Test': 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
34
  'AudioCaps-Test': 'AudioCaps dataset, used for generating captions from general audio events.'
35
  }
36
 
37
  asqa_datasets = {
38
- 'Clotho-AQA-Test': 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
39
- 'WavCaps-QA-Test': 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
40
  'AudioCaps-QA-Test': 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.'
41
  }
42
 
43
  er_datasets = {
44
- 'IEMOCAP-Emotion-Test': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
45
- 'MELD-Sentiment-Test': 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
46
- 'MELD-Emotion-Test': 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.'
47
  }
48
 
49
  ar_datsets = {
@@ -51,17 +110,17 @@ ar_datsets = {
51
  }
52
 
53
  gr_datasets = {
54
- 'VoxCeleb-Gender-Test': 'Test dataset for gender classification, also derived from VoxCeleb.',
55
- 'IEMOCAP-Gender-Test': 'Gender classification based on the IEMOCAP dataset.'
56
  }
57
 
58
  spt_datasets = {
59
- 'Covost2-EN-ID-test': 'Covost 2 dataset for speech translation from English to Indonesian.',
60
- 'Covost2-EN-ZH-test': 'Covost 2 dataset for speech translation from English to Chinese.',
61
- 'Covost2-EN-TA-test': 'Covost 2 dataset for speech translation from English to Tamil.',
62
- 'Covost2-ID-EN-test': 'Covost 2 dataset for speech translation from Indonesian to English.',
63
- 'Covost2-ZH-EN-test': 'Covost 2 dataset for speech translation from Chinese to English.',
64
- 'Covost2-TA-EN-test': 'Covost 2 dataset for speech translation from Tamil to English.'
65
  }
66
 
67
  cnasr_datasets = {
@@ -73,65 +132,18 @@ MUSIC_MCQ_DATASETS = {
73
  }
74
 
75
  metrics = {
76
- 'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
77
  'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
78
- 'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
79
- 'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
80
- 'bleu': 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
81
  }
82
 
83
  metrics_info = {
84
- 'wer': 'Word Error Rate (WER) - The Lower, the better.',
85
  'llama3_70b_judge_binary': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
86
- 'llama3_70b_judge': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
87
- 'meteor': 'METEOR Score. The higher, the better.',
88
- 'bleu': 'BLEU Score. The higher, the better.',
89
  }
90
 
91
-
92
- dataname_column_rename_in_table = {
93
- 'librispeech_test_clean' : 'LibriSpeech-Clean',
94
- 'librispeech_test_other' : 'LibriSpeech-Other',
95
- 'common_voice_15_en_test' : 'CommonVoice-15-EN',
96
- 'peoples_speech_test' : 'Peoples-Speech',
97
- 'gigaspeech_test' : 'GigaSpeech-1',
98
- 'earnings21_test' : 'Earnings-21',
99
- 'earnings22_test' : 'Earnings-22',
100
- 'tedlium3_test' : 'TED-LIUM-3',
101
- 'tedlium3_long_form_test' : 'TED-LIUM-3-Long',
102
- 'aishell_asr_zh_test' : 'Aishell-ASR-ZH',
103
- 'covost2_en_id_test' : 'Covost2-EN-ID',
104
- 'covost2_en_zh_test' : 'Covost2-EN-ZH',
105
- 'covost2_en_ta_test' : 'Covost2-EN-TA',
106
- 'covost2_id_en_test' : 'Covost2-ID-EN',
107
- 'covost2_zh_en_test' : 'Covost2-ZH-EN',
108
- 'covost2_ta_en_test' : 'Covost2-TA-EN',
109
- 'cn_college_listen_mcq_test': 'CN-College-Listen-MCQ',
110
- 'dream_tts_mcq_test' : 'DREAM-TTS-MCQ',
111
- 'slue_p2_sqa5_test' : 'SLUE-P2-SQA5',
112
- 'public_sg_speech_qa_test' : 'Public-SG-Speech-QA',
113
- 'spoken_squad_test' : 'Spoken-SQuAD',
114
- 'openhermes_audio_test' : 'OpenHermes-Audio',
115
- 'alpaca_audio_test' : 'ALPACA-Audio',
116
- 'wavcaps_test' : 'WavCaps',
117
- 'audiocaps_test' : 'AudioCaps',
118
- 'clotho_aqa_test' : 'Clotho-AQA',
119
- 'wavcaps_qa_test' : 'WavCaps-QA',
120
- 'audiocaps_qa_test' : 'AudioCaps-QA',
121
- 'voxceleb_accent_test' : 'VoxCeleb-Accent',
122
- 'voxceleb_gender_test' : 'VoxCeleb-Gender',
123
- 'iemocap_gender_test' : 'IEMOCAP-Gender',
124
- 'iemocap_emotion_test' : 'IEMOCAP-Emotion',
125
- 'meld_sentiment_test' : 'MELD-Sentiment',
126
- 'meld_emotion_test' : 'MELD-Emotion',
127
- 'imda_part1_asr_test' : 'IMDA-Part1-ASR',
128
- 'imda_part2_asr_test' : 'IMDA-Part2-ASR',
129
- 'imda_part3_30s_asr_test' : 'IMDA-Part3-30s-ASR',
130
- 'imda_part4_30s_asr_test' : 'IMDA-Part4-30s-ASR',
131
- 'imda_part5_30s_asr_test' : 'IMDA-Part5-30s-ASR',
132
- 'imda_part6_30s_asr_test' : 'IMDA-Part6-30s-ASR',
133
-
134
- 'muchomusic_test' : 'MuChoMusic'
135
-
136
-
137
- }
 
1
+
2
+ dataname_column_rename_in_table = {
3
+ 'librispeech_test_clean' : 'LibriSpeech-Clean',
4
+ 'librispeech_test_other' : 'LibriSpeech-Other',
5
+ 'common_voice_15_en_test' : 'CommonVoice-15-EN',
6
+ 'peoples_speech_test' : 'Peoples-Speech',
7
+ 'gigaspeech_test' : 'GigaSpeech-1',
8
+ 'earnings21_test' : 'Earnings-21',
9
+ 'earnings22_test' : 'Earnings-22',
10
+ 'tedlium3_test' : 'TED-LIUM-3',
11
+ 'tedlium3_long_form_test' : 'TED-LIUM-3-Long',
12
+ 'aishell_asr_zh_test' : 'Aishell-ASR-ZH',
13
+ 'covost2_en_id_test' : 'CoVoST2-EN-ID',
14
+ 'covost2_en_zh_test' : 'CoVoST2-EN-ZH',
15
+ 'covost2_en_ta_test' : 'CoVoST2-EN-TA',
16
+ 'covost2_id_en_test' : 'CoVoST2-ID-EN',
17
+ 'covost2_zh_en_test' : 'CoVoST2-ZH-EN',
18
+ 'covost2_ta_en_test' : 'CoVoST2-TA-EN',
19
+ 'cn_college_listen_mcq_test' : 'CN-College-Listen-MCQ',
20
+ 'dream_tts_mcq_test' : 'DREAM-TTS-MCQ',
21
+ 'slue_p2_sqa5_test' : 'SLUE-P2-SQA5',
22
+ 'public_sg_speech_qa_test' : 'Public-SG-Speech-QA',
23
+ 'spoken_squad_test' : 'Spoken-SQuAD',
24
+ 'openhermes_audio_test' : 'OpenHermes-Audio',
25
+ 'alpaca_audio_test' : 'ALPACA-Audio',
26
+ 'wavcaps_test' : 'WavCaps',
27
+ 'audiocaps_test' : 'AudioCaps',
28
+ 'clotho_aqa_test' : 'Clotho-AQA',
29
+ 'wavcaps_qa_test' : 'WavCaps-QA',
30
+ 'audiocaps_qa_test' : 'AudioCaps-QA',
31
+ 'voxceleb_accent_test' : 'VoxCeleb-Accent',
32
+ 'voxceleb_gender_test' : 'VoxCeleb-Gender',
33
+ 'iemocap_gender_test' : 'IEMOCAP-Gender',
34
+ 'iemocap_emotion_test' : 'IEMOCAP-Emotion',
35
+ 'meld_sentiment_test' : 'MELD-Sentiment',
36
+ 'meld_emotion_test' : 'MELD-Emotion',
37
+ 'imda_part1_asr_test' : 'IMDA-Part1-ASR',
38
+ 'imda_part2_asr_test' : 'IMDA-Part2-ASR',
39
+ 'imda_part3_30s_asr_test' : 'IMDA-Part3-30s-ASR',
40
+ 'imda_part4_30s_asr_test' : 'IMDA-Part4-30s-ASR',
41
+ 'imda_part5_30s_asr_test' : 'IMDA-Part5-30s-ASR',
42
+ 'imda_part6_30s_asr_test' : 'IMDA-Part6-30s-ASR',
43
+ 'muchomusic_test' : 'MuChoMusic',
44
+ 'imda_part3_30s_sqa_human_test': 'MNSC-PART3-SQA',
45
+ 'imda_part4_30s_sqa_human_test': 'MNSC-PART4-SQA',
46
+ 'imda_part5_30s_sqa_human_test': 'MNSC-PART5-SQA',
47
+ 'imda_part6_30s_sqa_human_test': 'MNSC-PART6-SQA',
48
+
49
+
50
+ }
51
+
52
  asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
53
  'LibriSpeech-Test-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
54
  'Common-Voice-15-En-Test': 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',
 
70
  }
71
 
72
  sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
73
+ 'DREAM-TTS-MCQ-Test' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
74
+ 'SLUE-P2-SQA5-Test' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
75
+ 'Public-SG-Speech-QA-Test': 'Public dataset for speech-based question answering, gathered from Singapore.',
76
+ 'Spoken-Squad-Test' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.'
77
  }
78
 
79
+ sqa_singlish_datasets = {
80
+ 'MNSC-PART3-SQA': 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
81
+ 'MNSC-PART4-SQA': 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
82
+ 'MNSC-PART5-SQA': 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
83
+ 'MNSC-PART6-SQA': 'Multitask National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
84
+ }
85
+
86
+ si_datasets = {
87
+ 'OpenHermes-Audio-Test': 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
88
+ 'ALPACA-Audio-Test' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.'
89
  }
90
 
91
  ac_datasets = {
92
+ 'WavCaps-Test' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
93
  'AudioCaps-Test': 'AudioCaps dataset, used for generating captions from general audio events.'
94
  }
95
 
96
  asqa_datasets = {
97
+ 'Clotho-AQA-Test' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
98
+ 'WavCaps-QA-Test' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
99
  'AudioCaps-QA-Test': 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.'
100
  }
101
 
102
  er_datasets = {
103
+ 'IEMOCAP-Emotion-Test': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
104
+ 'MELD-Sentiment-Test' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
105
+ 'MELD-Emotion-Test' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.'
106
  }
107
 
108
  ar_datsets = {
 
110
  }
111
 
112
  gr_datasets = {
113
+ 'VoxCeleb-Gender-Test': 'Test dataset for gender classification, also derived from VoxCeleb.',
114
+ 'IEMOCAP-Gender-Test' : 'Gender classification based on the IEMOCAP dataset.'
115
  }
116
 
117
  spt_datasets = {
118
+ 'CoVoST2-EN-ID-test': 'CoVoST 2 dataset for speech translation from English to Indonesian.',
119
+ 'CoVoST2-EN-ZH-test': 'CoVoST 2 dataset for speech translation from English to Chinese.',
120
+ 'CoVoST2-EN-TA-test': 'CoVoST 2 dataset for speech translation from English to Tamil.',
121
+ 'CoVoST2-ID-EN-test': 'CoVoST 2 dataset for speech translation from Indonesian to English.',
122
+ 'CoVoST2-ZH-EN-test': 'CoVoST 2 dataset for speech translation from Chinese to English.',
123
+ 'CoVoST2-TA-EN-test': 'CoVoST 2 dataset for speech translation from Tamil to English.'
124
  }
125
 
126
  cnasr_datasets = {
 
132
  }
133
 
134
  metrics = {
135
+ 'wer' : 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
136
  'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
137
+ 'llama3_70b_judge' : 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
138
+ 'meteor' : 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
139
+ 'bleu' : 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
140
  }
141
 
142
  metrics_info = {
143
+ 'wer' : 'Word Error Rate (WER) - The Lower, the better.',
144
'llama3_70b_judge_binary': 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
145
+ 'llama3_70b_judge' : 'Model-as-a-Judge Performance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
146
+ 'meteor' : 'METEOR Score. The higher, the better.',
147
+ 'bleu' : 'BLEU Score. The higher, the better.',
148
  }
149
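The new `dataname_column_rename_in_table` mapping pairs the internal dataset keys used in the result files with the display names shown on the leaderboard. A minimal sketch of how such a mapping can be applied with pandas; the call site is not part of this diff, so the DataFrame and the `rename` step below are illustrative only:

```python
import pandas as pd

# Subset of the mapping added in app/content.py.
dataname_column_rename_in_table = {
    'librispeech_test_clean': 'LibriSpeech-Clean',
    'covost2_en_id_test': 'CoVoST2-EN-ID',
    'imda_part3_30s_sqa_human_test': 'MNSC-PART3-SQA',
}

# Hypothetical raw results table keyed by internal dataset names.
raw_results = pd.DataFrame({
    'Model': ['model-a', 'model-b'],
    'librispeech_test_clean': [3.2, 4.1],
    'imda_part3_30s_sqa_human_test': [61.0, 55.4],
})

# Only columns present in the mapping are renamed; anything else passes through.
display_table = raw_results.rename(columns=dataname_column_rename_in_table)
print(display_table.columns.tolist())
# ['Model', 'LibriSpeech-Clean', 'MNSC-PART3-SQA']
```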
app/draw_diagram.py CHANGED
@@ -17,7 +17,7 @@ info_df = get_dataframe()
17
 
18
  def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
19
 
20
- folder = f"./results/{metrics}/"
21
 
22
  # Load the results from CSV
23
  data_path = f'{folder}/{category_name.lower()}.csv'
 
17
 
18
  def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
19
 
20
+ folder = f"./results_organized/{metrics}/"
21
 
22
  # Load the results from CSV
23
  data_path = f'{folder}/{category_name.lower()}.csv'
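The only change to `draw()` is the results folder: CSVs are now read from `./results_organized/<metric>/` instead of `./results/<metric>/`. A small sketch of the resulting path lookup, assuming the same `<metric>/<category>.csv` layout; the existence check is an illustrative addition and is not in the app code:

```python
from pathlib import Path
import pandas as pd

def load_category_results(metrics: str, category_name: str) -> pd.DataFrame:
    """Mirror of the path logic in draw(): ./results_organized/<metric>/<category>.csv."""
    folder = Path("./results_organized") / metrics
    data_path = folder / f"{category_name.lower()}.csv"
    # Illustrative guard; draw() itself reads the file directly.
    if not data_path.is_file():
        raise FileNotFoundError(f"expected results file at {data_path}")
    return pd.read_csv(data_path)

# e.g. load_category_results('wer', 'asr_english')
# reads ./results_organized/wer/asr_english.csv
```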
app/pages.py CHANGED
@@ -29,38 +29,33 @@ def dataset_contents(dataset, metrics):
29
  def dashboard():
30
 
31
  with st.container():
32
- st.title("AudioBench")
33
 
34
  st.markdown("""
35
- [gh]: https://github.com/AudioLLMs/AudioBench
36
- [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh]
37
- [![GitHub watchers](https://img.shields.io/github/watchers/AudioLLMs/AudioBench?style=social)][gh]
 
 
38
  """)
39
 
40
 
41
  st.markdown("""
42
- ### Changelog
43
-
44
- - **Dec, 2024**:
45
- - Added MuChoMusic dataset for Music Understanding - MCQ Questions. From Paper: https://arxiv.org/abs/2408.01337.
46
- - Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
47
-
48
- - **Dec, 2024**:
49
- - Updated layout and added support for comparison between models with similar sizes.
50
- - Reorganized layout for a better user experience.
51
- - Added performance summary for each task.
52
-
53
- - **Aug 2024**:
54
- - Initial leaderboard is now online.
55
  """)
56
 
57
  st.divider()
58
 
59
  st.markdown("""
60
- #### What is [AudioBench](https://arxiv.org/abs/2406.16020)?
61
 
62
  - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
63
- - AudioBench is a evaluation benchmark that we consistently put effort in updating and maintaining.
64
 
65
Below are the initial 26 datasets that are included in AudioBench. We have now extended to over 40 datasets and will continue to add more in the future.
66
  """
@@ -68,27 +63,19 @@ def dashboard():
68
 
69
 
70
  with st.container():
71
- left_co, center_co, right_co = st.columns([1, 0.5, 0.5])
72
- with left_co:
73
- st.image("./style/audio_overview.png",
74
- caption="Overview of the datasets in AudioBench.",
75
- )
76
 
77
  st.markdown('''
78
-
79
-
80
  ''')
81
 
82
  st.markdown("###### :dart: Our Benchmark includes: ")
83
- cols = st.columns(10)
84
  cols[0].metric(label="Tasks", value=">8")
85
  cols[1].metric(label="Datasets", value=">40")
86
  cols[2].metric(label="Evaluated Models", value=">5")
87
-
88
-
89
  st.divider()
90
  with st.container():
91
- left_co, center_co, right_co = st.columns([1, 0.5, 0.5])
92
 
93
  with left_co:
94
  st.markdown("""
@@ -104,8 +91,10 @@ def dashboard():
104
  """)
105
 
106
 
107
- def asr():
108
- st.title("Task: Automatic Speech Recognition")
 
 
109
 
110
  sum = ['Overall']
111
  dataset_lists = [
@@ -122,20 +111,23 @@ def asr():
122
 
123
  filters_levelone = sum + dataset_lists
124
 
125
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
126
 
127
  with left:
128
  filter_1 = st.selectbox('Dataset', filters_levelone)
129
 
130
  if filter_1:
131
  if filter_1 in sum:
132
- sum_table_mulit_metrix('ASR', ['wer'])
133
  else:
134
  dataset_contents(asr_datsets[filter_1], metrics['wer'])
135
- draw('su', 'ASR', filter_1, 'wer', cus_sort=True)
 
136
 
137
 
138
- def singlish_asr():
 
 
139
  st.title("Task: Automatic Speech Recognition - Singlish")
140
 
141
  sum = ['Overall']
@@ -150,20 +142,22 @@ def singlish_asr():
150
 
151
  filters_levelone = sum + dataset_lists
152
 
153
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
154
 
155
  with left:
156
  filter_1 = st.selectbox('Dataset', filters_levelone)
157
 
158
  if filter_1:
159
  if filter_1 in sum:
160
- sum_table_mulit_metrix('singlish_asr', ['wer'])
161
  else:
162
  dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
163
- draw('su', 'singlish_asr', filter_1, 'wer')
 
164
 
165
 
166
- def cnasr():
 
167
  st.title("Task: Automatic Speech Recognition - Mandarin")
168
 
169
  sum = ['Overall']
@@ -173,80 +167,151 @@ def cnasr():
173
 
174
  filters_levelone = sum + dataset_lists
175
 
176
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
177
 
178
  with left:
179
  filter_1 = st.selectbox('Dataset', filters_levelone)
180
 
181
  if filter_1:
182
  if filter_1 in sum:
183
- sum_table_mulit_metrix('CNASR', ['wer'])
184
  else:
185
  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
186
- draw('su', 'CNASR', filter_1, 'wer')
187
 
188
 
189
 
190
- def sqa():
191
- st.title("Task: Speech Question Answering")
 
192
 
193
  sum = ['Overall']
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
 
195
- binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
196
 
197
- rest = ['SLUE-P2-SQA5-Test',
198
- 'Public-SG-Speech-QA-Test',
199
- 'Spoken-Squad-Test']
200
 
201
- filters_levelone = sum + binary + rest
 
 
202
 
203
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
 
 
 
 
 
 
 
 
 
 
 
 
204
 
205
  with left:
206
  filter_1 = st.selectbox('Dataset', filters_levelone)
207
 
208
  if filter_1:
209
  if filter_1 in sum:
210
- sum_table_mulit_metrix('SQA', ['llama3_70b_judge_binary', 'llama3_70b_judge'])
 
 
 
 
 
 
 
 
 
 
 
211
 
212
- elif filter_1 in binary:
213
- dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge_binary'])
214
- draw('su', 'SQA', filter_1, 'llama3_70b_judge_binary')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
  else:
217
  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
218
- draw('su', 'SQA', filter_1, 'llama3_70b_judge')
 
 
219
 
220
- def si():
 
221
  st.title("Task: Speech Instruction")
222
 
223
  sum = ['Overall']
224
 
225
  dataset_lists = ['OpenHermes-Audio-Test',
226
- 'ALPACA-Audio-Test']
 
227
 
228
  filters_levelone = sum + dataset_lists
229
 
230
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
231
 
232
  with left:
233
  filter_1 = st.selectbox('Dataset', filters_levelone)
234
 
235
  if filter_1:
236
  if filter_1 in sum:
237
- sum_table_mulit_metrix('SI', ['llama3_70b_judge'])
238
  else:
239
  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
240
- draw('su', 'SI', filter_1, 'llama3_70b_judge')
 
 
241
 
242
- def ac():
 
243
  st.title("Task: Audio Captioning")
244
 
245
  filters_levelone = ['WavCaps-Test',
246
- 'AudioCaps-Test']
 
247
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
248
 
249
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
250
 
251
  with left:
252
  filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -255,10 +320,12 @@ def ac():
255
 
256
  if filter_1 or metric:
257
  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
258
- draw('asu', 'AC',filter_1, metric.lower().replace('-', '_'))
 
 
259
 
260
 
261
- def asqa():
262
  st.title("Task: Audio Scene Question Answering")
263
 
264
  sum = ['Overall']
@@ -269,44 +336,50 @@ def asqa():
269
 
270
  filters_levelone = sum + dataset_lists
271
 
272
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
273
 
274
  with left:
275
  filter_1 = st.selectbox('Dataset', filters_levelone)
276
 
277
  if filter_1:
278
  if filter_1 in sum:
279
- sum_table_mulit_metrix('AQA', ['llama3_70b_judge'])
280
  else:
281
  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
282
- draw('asu', 'AQA', filter_1, 'llama3_70b_judge')
 
 
283
 
284
 
285
- def er():
286
  st.title("Task: Emotion Recognition")
287
 
288
  sum = ['Overall']
289
 
290
- dataset_lists = ['IEMOCAP-Emotion-Test',
291
- 'MELD-Sentiment-Test',
292
- 'MELD-Emotion-Test']
 
 
293
 
294
  filters_levelone = sum + dataset_lists
295
 
296
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
297
 
298
  with left:
299
  filter_1 = st.selectbox('Dataset', filters_levelone)
300
 
301
  if filter_1:
302
  if filter_1 in sum:
303
- sum_table_mulit_metrix('ER', ['llama3_70b_judge_binary'])
304
  else:
305
- dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge_binary'])
306
- draw('vu', 'ER', filter_1, 'llama3_70b_judge_binary')
 
307
 
308
 
309
- def ar():
 
310
  st.title("Task: Accent Recognition")
311
 
312
  sum = ['Overall']
@@ -315,7 +388,7 @@ def ar():
315
 
316
  filters_levelone = sum + dataset_lists
317
 
318
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
319
 
320
  with left:
321
  filter_1 = st.selectbox('Dataset', filters_levelone)
@@ -323,14 +396,15 @@ def ar():
323
 
324
  if filter_1:
325
  if filter_1 in sum:
326
- sum_table_mulit_metrix('AR', ['llama3_70b_judge'])
327
- # sum_table('aR', 'llama3_70b_judge')
328
  else:
329
  dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
330
- draw('vu', 'AR', filter_1, 'llama3_70b_judge')
 
 
331
 
332
 
333
- def gr():
334
  st.title("Task: Gender Recognition")
335
 
336
  sum = ['Overall']
@@ -340,47 +414,22 @@ def gr():
340
 
341
  filters_levelone = sum + dataset_lists
342
 
343
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
344
 
345
  with left:
346
  filter_1 = st.selectbox('Dataset', filters_levelone)
347
 
348
  if filter_1:
349
  if filter_1 in sum:
350
- sum_table_mulit_metrix('GR', ['llama3_70b_judge_binary'])
351
  else:
352
- dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge_binary'])
353
- draw('vu', 'GR', filter_1, 'llama3_70b_judge_binary')
354
 
355
 
356
- def spt():
357
- st.title("Task: Speech Translation")
358
-
359
- sum = ['Overall']
360
- dataset_lists = [
361
- 'Covost2-EN-ID-test',
362
- 'Covost2-EN-ZH-test',
363
- 'Covost2-EN-TA-test',
364
- 'Covost2-ID-EN-test',
365
- 'Covost2-ZH-EN-test',
366
- 'Covost2-TA-EN-test']
367
-
368
- filters_levelone = sum + dataset_lists
369
-
370
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
371
-
372
- with left:
373
- filter_1 = st.selectbox('Dataset', filters_levelone)
374
-
375
- if filter_1:
376
- if filter_1 in sum:
377
- sum_table_mulit_metrix('st', ['bleu'])
378
- else:
379
- dataset_contents(spt_datasets[filter_1], metrics['bleu'])
380
- draw('su', 'ST', filter_1, 'bleu')
381
 
382
 
383
- def music_mcq():
384
  st.title("Task: Music Understanding - MCQ Questions")
385
 
386
  sum = ['Overall']
@@ -390,17 +439,17 @@ def music_mcq():
390
 
391
  filters_levelone = sum + dataset_lists
392
 
393
- left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
394
 
395
  with left:
396
  filter_1 = st.selectbox('Dataset', filters_levelone)
397
 
398
  if filter_1:
399
  if filter_1 in sum:
400
- sum_table_mulit_metrix('music_mcq', ['llama3_70b_judge_binary'])
401
  else:
402
- dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['llama3_70b_judge_binary'])
403
- draw('vu', 'music_mcq', filter_1, 'llama3_70b_judge_binary')
404
 
405
 
406
 
 
29
  def dashboard():
30
 
31
  with st.container():
32
+ st.title("Leaderboard for AudioBench")
33
 
34
  st.markdown("""
35
+ [gh1]: https://github.com/AudioLLMs/AudioBench
36
+ [gh2]: https://github.com/AudioLLMs/AudioBench
37
+ **Toolkit:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh1] |
38
+ [**Research Paper**](https://arxiv.org/abs/2406.16020) |
39
+ **Resource for AudioLLMs:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/Awesome-Audio-LLM?style=social)][gh2]
40
  """)
41
 
42
 
43
  st.markdown("""
44
+ #### Recent updates
45
+ - **Jan. 2025**: Update the layout.
46
+ - **Dec. 2024**: Added MuChoMusic dataset for Music Understanding - MCQ Questions. From Paper: https://arxiv.org/abs/2408.01337.
47
+ - **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
48
+ - **Dec. 2024**: Updated layout and added support for comparison between models with similar sizes. 1) Reorganized layout for a better user experience. 2) Added performance summary for each task.
49
+ - **Aug. 2024**: Initial leaderboard is now online.
 
 
 
 
 
 
 
50
  """)
51
 
52
  st.divider()
53
 
54
  st.markdown("""
55
+ #### Evaluating Audio-based Large Language Models
56
 
57
  - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
58
+ - AudioBench is an evaluation benchmark that we continually improve and maintain.
59
 
60
  Below are the initial 26 datasets that are included in AudioBench. We are now exteneded to over 40 datasets and going to extend to more in the future.
61
  """
 
63
 
64
 
65
  with st.container():
 
 
 
 
 
66
 
67
  st.markdown('''
 
 
68
  ''')
69
 
70
  st.markdown("###### :dart: Our Benchmark includes: ")
71
+ cols = st.columns(8)
72
  cols[0].metric(label="Tasks", value=">8")
73
  cols[1].metric(label="Datasets", value=">40")
74
  cols[2].metric(label="Evaluated Models", value=">5")
75
+
 
76
  st.divider()
77
  with st.container():
78
+ left_co, right_co = st.columns([1, 0.7])
79
 
80
  with left_co:
81
  st.markdown("""
 
91
  """)
92
 
93
 
94
+
95
+
96
+ def asr_english():
97
+ st.title("Task: Automatic Speech Recognition - English")
98
 
99
  sum = ['Overall']
100
  dataset_lists = [
 
111
 
112
  filters_levelone = sum + dataset_lists
113
 
114
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
115
 
116
  with left:
117
  filter_1 = st.selectbox('Dataset', filters_levelone)
118
 
119
  if filter_1:
120
  if filter_1 in sum:
121
+ sum_table_mulit_metrix('asr_english', ['wer'])
122
  else:
123
  dataset_contents(asr_datsets[filter_1], metrics['wer'])
124
+ draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
125
+
126
 
127
 
128
+
129
+
130
+ def asr_singlish():
131
  st.title("Task: Automatic Speech Recognition - Singlish")
132
 
133
  sum = ['Overall']
 
142
 
143
  filters_levelone = sum + dataset_lists
144
 
145
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
146
 
147
  with left:
148
  filter_1 = st.selectbox('Dataset', filters_levelone)
149
 
150
  if filter_1:
151
  if filter_1 in sum:
152
+ sum_table_mulit_metrix('asr_singlish', ['wer'])
153
  else:
154
  dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
155
+ draw('su', 'asr_singlish', filter_1, 'wer')
156
+
157
 
158
 
159
+
160
+ def asr_mandarin():
161
  st.title("Task: Automatic Speech Recognition - Mandarin")
162
 
163
  sum = ['Overall']
 
167
 
168
  filters_levelone = sum + dataset_lists
169
 
170
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
171
 
172
  with left:
173
  filter_1 = st.selectbox('Dataset', filters_levelone)
174
 
175
  if filter_1:
176
  if filter_1 in sum:
177
+ sum_table_mulit_metrix('asr_mandarin', ['wer'])
178
  else:
179
  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
180
+ draw('su', 'asr_mandarin', filter_1, 'wer')
181
 
182
 
183
 
184
+
185
+ def speech_translation():
186
+ st.title("Task: Speech Translation")
187
 
188
  sum = ['Overall']
189
+ dataset_lists = [
190
+ 'CoVoST2-EN-ID-test',
191
+ 'CoVoST2-EN-ZH-test',
192
+ 'CoVoST2-EN-TA-test',
193
+ 'CoVoST2-ID-EN-test',
194
+ 'CoVoST2-ZH-EN-test',
195
+ 'CoVoST2-TA-EN-test']
196
+
197
+ filters_levelone = sum + dataset_lists
198
+
199
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
200
+
201
+ with left:
202
+ filter_1 = st.selectbox('Dataset', filters_levelone)
203
+
204
+ if filter_1:
205
+ if filter_1 in sum:
206
+ sum_table_mulit_metrix('st', ['bleu'])
207
+ else:
208
+ dataset_contents(spt_datasets[filter_1], metrics['bleu'])
209
+ draw('su', 'ST', filter_1, 'bleu')
210
 
 
211
 
 
 
 
212
 
213
+
214
+ def speech_question_answering_english():
215
+ st.title("Task: Spoken Question Answering - English")
216
 
217
+ sum = ['Overall']
218
+
219
+ dataset_lists = [
220
+ 'CN-College-Listen-MCQ-Test',
221
+ 'DREAM-TTS-MCQ-Test',
222
+ 'SLUE-P2-SQA5-Test',
223
+ 'Public-SG-Speech-QA-Test',
224
+ 'Spoken-Squad-Test',
225
+ ]
226
+
227
+ filters_levelone = sum + dataset_lists
228
+
229
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
230
 
231
  with left:
232
  filter_1 = st.selectbox('Dataset', filters_levelone)
233
 
234
  if filter_1:
235
  if filter_1 in sum:
236
+ sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
237
+
238
+ #elif filter_1 in dataset_lists:
239
+ # dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
240
+ # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
241
+
242
+ else:
243
+ dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
244
+ draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
245
+
246
+
247
+
248
 
249
+ def speech_question_answering_singlish():
250
+ st.title("Task: Spoken Question Answering - Singlish")
251
+
252
+ sum = ['Overall']
253
+
254
+ dataset_lists = [
255
+ 'MNSC-PART3-SQA',
256
+ 'MNSC-PART4-SQA',
257
+ 'MNSC-PART5-SQA',
258
+ 'MNSC-PART6-SQA',
259
+ ]
260
+
261
+
262
+ filters_levelone = sum + dataset_lists
263
+
264
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
265
+
266
+ with left:
267
+ filter_1 = st.selectbox('Dataset', filters_levelone)
268
+
269
+ if filter_1:
270
+ if filter_1 in sum:
271
+ sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
272
 
273
  else:
274
  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
275
+ draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
276
+
277
+
278
 
279
+
280
+ def speech_instruction():
281
  st.title("Task: Speech Instruction")
282
 
283
  sum = ['Overall']
284
 
285
  dataset_lists = ['OpenHermes-Audio-Test',
286
+ 'ALPACA-Audio-Test',
287
+ ]
288
 
289
  filters_levelone = sum + dataset_lists
290
 
291
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
292
 
293
  with left:
294
  filter_1 = st.selectbox('Dataset', filters_levelone)
295
 
296
  if filter_1:
297
  if filter_1 in sum:
298
+ sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
299
  else:
300
  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
301
+ draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
302
+
303
+
304
 
305
+
306
+ def audio_captioning():
307
  st.title("Task: Audio Captioning")
308
 
309
  filters_levelone = ['WavCaps-Test',
310
+ 'AudioCaps-Test',
311
+ ]
312
  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
313
 
314
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
315
 
316
  with left:
317
  filter_1 = st.selectbox('Dataset', filters_levelone)
 
320
 
321
  if filter_1 or metric:
322
  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
323
+ draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
324
+
325
+
326
 
327
 
328
+ def audio_scene_question_answering():
329
  st.title("Task: Audio Scene Question Answering")
330
 
331
  sum = ['Overall']
 
336
 
337
  filters_levelone = sum + dataset_lists
338
 
339
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
340
 
341
  with left:
342
  filter_1 = st.selectbox('Dataset', filters_levelone)
343
 
344
  if filter_1:
345
  if filter_1 in sum:
346
+ sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
347
  else:
348
  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
349
+ draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
350
+
351
+
352
 
353
 
354
+ def emotion_recognition():
355
  st.title("Task: Emotion Recognition")
356
 
357
  sum = ['Overall']
358
 
359
+ dataset_lists = [
360
+ 'IEMOCAP-Emotion-Test',
361
+ 'MELD-Sentiment-Test',
362
+ 'MELD-Emotion-Test',
363
+ ]
364
 
365
  filters_levelone = sum + dataset_lists
366
 
367
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
368
 
369
  with left:
370
  filter_1 = st.selectbox('Dataset', filters_levelone)
371
 
372
  if filter_1:
373
  if filter_1 in sum:
374
+ sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
375
  else:
376
+ dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge'])
377
+ draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
378
+
379
 
380
 
381
+
382
+ def accent_recognition():
383
  st.title("Task: Accent Recognition")
384
 
385
  sum = ['Overall']
 
388
 
389
  filters_levelone = sum + dataset_lists
390
 
391
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
392
 
393
  with left:
394
  filter_1 = st.selectbox('Dataset', filters_levelone)
 
396
 
397
  if filter_1:
398
  if filter_1 in sum:
399
+ sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
 
400
  else:
401
  dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
402
+ draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
403
+
404
+
405
 
406
 
407
+ def gender_recognition():
408
  st.title("Task: Gender Recognition")
409
 
410
  sum = ['Overall']
 
414
 
415
  filters_levelone = sum + dataset_lists
416
 
417
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
418
 
419
  with left:
420
  filter_1 = st.selectbox('Dataset', filters_levelone)
421
 
422
  if filter_1:
423
  if filter_1 in sum:
424
+ sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
425
  else:
426
+ dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge'])
427
+ draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
428
 
429
430
 
431
 
432
+ def music_understanding():
433
  st.title("Task: Music Understanding - MCQ Questions")
434
 
435
  sum = ['Overall']
 
439
 
440
  filters_levelone = sum + dataset_lists
441
 
442
+ left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
443
 
444
  with left:
445
  filter_1 = st.selectbox('Dataset', filters_levelone)
446
 
447
  if filter_1:
448
  if filter_1 in sum:
449
+ sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
450
  else:
451
+ dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['llama3_70b_judge'])
452
+ draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
453
 
454
 
455
 
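The renamed task pages above (`asr_english`, `asr_singlish`, `speech_question_answering_singlish`, and so on) all repeat the same Streamlit layout: a title, an 'Overall' entry plus the dataset list, a selectbox in the left column, then either the task-level summary table or a per-dataset chart. A hedged sketch of that shared pattern as a single helper; `render_task_page` is a hypothetical name, and it assumes `sum_table_mulit_metrix`, `dataset_contents`, `draw`, and the `metrics` dict are in scope, as they are inside app/pages.py:

```python
import streamlit as st

def render_task_page(title, task_name, dataset_infos, metric, chart_group='su'):
    """Sketch of the layout shared by the task pages in app/pages.py.

    Assumes sum_table_mulit_metrix, dataset_contents, draw and the metrics
    dict are available, as they are inside app/pages.py.
    """
    st.title(title)

    overall = ['Overall']
    filters_levelone = overall + list(dataset_infos.keys())

    # Same five-way split the new pages use; only the first column holds the selector.
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with left:
        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1:
        if filter_1 in overall:
            # Task-level summary across all datasets for this metric.
            sum_table_mulit_metrix(task_name, [metric])
        else:
            # Per-dataset description plus chart.
            dataset_contents(dataset_infos[filter_1], metrics[metric])
            draw(chart_group, task_name, filter_1, metric)

# e.g. render_task_page('Task: Speech Instruction', 'speech_instruction',
#                       si_datasets, 'llama3_70b_judge')
```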
app/summarization.py CHANGED
@@ -21,7 +21,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
21
  # combine chart data from multiple sources
22
  chart_data = pd.DataFrame()
23
  for metrics in metrics_lists:
24
- folder = f"./results/{metrics}"
25
  data_path = f'{folder}/{task_name.lower()}.csv'
26
  one_chart_data = pd.read_csv(data_path).round(3)
27
  if len(chart_data) == 0:
 
21
  # combine chart data from multiple sources
22
  chart_data = pd.DataFrame()
23
  for metrics in metrics_lists:
24
+ folder = f"./results_organized/{metrics}"
25
  data_path = f'{folder}/{task_name.lower()}.csv'
26
  one_chart_data = pd.read_csv(data_path).round(3)
27
  if len(chart_data) == 0:
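
app/summarization.py receives the same folder rename as app/draw_diagram.py. For context, a sketch of the loop around the changed line in `sum_table_mulit_metrix`, which reads one CSV per metric and stacks them into a single table; joining on a 'Model' column is an assumption, since only the path and rounding lines are visible in this diff:

```python
import pandas as pd

def combine_metric_tables(task_name, metrics_lists):
    """Sketch of the per-metric CSV loop in sum_table_mulit_metrix.

    Reads ./results_organized/<metric>/<task>.csv for every metric and joins
    the frames. Joining on 'Model' is assumed; it is not shown in the diff.
    """
    chart_data = pd.DataFrame()
    for metric in metrics_lists:
        folder = f"./results_organized/{metric}"
        data_path = f"{folder}/{task_name.lower()}.csv"
        one_chart_data = pd.read_csv(data_path).round(3)
        if len(chart_data) == 0:
            chart_data = one_chart_data
        else:
            chart_data = chart_data.merge(
                one_chart_data, on='Model', suffixes=('', f'_{metric}')
            )
    return chart_data

# e.g. combine_metric_tables('sqa_english', ['llama3_70b_judge'])
```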