Upload folder using huggingface_hub

- app/content.py +87 -75
- app/draw_diagram.py +1 -1
- app/pages.py +166 -117
- app/summarization.py +1 -1
app/content.py
CHANGED
@@ -1,3 +1,54 @@
1    asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
2    'LibriSpeech-Test-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
3    'Common-Voice-15-En-Test': 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',

@@ -19,31 +70,39 @@ singlish_asr_datasets = {
19   }
20
21   sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
22 - 'DREAM-TTS-MCQ-Test': 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
23 - 'SLUE-P2-SQA5-Test': 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
24 - 'Public-SG-Speech-QA-Test': 'Public dataset for speech-based question answering, gathered from Singapore.',
25 - 'Spoken-Squad-Test': 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.'
26   }
27
28 -
29 -
30   }
31
32   ac_datasets = {
33 - 'WavCaps-Test': 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
34   'AudioCaps-Test': 'AudioCaps dataset, used for generating captions from general audio events.'
35   }
36
37   asqa_datasets = {
38 - 'Clotho-AQA-Test': 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
39 - 'WavCaps-QA-Test': 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
40   'AudioCaps-QA-Test': 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.'
41   }
42
43   er_datasets = {
44 - 'IEMOCAP-Emotion-Test': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
45 - 'MELD-Sentiment-Test': 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
46 - 'MELD-Emotion-Test': 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.'
47   }
48
49   ar_datsets = {

@@ -51,17 +110,17 @@ ar_datsets = {
51   }
52
53   gr_datasets = {
54 - 'VoxCeleb-Gender-Test': 'Test dataset for gender classification, also derived from VoxCeleb.',
55 - 'IEMOCAP-Gender-Test': 'Gender classification based on the IEMOCAP dataset.'
56   }
57
58   spt_datasets = {
59 - '
60 - '
61 - '
62 - '
63 - '
64 - '
65   }
66
67   cnasr_datasets = {

@@ -73,65 +132,18 @@ MUSIC_MCQ_DATASETS = {
73   }
74
75   metrics = {
76 - 'wer': 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
77   'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
78 - 'llama3_70b_judge': 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
79 - 'meteor': 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
80 - 'bleu': 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
81   }
82
83   metrics_info = {
84 - 'wer': 'Word Error Rate (WER) - The Lower, the better.',
85   'llama3_70b_judge_binary': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
86 - 'llama3_70b_judge': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
87 - 'meteor': 'METEOR Score. The higher, the better.',
88 - 'bleu': 'BLEU Score. The higher, the better.',
89   }
90
91 -
92 - dataname_column_rename_in_table = {
93 - 'librispeech_test_clean' : 'LibriSpeech-Clean',
94 - 'librispeech_test_other' : 'LibriSpeech-Other',
95 - 'common_voice_15_en_test' : 'CommonVoice-15-EN',
96 - 'peoples_speech_test' : 'Peoples-Speech',
97 - 'gigaspeech_test' : 'GigaSpeech-1',
98 - 'earnings21_test' : 'Earnings-21',
99 - 'earnings22_test' : 'Earnings-22',
100 - 'tedlium3_test' : 'TED-LIUM-3',
101 - 'tedlium3_long_form_test' : 'TED-LIUM-3-Long',
102 - 'aishell_asr_zh_test' : 'Aishell-ASR-ZH',
103 - 'covost2_en_id_test' : 'Covost2-EN-ID',
104 - 'covost2_en_zh_test' : 'Covost2-EN-ZH',
105 - 'covost2_en_ta_test' : 'Covost2-EN-TA',
106 - 'covost2_id_en_test' : 'Covost2-ID-EN',
107 - 'covost2_zh_en_test' : 'Covost2-ZH-EN',
108 - 'covost2_ta_en_test' : 'Covost2-TA-EN',
109 - 'cn_college_listen_mcq_test': 'CN-College-Listen-MCQ',
110 - 'dream_tts_mcq_test' : 'DREAM-TTS-MCQ',
111 - 'slue_p2_sqa5_test' : 'SLUE-P2-SQA5',
112 - 'public_sg_speech_qa_test' : 'Public-SG-Speech-QA',
113 - 'spoken_squad_test' : 'Spoken-SQuAD',
114 - 'openhermes_audio_test' : 'OpenHermes-Audio',
115 - 'alpaca_audio_test' : 'ALPACA-Audio',
116 - 'wavcaps_test' : 'WavCaps',
117 - 'audiocaps_test' : 'AudioCaps',
118 - 'clotho_aqa_test' : 'Clotho-AQA',
119 - 'wavcaps_qa_test' : 'WavCaps-QA',
120 - 'audiocaps_qa_test' : 'AudioCaps-QA',
121 - 'voxceleb_accent_test' : 'VoxCeleb-Accent',
122 - 'voxceleb_gender_test' : 'VoxCeleb-Gender',
123 - 'iemocap_gender_test' : 'IEMOCAP-Gender',
124 - 'iemocap_emotion_test' : 'IEMOCAP-Emotion',
125 - 'meld_sentiment_test' : 'MELD-Sentiment',
126 - 'meld_emotion_test' : 'MELD-Emotion',
127 - 'imda_part1_asr_test' : 'IMDA-Part1-ASR',
128 - 'imda_part2_asr_test' : 'IMDA-Part2-ASR',
129 - 'imda_part3_30s_asr_test' : 'IMDA-Part3-30s-ASR',
130 - 'imda_part4_30s_asr_test' : 'IMDA-Part4-30s-ASR',
131 - 'imda_part5_30s_asr_test' : 'IMDA-Part5-30s-ASR',
132 - 'imda_part6_30s_asr_test' : 'IMDA-Part6-30s-ASR',
133 -
134 - 'muchomusic_test' : 'MuChoMusic'
135 -
136 -
137 - }

1 +
2 + dataname_column_rename_in_table = {
3 + 'librispeech_test_clean' : 'LibriSpeech-Clean',
4 + 'librispeech_test_other' : 'LibriSpeech-Other',
5 + 'common_voice_15_en_test' : 'CommonVoice-15-EN',
6 + 'peoples_speech_test' : 'Peoples-Speech',
7 + 'gigaspeech_test' : 'GigaSpeech-1',
8 + 'earnings21_test' : 'Earnings-21',
9 + 'earnings22_test' : 'Earnings-22',
10 + 'tedlium3_test' : 'TED-LIUM-3',
11 + 'tedlium3_long_form_test' : 'TED-LIUM-3-Long',
12 + 'aishell_asr_zh_test' : 'Aishell-ASR-ZH',
13 + 'covost2_en_id_test' : 'CoVoST2-EN-ID',
14 + 'covost2_en_zh_test' : 'CoVoST2-EN-ZH',
15 + 'covost2_en_ta_test' : 'CoVoST2-EN-TA',
16 + 'covost2_id_en_test' : 'CoVoST2-ID-EN',
17 + 'covost2_zh_en_test' : 'CoVoST2-ZH-EN',
18 + 'covost2_ta_en_test' : 'CoVoST2-TA-EN',
19 + 'cn_college_listen_mcq_test' : 'CN-College-Listen-MCQ',
20 + 'dream_tts_mcq_test' : 'DREAM-TTS-MCQ',
21 + 'slue_p2_sqa5_test' : 'SLUE-P2-SQA5',
22 + 'public_sg_speech_qa_test' : 'Public-SG-Speech-QA',
23 + 'spoken_squad_test' : 'Spoken-SQuAD',
24 + 'openhermes_audio_test' : 'OpenHermes-Audio',
25 + 'alpaca_audio_test' : 'ALPACA-Audio',
26 + 'wavcaps_test' : 'WavCaps',
27 + 'audiocaps_test' : 'AudioCaps',
28 + 'clotho_aqa_test' : 'Clotho-AQA',
29 + 'wavcaps_qa_test' : 'WavCaps-QA',
30 + 'audiocaps_qa_test' : 'AudioCaps-QA',
31 + 'voxceleb_accent_test' : 'VoxCeleb-Accent',
32 + 'voxceleb_gender_test' : 'VoxCeleb-Gender',
33 + 'iemocap_gender_test' : 'IEMOCAP-Gender',
34 + 'iemocap_emotion_test' : 'IEMOCAP-Emotion',
35 + 'meld_sentiment_test' : 'MELD-Sentiment',
36 + 'meld_emotion_test' : 'MELD-Emotion',
37 + 'imda_part1_asr_test' : 'IMDA-Part1-ASR',
38 + 'imda_part2_asr_test' : 'IMDA-Part2-ASR',
39 + 'imda_part3_30s_asr_test' : 'IMDA-Part3-30s-ASR',
40 + 'imda_part4_30s_asr_test' : 'IMDA-Part4-30s-ASR',
41 + 'imda_part5_30s_asr_test' : 'IMDA-Part5-30s-ASR',
42 + 'imda_part6_30s_asr_test' : 'IMDA-Part6-30s-ASR',
43 + 'muchomusic_test' : 'MuChoMusic',
44 + 'imda_part3_30s_sqa_human_test': 'MNSC-PART3-SQA',
45 + 'imda_part4_30s_sqa_human_test': 'MNSC-PART4-SQA',
46 + 'imda_part5_30s_sqa_human_test': 'MNSC-PART5-SQA',
47 + 'imda_part6_30s_sqa_human_test': 'MNSC-PART6-SQA',
48 +
49 +
50 + }
51 +
52   asr_datsets = {'LibriSpeech-Test-Clean': 'A clean, high-quality testset of the LibriSpeech dataset, used for ASR testing.',
53   'LibriSpeech-Test-Other' : 'A more challenging, noisier testset of the LibriSpeech dataset for ASR testing.',
54   'Common-Voice-15-En-Test': 'Test set from the Common Voice project, which is a crowd-sourced, multilingual speech dataset.',

70   }
71
72   sqa_datasets = {'CN-College-Listen-MCQ-Test': 'Chinese College English Listening Test, with multiple-choice questions.',
73 + 'DREAM-TTS-MCQ-Test' : 'DREAM dataset for spoken question-answering, derived from textual data and synthesized speech.',
74 + 'SLUE-P2-SQA5-Test' : 'Spoken Language Understanding Evaluation (SLUE) dataset, part 2, focused on QA tasks.',
75 + 'Public-SG-Speech-QA-Test': 'Public dataset for speech-based question answering, gathered from Singapore.',
76 + 'Spoken-Squad-Test' : 'Spoken SQuAD dataset, based on the textual SQuAD dataset, converted into audio.'
77   }
78
79 + sqa_singlish_datasets = {
80 + 'MNSC-PART3-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 3.',
81 + 'MNSC-PART4-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 4.',
82 + 'MNSC-PART5-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 5.',
83 + 'MNSC-PART6-SQA': 'Multitak National Speech Corpus (MNSC) dataset, Question answering task, Part 6.',
84 + }
85 +
86 + si_datasets = {
87 + 'OpenHermes-Audio-Test': 'Test set for spoken instructions. Synthesized from the OpenHermes dataset.',
88 + 'ALPACA-Audio-Test' : 'Spoken version of the ALPACA dataset, used for evaluating instruction following in audio.'
89   }
90
91   ac_datasets = {
92 + 'WavCaps-Test' : 'WavCaps is a dataset for testing audio captioning, where models generate textual descriptions of audio clips.',
93   'AudioCaps-Test': 'AudioCaps dataset, used for generating captions from general audio events.'
94   }
95
96   asqa_datasets = {
97 + 'Clotho-AQA-Test' : 'Clotho dataset adapted for audio-based question answering, containing audio clips and questions.',
98 + 'WavCaps-QA-Test' : 'Question-answering test dataset derived from WavCaps, focusing on audio content.',
99   'AudioCaps-QA-Test': 'AudioCaps adapted for question-answering tasks, using audio events as input for Q&A.'
100  }
101
102  er_datasets = {
103 + 'IEMOCAP-Emotion-Test': 'Emotion recognition test data from the IEMOCAP dataset, focusing on identifying emotions in speech.',
104 + 'MELD-Sentiment-Test' : 'Sentiment recognition from speech using the MELD dataset, classifying positive, negative, or neutral sentiments.',
105 + 'MELD-Emotion-Test' : 'Emotion classification in speech using MELD, detecting specific emotions like happiness, anger, etc.'
106  }
107
108  ar_datsets = {

110  }
111
112  gr_datasets = {
113 + 'VoxCeleb-Gender-Test': 'Test dataset for gender classification, also derived from VoxCeleb.',
114 + 'IEMOCAP-Gender-Test' : 'Gender classification based on the IEMOCAP dataset.'
115  }
116
117  spt_datasets = {
118 + 'CoVoST2-EN-ID-test': 'CoVoST 2 dataset for speech translation from English to Indonesian.',
119 + 'CoVoST2-EN-ZH-test': 'CoVoST 2 dataset for speech translation from English to Chinese.',
120 + 'CoVoST2-EN-TA-test': 'CoVoST 2 dataset for speech translation from English to Tamil.',
121 + 'CoVoST2-ID-EN-test': 'CoVoST 2 dataset for speech translation from Indonesian to English.',
122 + 'CoVoST2-ZH-EN-test': 'CoVoST 2 dataset for speech translation from Chinese to English.',
123 + 'CoVoST2-TA-EN-test': 'CoVoST 2 dataset for speech translation from Tamil to English.'
124  }
125
126  cnasr_datasets = {

132  }
133
134  metrics = {
135 + 'wer' : 'Word Error Rate (WER), a common metric for ASR evaluation. (The lower, the better)',
136  'llama3_70b_judge_binary': 'Binary evaluation using the LLAMA3-70B model, for tasks requiring a binary outcome. (0-100 based on score 0-1)',
137 + 'llama3_70b_judge' : 'General evaluation using the LLAMA3-70B model, typically scoring based on subjective judgments. (0-100 based on score 0-5)',
138 + 'meteor' : 'METEOR, a metric used for evaluating text generation, often used in translation or summarization tasks. (Sensitive to output length)',
139 + 'bleu' : 'BLEU (Bilingual Evaluation Understudy), another text generation evaluation metric commonly used in machine translation. (Sensitive to output length)',
140  }
141
142  metrics_info = {
143 + 'wer' : 'Word Error Rate (WER) - The Lower, the better.',
144  'llama3_70b_judge_binary': 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
145 + 'llama3_70b_judge' : 'Model-as-a-Judge Peformance. Using LLAMA-3-70B. Scale from 0-100. The higher, the better.',
146 + 'meteor' : 'METEOR Score. The higher, the better.',
147 + 'bleu' : 'BLEU Score. The higher, the better.',
148  }
149
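The diff above moves `dataname_column_rename_in_table` to the top of `app/content.py` and extends it with the MNSC SQA keys and the `CoVoST2-*` display names. A minimal sketch of how a key-to-display-name map like this is typically applied before a results table is shown; the sample frame, its values, and the model name are illustrative only and are not taken from the repository:

```python
# Sketch: rename raw dataset keys to display names before rendering a table.
import pandas as pd

dataname_column_rename_in_table = {
    'librispeech_test_clean': 'LibriSpeech-Clean',
    'imda_part3_30s_sqa_human_test': 'MNSC-PART3-SQA',
}

# Hypothetical raw results: one row per model, one column per dataset key.
raw = pd.DataFrame(
    {'librispeech_test_clean': [0.021], 'imda_part3_30s_sqa_human_test': [61.3]},
    index=['some-audio-llm'],
)

# Rename only the columns that have a display name; unknown columns keep their keys.
display = raw.rename(columns=dataname_column_rename_in_table)
print(display.columns.tolist())  # ['LibriSpeech-Clean', 'MNSC-PART3-SQA']
```
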
app/draw_diagram.py
CHANGED
@@ -17,7 +17,7 @@ info_df = get_dataframe()
17
18   def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
19
20 - folder = f"./
21
22   # Load the results from CSV
23   data_path = f'{folder}/{category_name.lower()}.csv'

17
18   def draw(folder_name, category_name, dataset_name, metrics, cus_sort=True):
19
20 + folder = f"./results_organized/{metrics}/"
21
22   # Load the results from CSV
23   data_path = f'{folder}/{category_name.lower()}.csv'
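The one-line change in `draw()` points the chart loader at a per-metric folder, `./results_organized/<metric>/`, and keeps reading `<category>.csv` from it. A small sketch of that path convention under the same assumption; `load_results` is an illustrative helper, not the repository's `draw` function:

```python
# Sketch of the assumed results layout: one folder per metric, one CSV per category,
# e.g. ./results_organized/wer/asr_english.csv
import os
import pandas as pd

def load_results(category_name: str, metric: str) -> pd.DataFrame:
    folder = f"./results_organized/{metric}"
    data_path = f"{folder}/{category_name.lower()}.csv"
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"No results file for {category_name!r} under {folder!r}")
    # Same rounding the app applies when it reads result CSVs.
    return pd.read_csv(data_path).round(3)
```
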
app/pages.py
CHANGED
@@ -29,38 +29,33 @@ def dataset_contents(dataset, metrics):
29   def dashboard():
30
31   with st.container():
32 - st.title("AudioBench")
33
34   st.markdown("""
35 - [
36 - [
37 - [![GitHub
38   """)
39
40
41   st.markdown("""
42 -
43 -
44 - - **Dec
45 -
46 -
47 -
48 - - **Dec, 2024**:
49 - - Updated layout and added support for comparison between models with similar sizes.
50 - - Reorganized layout for a better user experience.
51 - - Added performance summary for each task.
52 -
53 - - **Aug 2024**:
54 - - Initial leaderboard is now online.
55   """)
56
57   st.divider()
58
59   st.markdown("""
60 - ####
61
62   - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
63 - - AudioBench is
64
65   Below are the initial 26 datasets that are included in AudioBench. We are now exteneded to over 40 datasets and going to extend to more in the future.
66   """

@@ -68,27 +63,19 @@ def dashboard():
68
69
70   with st.container():
71 - left_co, center_co, right_co = st.columns([1, 0.5, 0.5])
72 - with left_co:
73 - st.image("./style/audio_overview.png",
74 - caption="Overview of the datasets in AudioBench.",
75 - )
76
77   st.markdown('''
78 -
79 -
80   ''')
81
82   st.markdown("###### :dart: Our Benchmark includes: ")
83 - cols = st.columns(
84   cols[0].metric(label="Tasks", value=">8")
85   cols[1].metric(label="Datasets", value=">40")
86   cols[2].metric(label="Evaluated Models", value=">5")
87 -
88 -
89   st.divider()
90   with st.container():
91 - left_co,
92
93   with left_co:
94   st.markdown("""

@@ -104,8 +91,10 @@ def dashboard():
104  """)
105
106
107 -
108 -
109
110  sum = ['Overall']
111  dataset_lists = [

@@ -122,20 +111,23 @@ def asr():
122
123  filters_levelone = sum + dataset_lists
124
125 - left, center, _, middle, right = st.columns([0.
126
127  with left:
128  filter_1 = st.selectbox('Dataset', filters_levelone)
129
130  if filter_1:
131  if filter_1 in sum:
132 - sum_table_mulit_metrix('
133  else:
134  dataset_contents(asr_datsets[filter_1], metrics['wer'])
135 - draw('su', '
136
137
138 -
139  st.title("Task: Automatic Speech Recognition - Singlish")
140
141  sum = ['Overall']

@@ -150,20 +142,22 @@ def singlish_asr():
150
151  filters_levelone = sum + dataset_lists
152
153 - left, center, _, middle, right = st.columns([0.
154
155  with left:
156  filter_1 = st.selectbox('Dataset', filters_levelone)
157
158  if filter_1:
159  if filter_1 in sum:
160 - sum_table_mulit_metrix('
161  else:
162  dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
163 - draw('su', '
164
165
166 -
167  st.title("Task: Automatic Speech Recognition - Mandarin")
168
169  sum = ['Overall']

@@ -173,80 +167,151 @@ def cnasr():
173
174  filters_levelone = sum + dataset_lists
175
176 - left, center, _, middle, right = st.columns([0.
177
178  with left:
179  filter_1 = st.selectbox('Dataset', filters_levelone)
180
181  if filter_1:
182  if filter_1 in sum:
183 - sum_table_mulit_metrix('
184  else:
185  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
186 - draw('su', '
187
188
189
190 -
191 -
192
193  sum = ['Overall']
194
195 - binary = ['CN-College-Listen-MCQ-Test', 'DREAM-TTS-MCQ-Test']
196
197 - rest = ['SLUE-P2-SQA5-Test',
198 - 'Public-SG-Speech-QA-Test',
199 - 'Spoken-Squad-Test']
200
201 -
202
203 -
204
205  with left:
206  filter_1 = st.selectbox('Dataset', filters_levelone)
207
208  if filter_1:
209  if filter_1 in sum:
210 - sum_table_mulit_metrix('
211
212 -
213 -
214 -
215
216  else:
217  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
218 - draw('su', '
219
220 -
221  st.title("Task: Speech Instruction")
222
223  sum = ['Overall']
224
225  dataset_lists = ['OpenHermes-Audio-Test',
226 - 'ALPACA-Audio-Test'
227
228  filters_levelone = sum + dataset_lists
229
230 - left, center, _, middle, right = st.columns([0.
231
232  with left:
233  filter_1 = st.selectbox('Dataset', filters_levelone)
234
235  if filter_1:
236  if filter_1 in sum:
237 - sum_table_mulit_metrix('
238  else:
239  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
240 - draw('su', '
241
242 -
243  st.title("Task: Audio Captioning")
244
245  filters_levelone = ['WavCaps-Test',
246 - 'AudioCaps-Test'
247  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
248
249 - left, center, _, middle, right = st.columns([0.
250
251  with left:
252  filter_1 = st.selectbox('Dataset', filters_levelone)

@@ -255,10 +320,12 @@ def ac():
255
256  if filter_1 or metric:
257  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
258 - draw('asu', '
259
260
261 - def
262  st.title("Task: Audio Scene Question Answering")
263
264  sum = ['Overall']

@@ -269,44 +336,50 @@ def asqa():
269
270  filters_levelone = sum + dataset_lists
271
272 - left, center, _, middle, right = st.columns([0.
273
274  with left:
275  filter_1 = st.selectbox('Dataset', filters_levelone)
276
277  if filter_1:
278  if filter_1 in sum:
279 - sum_table_mulit_metrix('
280  else:
281  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
282 - draw('asu', '
283
284
285 - def
286  st.title("Task: Emotion Recognition")
287
288  sum = ['Overall']
289
290 - dataset_lists = [
291 -
292 -
293
294  filters_levelone = sum + dataset_lists
295
296 - left, center, _, middle, right = st.columns([0.
297
298  with left:
299  filter_1 = st.selectbox('Dataset', filters_levelone)
300
301  if filter_1:
302  if filter_1 in sum:
303 - sum_table_mulit_metrix('
304  else:
305 - dataset_contents(er_datasets[filter_1], metrics['
306 - draw('vu', '
307
308
309 -
310  st.title("Task: Accent Recognition")
311
312  sum = ['Overall']

@@ -315,7 +388,7 @@ def ar():
315
316  filters_levelone = sum + dataset_lists
317
318 - left, center, _, middle, right = st.columns([0.
319
320  with left:
321  filter_1 = st.selectbox('Dataset', filters_levelone)

@@ -323,14 +396,15 @@ def ar():
323
324  if filter_1:
325  if filter_1 in sum:
326 - sum_table_mulit_metrix('
327 - # sum_table('aR', 'llama3_70b_judge')
328  else:
329  dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
330 - draw('vu', '
331
332
333 - def
334  st.title("Task: Gender Recognition")
335
336  sum = ['Overall']

@@ -340,47 +414,22 @@ def gr():
340
341  filters_levelone = sum + dataset_lists
342
343 - left, center, _, middle, right = st.columns([0.
344
345  with left:
346  filter_1 = st.selectbox('Dataset', filters_levelone)
347
348  if filter_1:
349  if filter_1 in sum:
350 - sum_table_mulit_metrix('
351  else:
352 - dataset_contents(gr_datasets[filter_1], metrics['
353 - draw('vu', '
354
355
356 - def spt():
357 - st.title("Task: Speech Translation")
358 -
359 - sum = ['Overall']
360 - dataset_lists = [
361 - 'Covost2-EN-ID-test',
362 - 'Covost2-EN-ZH-test',
363 - 'Covost2-EN-TA-test',
364 - 'Covost2-ID-EN-test',
365 - 'Covost2-ZH-EN-test',
366 - 'Covost2-TA-EN-test']
367 -
368 - filters_levelone = sum + dataset_lists
369 -
370 - left, center, _, middle, right = st.columns([0.2, 0.2, 0.2, 0.2 ,0.2])
371 -
372 - with left:
373 - filter_1 = st.selectbox('Dataset', filters_levelone)
374 -
375 - if filter_1:
376 - if filter_1 in sum:
377 - sum_table_mulit_metrix('st', ['bleu'])
378 - else:
379 - dataset_contents(spt_datasets[filter_1], metrics['bleu'])
380 - draw('su', 'ST', filter_1, 'bleu')
381
382
383 - def
384  st.title("Task: Music Understanding - MCQ Questions")
385
386  sum = ['Overall']

@@ -390,17 +439,17 @@ def music_mcq():
390
391  filters_levelone = sum + dataset_lists
392
393 - left, center, _, middle, right = st.columns([0.
394
395  with left:
396  filter_1 = st.selectbox('Dataset', filters_levelone)
397
398  if filter_1:
399  if filter_1 in sum:
400 - sum_table_mulit_metrix('
401  else:
402 - dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['
403 - draw('vu', '
404
405
406

29   def dashboard():
30
31   with st.container():
32 + st.title("Leaderboard for AudioBench")
33
34   st.markdown("""
35 + [gh1]: https://github.com/AudioLLMs/AudioBench
36 + [gh2]: https://github.com/AudioLLMs/AudioBench
37 + **Toolkit:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/AudioBench?style=social)][gh1] |
38 + [**Research Paper**](https://arxiv.org/abs/2406.16020) |
39 + **Resource for AudioLLMs:** [![GitHub Repo stars](https://img.shields.io/github/stars/AudioLLMs/Awesome-Audio-LLM?style=social)][gh2]
40   """)
41
42
43   st.markdown("""
44 + #### Recent updates
45 + - **Jan. 2025**: Update the layout.
46 + - **Dec. 2024**: Added MuChoMusic dataset for Music Understanding - MCQ Questions. From Paper: https://arxiv.org/abs/2408.01337.
47 + - **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC).
48 + - **Dec. 2024**: Updated layout and added support for comparison between models with similar sizes. 1) Reorganized layout for a better user experience. 2) Added performance summary for each task.
49 + - **Aug. 2024**: Initial leaderboard is now online.
50   """)
51
52   st.divider()
53
54   st.markdown("""
55 + #### Evaluating Audio-based Large Language Models
56
57   - AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models.
58 + - AudioBench is an evaluation benchmark that we continually improve and maintain.
59
60   Below are the initial 26 datasets that are included in AudioBench. We are now exteneded to over 40 datasets and going to extend to more in the future.
61   """

63
64
65   with st.container():
66
67   st.markdown('''
68   ''')
69
70   st.markdown("###### :dart: Our Benchmark includes: ")
71 + cols = st.columns(8)
72   cols[0].metric(label="Tasks", value=">8")
73   cols[1].metric(label="Datasets", value=">40")
74   cols[2].metric(label="Evaluated Models", value=">5")
75 +
76   st.divider()
77   with st.container():
78 + left_co, right_co = st.columns([1, 0.7])
79
80   with left_co:
81   st.markdown("""

91   """)
92
93
94 +
95 +
96 + def asr_english():
97 + st.title("Task: Automatic Speech Recognition - English")
98
99   sum = ['Overall']
100  dataset_lists = [

111
112  filters_levelone = sum + dataset_lists
113
114 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
115
116  with left:
117  filter_1 = st.selectbox('Dataset', filters_levelone)
118
119  if filter_1:
120  if filter_1 in sum:
121 + sum_table_mulit_metrix('asr_english', ['wer'])
122  else:
123  dataset_contents(asr_datsets[filter_1], metrics['wer'])
124 + draw('su', 'asr_english', filter_1, 'wer', cus_sort=True)
125 +
126
127
128 +
129 +
130 + def asr_singlish():
131  st.title("Task: Automatic Speech Recognition - Singlish")
132
133  sum = ['Overall']

142
143  filters_levelone = sum + dataset_lists
144
145 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
146
147  with left:
148  filter_1 = st.selectbox('Dataset', filters_levelone)
149
150  if filter_1:
151  if filter_1 in sum:
152 + sum_table_mulit_metrix('asr_singlish', ['wer'])
153  else:
154  dataset_contents(singlish_asr_datasets[filter_1], metrics['wer'])
155 + draw('su', 'asr_singlish', filter_1, 'wer')
156 +
157
158
159 +
160 + def asr_mandarin():
161  st.title("Task: Automatic Speech Recognition - Mandarin")
162
163  sum = ['Overall']

167
168  filters_levelone = sum + dataset_lists
169
170 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
171
172  with left:
173  filter_1 = st.selectbox('Dataset', filters_levelone)
174
175  if filter_1:
176  if filter_1 in sum:
177 + sum_table_mulit_metrix('asr_mandarin', ['wer'])
178  else:
179  dataset_contents(cnasr_datasets[filter_1], metrics['wer'])
180 + draw('su', 'asr_mandarin', filter_1, 'wer')
181
182
183
184 +
185 + def speech_translation():
186 + st.title("Task: Speech Translation")
187
188  sum = ['Overall']
189 + dataset_lists = [
190 + 'CoVoST2-EN-ID-test',
191 + 'CoVoST2-EN-ZH-test',
192 + 'CoVoST2-EN-TA-test',
193 + 'CoVoST2-ID-EN-test',
194 + 'CoVoST2-ZH-EN-test',
195 + 'CoVoST2-TA-EN-test']
196 +
197 + filters_levelone = sum + dataset_lists
198 +
199 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
200 +
201 + with left:
202 + filter_1 = st.selectbox('Dataset', filters_levelone)
203 +
204 + if filter_1:
205 + if filter_1 in sum:
206 + sum_table_mulit_metrix('st', ['bleu'])
207 + else:
208 + dataset_contents(spt_datasets[filter_1], metrics['bleu'])
209 + draw('su', 'ST', filter_1, 'bleu')
210
211
212
213 +
214 + def speech_question_answering_english():
215 + st.title("Task: Spoken Question Answering - English")
216
217 + sum = ['Overall']
218 +
219 + dataset_lists = [
220 + 'CN-College-Listen-MCQ-Test',
221 + 'DREAM-TTS-MCQ-Test',
222 + 'SLUE-P2-SQA5-Test',
223 + 'Public-SG-Speech-QA-Test',
224 + 'Spoken-Squad-Test',
225 + ]
226 +
227 + filters_levelone = sum + dataset_lists
228 +
229 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
230
231  with left:
232  filter_1 = st.selectbox('Dataset', filters_levelone)
233
234  if filter_1:
235  if filter_1 in sum:
236 + sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])
237 +
238 + #elif filter_1 in dataset_lists:
239 + # dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
240 + # draw('su', 'SQA', filter_1, 'llama3_70b_judge')
241 +
242 + else:
243 + dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
244 + draw('su', 'sqa_english', filter_1, 'llama3_70b_judge')
245 +
246 +
247 +
248
249 + def speech_question_answering_singlish():
250 + st.title("Task: Spoken Question Answering - Singlish")
251 +
252 + sum = ['Overall']
253 +
254 + dataset_lists = [
255 + 'MNSC-PART3-SQA',
256 + 'MNSC-PART4-SQA',
257 + 'MNSC-PART5-SQA',
258 + 'MNSC-PART6-SQA',
259 + ]
260 +
261 +
262 + filters_levelone = sum + dataset_lists
263 +
264 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
265 +
266 + with left:
267 + filter_1 = st.selectbox('Dataset', filters_levelone)
268 +
269 + if filter_1:
270 + if filter_1 in sum:
271 + sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])
272
273  else:
274  dataset_contents(sqa_datasets[filter_1], metrics['llama3_70b_judge'])
275 + draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge')
276 +
277 +
278
279 +
280 + def speech_instruction():
281  st.title("Task: Speech Instruction")
282
283  sum = ['Overall']
284
285  dataset_lists = ['OpenHermes-Audio-Test',
286 + 'ALPACA-Audio-Test',
287 + ]
288
289  filters_levelone = sum + dataset_lists
290
291 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
292
293  with left:
294  filter_1 = st.selectbox('Dataset', filters_levelone)
295
296  if filter_1:
297  if filter_1 in sum:
298 + sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge'])
299  else:
300  dataset_contents(si_datasets[filter_1], metrics['llama3_70b_judge'])
301 + draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge')
302 +
303 +
304
305 +
306 + def audio_captioning():
307  st.title("Task: Audio Captioning")
308
309  filters_levelone = ['WavCaps-Test',
310 + 'AudioCaps-Test',
311 + ]
312  filters_leveltwo = ['Llama3-70b-judge', 'Meteor']
313
314 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
315
316  with left:
317  filter_1 = st.selectbox('Dataset', filters_levelone)

320
321  if filter_1 or metric:
322  dataset_contents(ac_datasets[filter_1], metrics[metric.lower().replace('-', '_')])
323 + draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_'))
324 +
325 +
326
327
328 + def audio_scene_question_answering():
329  st.title("Task: Audio Scene Question Answering")
330
331  sum = ['Overall']

336
337  filters_levelone = sum + dataset_lists
338
339 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
340
341  with left:
342  filter_1 = st.selectbox('Dataset', filters_levelone)
343
344  if filter_1:
345  if filter_1 in sum:
346 + sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge'])
347  else:
348  dataset_contents(asqa_datasets[filter_1], metrics['llama3_70b_judge'])
349 + draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge')
350 +
351 +
352
353
354 + def emotion_recognition():
355  st.title("Task: Emotion Recognition")
356
357  sum = ['Overall']
358
359 + dataset_lists = [
360 + 'IEMOCAP-Emotion-Test',
361 + 'MELD-Sentiment-Test',
362 + 'MELD-Emotion-Test',
363 + ]
364
365  filters_levelone = sum + dataset_lists
366
367 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
368
369  with left:
370  filter_1 = st.selectbox('Dataset', filters_levelone)
371
372  if filter_1:
373  if filter_1 in sum:
374 + sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge'])
375  else:
376 + dataset_contents(er_datasets[filter_1], metrics['llama3_70b_judge'])
377 + draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge')
378 +
379
380
381 +
382 + def accent_recognition():
383  st.title("Task: Accent Recognition")
384
385  sum = ['Overall']

388
389  filters_levelone = sum + dataset_lists
390
391 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
392
393  with left:
394  filter_1 = st.selectbox('Dataset', filters_levelone)

396
397  if filter_1:
398  if filter_1 in sum:
399 + sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge'])
400  else:
401  dataset_contents(ar_datsets[filter_1], metrics['llama3_70b_judge'])
402 + draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge')
403 +
404 +
405
406
407 + def gender_recognition():
408  st.title("Task: Gender Recognition")
409
410  sum = ['Overall']

414
415  filters_levelone = sum + dataset_lists
416
417 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
418
419  with left:
420  filter_1 = st.selectbox('Dataset', filters_levelone)
421
422  if filter_1:
423  if filter_1 in sum:
424 + sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge'])
425  else:
426 + dataset_contents(gr_datasets[filter_1], metrics['llama3_70b_judge'])
427 + draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge')
428
429

430
431
432 + def music_understanding():
433  st.title("Task: Music Understanding - MCQ Questions")
434
435  sum = ['Overall']

439
440  filters_levelone = sum + dataset_lists
441
442 + left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2 ,0.2])
443
444  with left:
445  filter_1 = st.selectbox('Dataset', filters_levelone)
446
447  if filter_1:
448  if filter_1 in sum:
449 + sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge'])
450  else:
451 + dataset_contents(MUSIC_MCQ_DATASETS[filter_1], metrics['llama3_70b_judge'])
452 + draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge')
453
454
455
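The rewritten `app/pages.py` replaces the old short page functions (`asr`, `singlish_asr`, `cnasr`, `spt`, ...) with one descriptively named function per task (`asr_english`, `asr_singlish`, `asr_mandarin`, `speech_translation`, `speech_question_answering_english`, ..., `music_understanding`). Each new function follows the same select-then-render pattern: build an 'Overall' + datasets list, show a selectbox in the left column, then render either the multi-metric summary table or the per-dataset description and chart. A condensed sketch of that shared pattern follows; the helper name `task_page` is illustrative, and the sketch assumes `sum_table_mulit_metrix`, `dataset_contents`, `draw`, and `metrics` are in scope the way they are inside `pages.py`:

```python
# Sketch of the page pattern shared by the new task functions in pages.py.
import streamlit as st
# Assumed to be in scope, as in pages.py: sum_table_mulit_metrix, dataset_contents,
# draw, and the `metrics` description dict from app/content.py.

def task_page(title, summary_key, dataset_descriptions, metric, chart_group='su'):
    st.title(f"Task: {title}")

    options = ['Overall'] + list(dataset_descriptions)
    left, *_ = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
    with left:
        choice = st.selectbox('Dataset', options)

    if choice == 'Overall':
        # Aggregated table across every dataset of the task.
        sum_table_mulit_metrix(summary_key, [metric])
    else:
        # Description of the chosen dataset plus its chart.
        dataset_contents(dataset_descriptions[choice], metrics[metric])
        draw(chart_group, summary_key, choice, metric)
```

For example, `task_page('Gender Recognition', 'gender_recognition', gr_datasets, 'llama3_70b_judge', chart_group='vu')` would mirror the flow of the new `gender_recognition()` page.
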
app/summarization.py
CHANGED
@@ -21,7 +21,7 @@ def sum_table_mulit_metrix(task_name, metrics_lists: List[str]):
21   # combine chart data from multiple sources
22   chart_data = pd.DataFrame()
23   for metrics in metrics_lists:
24 - folder = f"./
25   data_path = f'{folder}/{task_name.lower()}.csv'
26   one_chart_data = pd.read_csv(data_path).round(3)
27   if len(chart_data) == 0:

21   # combine chart data from multiple sources
22   chart_data = pd.DataFrame()
23   for metrics in metrics_lists:
24 + folder = f"./results_organized/{metrics}"
25   data_path = f'{folder}/{task_name.lower()}.csv'
26   one_chart_data = pd.read_csv(data_path).round(3)
27   if len(chart_data) == 0:
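On the summary side, `sum_table_mulit_metrix` now reads one CSV per metric from the same `./results_organized/<metric>/` layout and folds them into a single table. A rough sketch of that combining step; the `Model` join key and the outer merge are assumptions, since the branch that merges the second and later metrics is not visible in this diff:

```python
# Sketch: combine per-metric CSVs for one task into a single DataFrame.
import pandas as pd

def combine_metric_tables(task_name, metrics_lists):
    chart_data = pd.DataFrame()
    for metric in metrics_lists:
        folder = f"./results_organized/{metric}"
        data_path = f"{folder}/{task_name.lower()}.csv"
        one_chart_data = pd.read_csv(data_path).round(3)
        if len(chart_data) == 0:
            chart_data = one_chart_data
        else:
            # Assumed: later metric tables are joined onto the first by model name.
            chart_data = chart_data.merge(one_chart_data, on='Model', how='outer')
    return chart_data
```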