Spaces:

MERaLiON
/

MERaLiON-AudioLLM

Running

App Files Community

YingxuHe committed on Dec 26, 2024

Commit

67da2ee

verified ·

1 Parent(s): af54a08

Update pages.py

Browse files

Files changed (1) hide show

pages.py +24 -93

pages.py CHANGED Viewed

@@ -101,105 +101,34 @@ def audio_llm():
     with col3:
         audio_samples_w_instruct = {
-            '1_ASR_IMDA_PART1_ASR_v2_141' : "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
-            '2_ASR_IMDA_PART1_ASR_v2_2258': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
-            '3_ASR_IMDA_PART1_ASR_v2_2265': "Example Instruction:\n\n- Turn the spoken language into a text format.",
-            '4_ASR_IMDA_PART2_ASR_v2_999' : "Example Instruction:\n\n- Translate the spoken words into text format.",
-            '5_ASR_IMDA_PART2_ASR_v2_2241': "Example Instruction: \n\n- Translate the spoken words into text format.",
-            '6_ASR_IMDA_PART2_ASR_v2_3409': "Example Instruction: \n\n- Translate the spoken words into text format.",
-            '7_ASR_IMDA_PART3_30_ASR_v2_2269': "Example Instruction:\n\n- Need this talk written down, please.",
-            '8_ASR_IMDA_PART3_30_ASR_v2_1698': "Example Instruction: \n\n- Need this talk written down, please.",
-            '9_ASR_IMDA_PART3_30_ASR_v2_2474': "Example Instruction: \n\n- Need this talk written down, please.",
-            '10_ASR_IMDA_PART4_30_ASR_v2_1527': "Example Instruction:\n\n- Write out the dialogue as text.",
-            '11_ASR_IMDA_PART4_30_ASR_v2_3771': "Example Instruction: \n\n- Write out the dialogue as text.",
-            '12_ASR_IMDA_PART4_30_ASR_v2_103' : "Example Instruction: \n\n- Write out the dialogue as text.",
-            '13_ASR_IMDA_PART5_30_ASR_v2_1446': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
-            '14_ASR_IMDA_PART5_30_ASR_v2_2281': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
-            '15_ASR_IMDA_PART5_30_ASR_v2_4388': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
-            '16_ASR_IMDA_PART6_30_ASR_v2_576': "Example Instruction: \n\n- Record the spoken word in text form.",
-            '17_ASR_IMDA_PART6_30_ASR_v2_1413': "Example Instruction: \n\n- Record the spoken word in text form.",
-            '18_ASR_IMDA_PART6_30_ASR_v2_2834': "Example Instruction: \n\n- Record the spoken word in text form.",
-            '19_ASR_AIShell_zh_ASR_v2_5044': "Example Instruction: \n\n- Transform the oral presentation into a text document.",
-            '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "Example Instruction: \n\n- Please provide a written transcription of the speech.",
-            '21_ASR_LIBRISPEECH_OTHER_ASR_V2_656': "Example Instruction: \n\n- Can you make this audio into text?",
-            '22_ASR_MEDIACORP_ASR_V2_35': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",
-            '23_ASR_MEDIACORP_ASR_V2_6': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",
-            '24_ASR_PEOPLES_SPEECH_ASR_V2_21376': "Example Instruction: \n\n- Need this audio turned into a written piece.",
-            '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "Example Instruction: \n\n- Please translate the given speech to English.",
-            '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "Example Instruction: \n\n- Please translate the given speech to Chinese.",
-            '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "Example Instruction: \n\n- Please translate the given speech to Chinese.",
-            '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': "Example Instruction: \n\n- Please follow the instruction in the speech.",
-            '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': "Example Instruction: \n\n- Please follow the instruction in the speech.",
-            '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "Example Instruction: \n\n- Please follow the instruction in the speech.",
-            '31_SI_OPENHERMES-AUDIO_SI_V2_673': "Example Instruction: \n\n- Please follow the instruction in the speech.",
-            '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "Example Instruction: \n\n- What does the man think the woman should do at 4:00?",
-            '33_SQA_IMDA_PART3_30_SQA_V2_2310': "Example Instruction: \n\n- Does Speaker2's wife cook for Speaker2 when they are at home?",
-            '34_SQA_IMDA_PART3_30_SQA_V2_3621': "Example Instruction: \n\n- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",
-            '35_SQA_IMDA_PART3_30_SQA_V2_4062': "Example Instruction: \n\n- What is the color of the vase mentioned in the dialogue?",
-            '36_DS_IMDA_PART4_30_DS_V2_849': "Example Instruction: \n\n- Condense the dialogue into a concise summary highlighting major topics and conclusions.",
-            '39_Paralingual_IEMOCAP_ER_V2_91': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
-            '40_Paralingual_IEMOCAP_ER_V2_567': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
-            '41_Paralingual_IEMOCAP_ER_V2_468': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
-            '42_Paralingual_IEMOCAP_GR_V2_320': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
-            '43_Paralingual_IEMOCAP_GR_V2_129': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
-            '44_Paralingual_IEMOCAP_GR_V2_213': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",
-            '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "Example Instruction: \n\n- So, who's speaking in the second part of the clip? \n\n- So, who's speaking in the first part of the clip?",
-            '46_Paralingual_IMDA_PART3_30_GR_V2_1442': "Example Instruction: \n\n- Who starts the conversation in the dialogue?",
-            '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "Example Instruction: \n\n- Can you guess which ethnic group this person is from based on their accent?",
-            '48_Paralingual_IMDA_PART3_30_NR_V2_15735': "Example Instruction: \n\n- In an analysis of the audio recording, determine the ethnic backgrounds of the speakers based on the accents used.",
-            '49_Paralingual_MELD_ER_V2_676': "Example Instruction: \n\n- What emotions do you think the speaker is expressing?",
-            '50_Paralingual_MELD_ER_V2_692': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",
-            '51_Paralingual_VOXCELEB1_GR_V2_2148': "Example Instruction: \n\n- May I know the gender of the speaker?",
-            '52_Paralingual_VOXCELEB1_GR_V2_3282': "Example Instruction: \n\n- I'd appreciate knowing the gender of the speaker, if possible.",
-            '53_Paralingual_VOXCELEB1_NR_V2_2286': "Example Instruction: \n\n- What's the nationality identity of the speaker?",
-            '54_Paralingual_VOXCELEB1_NR_V2_2742': "Example Instruction: \n\n- I'm intrigued by the speaker's nationality, could you enlighten me?",
-            '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': "Example Instruction: \n\n- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",
-            '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "Example Instruction: \n\n- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",
-            '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "Example Instruction: \n\n- How does the author respond to parents' worries about masks in schools?"
-            }
         audio_sample_names = [audio_sample_name for audio_sample_name in audio_samples_w_instruct.keys()]
@@ -220,7 +149,6 @@ def audio_llm():
             st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
             st.session_state.audio_array = bytes_to_array(audio_bytes)
-    st.write(st.session_state.default_instruction)
     st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
     st.session_state.update(on_upload=False, on_record=False, on_select=False)
@@ -233,6 +161,9 @@ def audio_llm():
     with col5:
         st.slider(label='Top P', min_value=0.0, max_value=1.0, value=1.0, key='top_p')
     st.markdown(
         """
         <style>

     with col3:
         audio_samples_w_instruct = {
+            '1_ASR_IMDA_PART1_ASR_v2_141' : "- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+            '2_ASR_IMDA_PART1_ASR_v2_2258': "- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
+            '3_ASR_IMDA_PART1_ASR_v2_2265': "- Turn the spoken language into a text format.",
+            '4_ASR_IMDA_PART2_ASR_v2_999' : "- Translate the spoken words into text format.",
+            '5_ASR_IMDA_PART2_ASR_v2_2241': "- Translate the spoken words into text format.",
+            '6_ASR_IMDA_PART2_ASR_v2_3409': "- Translate the spoken words into text format.",
+            '7_ASR_IMDA_PART3_30_ASR_v2_2269': "- Need this talk written down, please.",
+            '8_ASR_IMDA_PART3_30_ASR_v2_1698': "- Need this talk written down, please.",
+            '9_ASR_IMDA_PART3_30_ASR_v2_2474': "- Need this talk written down, please.",
+            '10_ASR_IMDA_PART4_30_ASR_v2_1527': "- Write out the dialogue as text.",
+            '11_ASR_IMDA_PART4_30_ASR_v2_3771': "- Write out the dialogue as text.",
+            '12_ASR_IMDA_PART4_30_ASR_v2_103' : "- Write out the dialogue as text.",
+            '13_ASR_IMDA_PART5_30_ASR_v2_1446': "- Translate this vocal recording into a textual format.",
+            '14_ASR_IMDA_PART5_30_ASR_v2_2281': "- Translate this vocal recording into a textual format.",
+            '15_ASR_IMDA_PART5_30_ASR_v2_4388': "- Translate this vocal recording into a textual format.",
+            '16_ASR_IMDA_PART6_30_ASR_v2_576': "- Record the spoken word in text form.",
+            '17_ASR_IMDA_PART6_30_ASR_v2_1413': "- Record the spoken word in text form.",
+            '18_ASR_IMDA_PART6_30_ASR_v2_2834': "- Record the spoken word in text form.",
+            '19_ASR_AIShell_zh_ASR_v2_5044': "- Transform the oral presentation into a text document.",
+            '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "- Please provide a written transcription of the speech."
+        }
         audio_sample_names = [audio_sample_name for audio_sample_name in audio_samples_w_instruct.keys()]
             st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
             st.session_state.audio_array = bytes_to_array(audio_bytes)
     st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
     st.session_state.update(on_upload=False, on_record=False, on_select=False)
     with col5:
         st.slider(label='Top P', min_value=0.0, max_value=1.0, value=1.0, key='top_p')
+    st.markdown("**Example Instruction:**")
+    st.write(st.session_state.default_instruction)
     st.markdown(
         """
         <style>