Spaces:

MERaLiON
/

MERaLiON-AudioLLM

Running

File size: 13,139 Bytes

import os
import base64

import numpy as np
import streamlit as st
import streamlit.components.v1 as components
from streamlit_mic_recorder import mic_recorder

from utils import load_model, generate_response, bytes_to_array


def home_page():
    """Render the landing page: a banner image next to the welcome title."""
    # Banner markup plus the CSS that stretches the image across its column.
    banner_markup = """
    <div class="banner">
        <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRhB2e_AhOe11wKxnnwOmOVg9E7J1MBgiTeYzzFAESwcCP5IbBAc2X8BwGChMfJzwqtVg&usqp=CAU" alt="Banner Image">
    </div>
    <style>
        .banner {
            width: 100%;
            height: 200px;
            overflow: visible;
        }
        .banner img {
            width: 100%;
            object-fit: cover;
        }
    </style>
    """

    # Two columns in a 1:4 width ratio: narrow one for the banner, wide one
    # for the heading.
    banner_col, title_col = st.columns([1, 4])

    with banner_col:
        components.html(banner_markup)

    with title_col:
        st.write("# Welcome to MERaLiON - AudioLLMs 🤖")

    # Empty markdown element emitted below the header row.
    st.markdown("")


# Bundled demo clips, keyed by file stem under ``audio_samples/`` (the ".wav"
# suffix is appended when the file is read), mapped to the example-instruction
# text displayed once the clip has been selected.
_AUDIO_SAMPLES_W_INSTRUCT = {
    '1_ASR_IMDA_PART1_ASR_v2_141' : "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
    '2_ASR_IMDA_PART1_ASR_v2_2258': "Example Instruction:\n\n- Turn the spoken language into a text format.\n\n- Please translate the content into Chinese.",
    '3_ASR_IMDA_PART1_ASR_v2_2265': "Example Instruction:\n\n- Turn the spoken language into a text format.",

    '4_ASR_IMDA_PART2_ASR_v2_999' : "Example Instruction:\n\n- Translate the spoken words into text format.",
    '5_ASR_IMDA_PART2_ASR_v2_2241': "Example Instruction: \n\n- Translate the spoken words into text format.",
    '6_ASR_IMDA_PART2_ASR_v2_3409': "Example Instruction: \n\n- Translate the spoken words into text format.",

    '7_ASR_IMDA_PART3_30_ASR_v2_2269': "Example Instruction:\n\n- Need this talk written down, please.",
    '8_ASR_IMDA_PART3_30_ASR_v2_1698': "Example Instruction: \n\n- Need this talk written down, please.",
    '9_ASR_IMDA_PART3_30_ASR_v2_2474': "Example Instruction: \n\n- Need this talk written down, please.",

    '10_ASR_IMDA_PART4_30_ASR_v2_1527': "Example Instruction:\n\n- Write out the dialogue as text.",
    '11_ASR_IMDA_PART4_30_ASR_v2_3771': "Example Instruction: \n\n- Write out the dialogue as text.",
    '12_ASR_IMDA_PART4_30_ASR_v2_103' : "Example Instruction: \n\n- Write out the dialogue as text.",

    '13_ASR_IMDA_PART5_30_ASR_v2_1446': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
    '14_ASR_IMDA_PART5_30_ASR_v2_2281': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",
    '15_ASR_IMDA_PART5_30_ASR_v2_4388': "Example Instruction: \n\n- Translate this vocal recording into a textual format.",

    '16_ASR_IMDA_PART6_30_ASR_v2_576': "Example Instruction: \n\n- Record the spoken word in text form.",
    '17_ASR_IMDA_PART6_30_ASR_v2_1413': "Example Instruction: \n\n- Record the spoken word in text form.",
    '18_ASR_IMDA_PART6_30_ASR_v2_2834': "Example Instruction: \n\n- Record the spoken word in text form.",

    '19_ASR_AIShell_zh_ASR_v2_5044': "Example Instruction: \n\n- Transform the oral presentation into a text document.",

    '20_ASR_LIBRISPEECH_CLEAN_ASR_V2_833': "Example Instruction: \n\n- Please provide a written transcription of the speech.",

    '21_ASR_LIBRISPEECH_OTHER_ASR_V2_656': "Example Instruction: \n\n- Can you make this audio into text?",

    '22_ASR_MEDIACORP_ASR_V2_35': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",

    '23_ASR_MEDIACORP_ASR_V2_6': "Example Instruction: \n\n- Transform the audio speech into a written transcript.",

    '24_ASR_PEOPLES_SPEECH_ASR_V2_21376': "Example Instruction: \n\n- Need this audio turned into a written piece.",

    '25_ST_COVOST2_ZH-CN_EN_ST_V2_4567': "Example Instruction: \n\n- Please translate the given speech to English.",

    '26_ST_COVOST2_EN_ZH-CN_ST_V2_5422': "Example Instruction: \n\n- Please translate the given speech to Chinese.",

    '27_ST_COVOST2_EN_ZH-CN_ST_V2_6697': "Example Instruction: \n\n- Please translate the given speech to Chinese.",

    '28_SI_ALPACA-GPT4-AUDIO_SI_V2_299': "Example Instruction: \n\n- Please follow the instruction in the speech.",

    '29_SI_ALPACA-GPT4-AUDIO_SI_V2_750': "Example Instruction: \n\n- Please follow the instruction in the speech.",

    '30_SI_ALPACA-GPT4-AUDIO_SI_V2_1454': "Example Instruction: \n\n- Please follow the instruction in the speech.",

    '31_SI_OPENHERMES-AUDIO_SI_V2_673': "Example Instruction: \n\n- Please follow the instruction in the speech.",

    '32_SQA_CN_COLLEDGE_ENTRANCE_ENGLISH_TEST_SQA_V2_572': "Example Instruction: \n\n- What does the man think the woman should do at 4:00?",

    '33_SQA_IMDA_PART3_30_SQA_V2_2310': "Example Instruction: \n\n- Does Speaker2's wife cook for Speaker2 when they are at home?",

    '34_SQA_IMDA_PART3_30_SQA_V2_3621': "Example Instruction: \n\n- Does the phrase \"#gai-gai#\" have a meaning in Chinese or Hokkien language?",

    '35_SQA_IMDA_PART3_30_SQA_V2_4062': "Example Instruction: \n\n- What is the color of the vase mentioned in the dialogue?",

    '36_DS_IMDA_PART4_30_DS_V2_849': "Example Instruction: \n\n- Condense the dialogue into a concise summary highlighting major topics and conclusions.",

    '39_Paralingual_IEMOCAP_ER_V2_91': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

    '40_Paralingual_IEMOCAP_ER_V2_567': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

    '41_Paralingual_IEMOCAP_ER_V2_468': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

    '42_Paralingual_IEMOCAP_GR_V2_320': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",

    '43_Paralingual_IEMOCAP_GR_V2_129': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",

    '44_Paralingual_IEMOCAP_GR_V2_213': "Example Instruction: \n\n- Is it possible for you to identify whether the speaker in this recording is male or female?",

    '45_Paralingual_IMDA_PART3_30_GR_V2_12312': "Example Instruction: \n\n- So, who's speaking in the second part of the clip? \n\n- So, who's speaking in the first part of the clip?",

    '46_Paralingual_IMDA_PART3_30_GR_V2_1442': "Example Instruction: \n\n- Who starts the conversation in the dialogue?",

    '47_Paralingual_IMDA_PART3_30_NR_V2_10479': "Example Instruction: \n\n- Can you guess which ethnic group this person is from based on their accent?",

    '48_Paralingual_IMDA_PART3_30_NR_V2_15735': "Example Instruction: \n\n- In an analysis of the audio recording, determine the ethnic backgrounds of the speakers based on the accents used.",

    '49_Paralingual_MELD_ER_V2_676': "Example Instruction: \n\n- What emotions do you think the speaker is expressing?",

    '50_Paralingual_MELD_ER_V2_692': "Example Instruction: \n\n- Based on the speaker's speech patterns, what do you think they are feeling?",

    '51_Paralingual_VOXCELEB1_GR_V2_2148': "Example Instruction: \n\n- May I know the gender of the speaker?",

    '52_Paralingual_VOXCELEB1_GR_V2_3282': "Example Instruction: \n\n- I'd appreciate knowing the gender of the speaker, if possible.",

    '53_Paralingual_VOXCELEB1_NR_V2_2286': "Example Instruction: \n\n- What's the nationality identity of the speaker?",

    '54_Paralingual_VOXCELEB1_NR_V2_2742': "Example Instruction: \n\n- I'm intrigued by the speaker's nationality, could you enlighten me?",

    '55_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_2': "Example Instruction: \n\n- What impact would the growth of the healthcare sector have on the country's economy in terms of employment and growth?",

    '56_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_415': "Example Instruction: \n\n- Based on the statement, can you summarize the speaker's position on the recent controversial issues in Singapore?",

    '57_SQA_PUBLIC_SPEECH_SG_TEST_SQA_V2_460': "Example Instruction: \n\n- How does the author respond to parents' worries about masks in schools?",
}


def _store_audio(audio_bytes):
    """Keep the given raw audio bytes in session state in both used forms.

    ``audio_base64`` (base64 text) is what gets passed to
    ``generate_response``; ``audio_array`` (result of ``bytes_to_array``) is
    what gets passed to ``st.audio``.
    """
    st.session_state.audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
    st.session_state.audio_array = bytes_to_array(audio_bytes)


def audio_llm():
    """Render the audio playground page.

    The page offers three ways to provide audio (microphone recording, file
    upload, or one of the bundled samples), plays back the current audio, and
    sends the user's chat instruction together with that audio to
    ``generate_response``, streaming the reply into the chat area.

    Session-state keys used:
        client, model_name   -- pair returned by ``load_model()``; loaded once.
        audio_base64         -- current audio as base64 text.
        audio_array          -- current audio as returned by ``bytes_to_array``.
        default_instruction  -- example instruction of the selected sample.
        on_record/on_upload/on_select -- one-shot flags set by the widget
            callbacks and cleared at the end of each run, so new audio is
            processed only on the run triggered by that widget.
        messages             -- list of ``{"role", "content"}`` dicts.

    Raises:
        Exception: anything raised while generating or streaming the reply is
            shown in the chat area and then re-raised.
    """
    with st.sidebar:
        st.divider()
        st.markdown("""<div class="sidebar-intro">
                    <p><strong>Purpose</strong>: Complex Audio Understanding</p>
                    <p><strong>Name</strong>: MERaLiON-AudioLLM-v1</p>
                    <p><strong>Version</strong>: Dec. 20, 2024</p>
                    </div>""", unsafe_allow_html=True)

    if st.sidebar.button('Clear History'):
        # Reset both audio representations together; clearing only the array
        # would leave the previous clip in audio_base64, which is what is
        # sent to the model.
        st.session_state.update(messages=[],
                                on_upload=False,
                                on_record=False,
                                on_select=False,
                                audio_base64='',
                                audio_array=np.array([]))

    if "client" not in st.session_state or 'model_name' not in st.session_state:
        st.session_state.client, st.session_state.model_name = load_model()

    # Make sure every key read below exists, whatever happened on earlier runs.
    if "audio_array" not in st.session_state:
        st.session_state.audio_array = np.array([])
    if "audio_base64" not in st.session_state:
        st.session_state.audio_base64 = ''
    if "default_instruction" not in st.session_state:
        st.session_state.default_instruction = ""
    for flag in ("on_record", "on_upload", "on_select"):
        if flag not in st.session_state:
            st.session_state[flag] = False

    col1, col2, col3 = st.columns(3)

    with col1:
        st.markdown("**Record Audio:**")

        recording = mic_recorder(
            format="wav",
            use_container_width=True,
            callback=lambda: st.session_state.update(on_record=True, messages=[]),
            key='record')

        if recording and st.session_state.on_record:
            _store_audio(recording["bytes"])

    with col2:
        uploaded_file = st.file_uploader(
            label="**Upload Audio:**",
            type=['wav', 'mp3'],
            on_change=lambda: st.session_state.update(on_upload=True, messages=[]),
            key='upload'
        )

        if uploaded_file and st.session_state.on_upload:
            _store_audio(uploaded_file.read())

    with col3:
        sample_name = st.selectbox(
            label="**Select Audio:**",
            options=list(_AUDIO_SAMPLES_W_INSTRUCT),
            index=None,
            placeholder="Select an audio sample:",
            on_change=lambda: st.session_state.update(on_select=True, messages=[]),
            key='select')

        if sample_name and st.session_state.on_select:
            # Context manager so the file handle is closed deterministically.
            with open(f"audio_samples/{sample_name}.wav", "rb") as sample_file:
                audio_bytes = sample_file.read()
            st.session_state.default_instruction = _AUDIO_SAMPLES_W_INSTRUCT[sample_name]
            _store_audio(audio_bytes)

    st.write(st.session_state.default_instruction)
    st.audio(st.session_state.audio_array, format="audio/wav", sample_rate=16000)
    # The flags are one-shot: clear them so the next rerun (e.g. a chat
    # submission) does not re-process the same widget value.
    st.session_state.update(on_upload=False, on_record=False, on_select=False)

    # CSS override targeting an auto-generated Streamlit class name.
    # NOTE(review): presumably this right-aligns the user chat bubble; the
    # class name may change between Streamlit versions -- confirm.
    st.markdown(
        """
        <style>
            .st-emotion-cache-1c7y2kd {
                flex-direction: row-reverse;
                text-align: right;
            }
        </style>
    
        """,
        unsafe_allow_html=True,
    )

    if "messages" not in st.session_state:
        st.session_state.messages = []

    # NOTE(review): messages are appended to session state below but are not
    # re-rendered anywhere in this function, so only the latest exchange is
    # visible after a rerun -- confirm this is intended.
    if prompt := st.chat_input(placeholder="Your Instruction"):
        with st.chat_message("user"):
            st.write(prompt)
        st.session_state.messages.append({"role": "user", "content": prompt})

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                try:
                    stream = generate_response(prompt, st.session_state.audio_base64)
                    response = st.write_stream(stream)
                except Exception as e:
                    # Surface the failure to the user, then re-raise the
                    # active exception.
                    st.write(f"Caught Exception: {repr(e)}. Please contact the administrator to restart this space.")
                    raise
                else:
                    st.session_state.messages.append({"role": "assistant", "content": response})