Spaces:

MERaLiON
/

MERaLiON-AudioLLM

Running

App Files Files Community

YingxuHe committed 12 days ago

Commit

cf9d671

1 Parent(s): 5e8f8a6

add mic button

Browse files

Files changed (8) hide show

app.py +36 -11
requirements.txt +0 -2
src/exceptions.py +6 -0
src/generation.py +138 -0
pages.py → src/pages.py +59 -52
src/tunnel.py +60 -0
utils.py → src/utils.py +1 -159
style/{sidebar_style.css → app_style.css} +31 -2

app.py CHANGED Viewed

@@ -1,17 +1,42 @@
-import os
 import streamlit as st
-import streamlit.components.v1 as components
-from pages import *
-def main():
-    ## Set Streamlit configuration
-    st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
-    st.markdown('<style>' + open('./style/sidebar_style.css').read() + '</style>', unsafe_allow_html=True)
-    audio_llm()
-if __name__ == '__main__':
-    main()

+import copy
 import streamlit as st
+from src.tunnel import start_server
+from src.generation import FIXED_GENERATION_CONFIG, load_model
+from src.pages import DEFAULT_DIALOGUE_STATES, sidebar_fragment, specify_audio_fragment, conversation_section
+st.set_page_config(page_title='MERaLiON-AudioLLM', page_icon = "🔥", layout='wide')
+st.markdown('<style>' + open('./style/app_style.css').read() + '</style>', unsafe_allow_html=True)
+if "server" not in st.session_state:
+    st.session_state.server = start_server()
+if "client" not in st.session_state or 'model_name' not in st.session_state:
+    st.session_state.client, st.session_state.model_name = load_model()
+for key, value in FIXED_GENERATION_CONFIG.items():
+    if key not in st.session_state:
+        st.session_state[key]=copy.deepcopy(value)
+for key, value in DEFAULT_DIALOGUE_STATES.items():
+    if key not in st.session_state:
+        st.session_state[key]=copy.deepcopy(value)
+with st.sidebar:
+    sidebar_fragment()
+if st.sidebar.button('Clear History'):
+    st.session_state.update(copy.deepcopy(DEFAULT_DIALOGUE_STATES))
+st.markdown("<h1 style='text-align: center;'>MERaLiON-AudioLLM Demo 🤖</h1>", unsafe_allow_html=True)
+st.markdown(
+    """This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION),
+    developed by I2R, A*STAR, in collaboration with AISG, Singapore.
+    It is tailored for Singapore’s multilingual and multicultural landscape."""
+)
+specify_audio_fragment()
+conversation_section()

requirements.txt CHANGED Viewed

@@ -1,7 +1,5 @@
 librosa==0.10.2.post1
 streamlit==1.40.2
-streamlit-on-Hover-tabs==1.0.1
 openai==1.57.1
 streamlit_mic_recorder==0.0.8
-streamlit-server-state==0.18.2
 sshtunnel

 librosa==0.10.2.post1
 streamlit==1.40.2
 openai==1.57.1
 streamlit_mic_recorder==0.0.8
 sshtunnel

src/exceptions.py ADDED Viewed

	@@ -0,0 +1,6 @@

+class NoAudioException(Exception):
+    """Raised when a request is attempted while the audio input is empty."""
+    pass
+class TunnelNotRunningException(Exception):
+    """Raised when the API connection fails, the tunnel is not running, and no retries remain."""
+    pass

src/generation.py ADDED Viewed

	@@ -0,0 +1,138 @@

+import os
+import re
+import time
+from typing import List
+import streamlit as st
+from openai import OpenAI, APIConnectionError
+from src.exceptions import NoAudioException, TunnelNotRunningException
+# Local port of the model server endpoint, read once at import time.
+# LOCAL_PORT must be set: os.getenv returns None otherwise and int(None) raises TypeError.
+local_port = int(os.getenv('LOCAL_PORT'))
+# Default generation settings. app.py copies these into st.session_state, and the
+# request code reads them back from there (max_completion_tokens, top_k,
+# length_penalty, seed).
+FIXED_GENERATION_CONFIG = dict(
+    max_completion_tokens=1024,
+    top_k=50,
+    length_penalty=1.0,
+    seed=42
+)
+def load_model():
+    """
+    Create an OpenAI client with connection to vllm server.
+    """
+    openai_api_key = os.getenv('API_KEY')
+    openai_api_base = f"http://localhost:{local_port}/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    models = client.models.list()
+    model_name = models.data[0].id
+    return client, model_name
+def _retrive_response(text_input: str, base64_audio_input: str, **kwargs):
+    """
+    Send request through OpenAI client.
+    """
+    return st.session_state.client.chat.completions.create(
+        messages=[{
+            "role":
+            "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": f"Text instruction: {text_input}"
+                },
+                {
+                    "type": "audio_url",
+                    "audio_url": {
+                        "url": f"data:audio/ogg;base64,{base64_audio_input}"
+                    },
+                },
+            ],
+        }],
+        **kwargs
+    )
+def _retry_retrive_response_throws_exception(text_input, base64_audio_input, stream=False, retry=3):
+    if not base64_audio_input:
+        raise NoAudioException("audio is empty.")
+    try:
+        response_object = _retrive_response(
+            text_input=text_input,
+            base64_audio_input=base64_audio_input,
+            model=st.session_state.model_name,
+            max_completion_tokens=st.session_state.max_completion_tokens,
+            temperature=st.session_state.temperature,
+            top_p=st.session_state.top_p,
+            extra_body={
+                "repetition_penalty": st.session_state.repetition_penalty,
+                "top_k": st.session_state.top_k,
+                "length_penalty": st.session_state.length_penalty
+            },
+            seed=st.session_state.seed,
+            stream=stream
+        )
+    except APIConnectionError as e:
+        if not st.session_state.server.is_running():
+            if retry == 0:
+                raise TunnelNotRunningException()
+            st.toast(f":warning: Internet connection is down. Trying to re-establish connection ({retry}).")
+            if st.session_state.server.is_down():
+                st.session_state.server.restart()
+            elif st.session_state.server.is_starting():
+                time.sleep(2)
+            return _retry_retrive_response_throws_exception(text_input, retry-1)
+        raise e
+    return response_object
+def _validate_text_input(text_input) -> List[str]:
+    """
+    TODO: improve the input validation regex.
+    """
+    warnings = []
+    if re.search("tool|code|python|java|math|calculate", text_input):
+        warnings.append("WARNING: MERaLiON-AudioLLM is not intended for use in tool calling, math, and coding tasks.")
+    if re.search(r'[\u4e00-\u9fff]+', text_input):
+        warnings.append("NOTE: Please try to prompt in English for the best performance.")
+    return warnings
+def retrive_response(text_input, base64_audio_input, stream=False):
+    warnings = _validate_text_input(text_input)
+    response_object, error_msg = None, ""
+    try:
+        response_object = _retry_retrive_response_throws_exception(
+            text_input, base64_audio_input, stream
+        )
+    except NoAudioException:
+        error_msg = "Please specify audio first!"
+    except TunnelNotRunningException:
+        error_msg = "Internet connection cannot be established. Please contact the administrator."
+    except Exception as e:
+        error_msg = f"Caught Exception: {repr(e)}. Please contact the administrator."
+    return error_msg, warnings, response_object
+def postprocess_voice_transcription(text):
+    text = re.sub("<.*>:?|\(.*\)|\[.*\]", "", text)
+    text = re.sub("\s+", " ", text).strip()
+    return text

pages.py → src/pages.py RENAMED Viewed

@@ -3,19 +3,17 @@ import base64
 import numpy as np
 import streamlit as st
-from utils import (
     GENERAL_INSTRUCTIONS,
     AUDIO_SAMPLES_W_INSTRUCT,
-    NoAudioException,
-    TunnelNotRunningException,
-    retry_generate_response,
-    load_model,
     bytes_to_array,
     array_to_bytes,
-    start_server,
 )
 DEFAULT_DIALOGUE_STATES = dict(
     default_instruction=[],
     audio_base64='',
@@ -23,14 +21,18 @@ DEFAULT_DIALOGUE_STATES = dict(
     disprompt = False,
     new_prompt = "",
     messages=[],
     on_select=False,
     on_upload=False,
     on_record=False,
-    on_click_button = False
 )
 MAX_AUDIO_LENGTH = 120
 def _update_audio(audio_bytes):
     origin_audio_array = bytes_to_array(audio_bytes)
     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
@@ -141,7 +143,42 @@ def specify_audio_fragment():
         st.rerun(scope="app")
-def dialogue_section():
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             if message.get("error"):
@@ -151,12 +188,8 @@ def dialogue_section():
             if message.get("content"):
                 st.write(message["content"])
-    if chat_input := st.chat_input(
-        placeholder="Type Your Instruction Here",
-        disabled=st.session_state.disprompt,
-        on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
-    ):
-        st.session_state.new_prompt = chat_input
     if one_time_prompt := st.session_state.new_prompt:
         st.session_state.update(new_prompt="", messages=[])
@@ -167,15 +200,17 @@ def dialogue_section():
         with st.chat_message("assistant"):
             with st.spinner("Thinking..."):
-                error_msg, warnings, response = "", [], ""
-                try:
-                    response, warnings = retry_generate_response(one_time_prompt)
-                except NoAudioException:
-                    error_msg = "Please specify audio first!"
-                except TunnelNotRunningException:
-                    error_msg = "Internet connection cannot be established. Please contact the administrator."
-                except Exception as e:
-                    error_msg = f"Caught Exception: {repr(e)}. Please contact the administrator."
         st.session_state.messages.append({
             "role": "assistant",
             "error": error_msg,
@@ -184,32 +219,4 @@ def dialogue_section():
         })
         st.session_state.disprompt=False
-        st.rerun(scope="app")
-def audio_llm():
-    if "server" not in st.session_state:
-        st.session_state.server = start_server()
-    if "client" not in st.session_state or 'model_name' not in st.session_state:
-        st.session_state.client, st.session_state.model_name = load_model()
-    for key, value in DEFAULT_DIALOGUE_STATES.items():
-        if key not in st.session_state:
-            st.session_state[key]=copy.deepcopy(value)
-    with st.sidebar:
-        sidebar_fragment()
-    if st.sidebar.button('Clear History'):
-        st.session_state.update(DEFAULT_DIALOGUE_STATES)
-    st.markdown("<h1 style='text-align: center;'>MERaLiON-AudioLLM Demo 🤖</h1>", unsafe_allow_html=True)
-    st.markdown(
-        """This demo is based on [MERaLiON-AudioLLM](https://huggingface.co/MERaLiON/MERaLiON-AudioLLM-Whisper-SEA-LION),
-        developed by I2R, A*STAR, in collaboration with AISG, Singapore.
-        It is tailored for Singapore’s multilingual and multicultural landscape."""
-    )
-    specify_audio_fragment()
-    dialogue_section()

 import numpy as np
 import streamlit as st
+from streamlit_float import *
+from src.generation import retrive_response, postprocess_voice_transcription
+from src.utils import (
     GENERAL_INSTRUCTIONS,
     AUDIO_SAMPLES_W_INSTRUCT,
     bytes_to_array,
     array_to_bytes,
 )
 DEFAULT_DIALOGUE_STATES = dict(
     default_instruction=[],
     audio_base64='',
     disprompt = False,
     new_prompt = "",
     messages=[],
+    voice_instruction="",
     on_select=False,
     on_upload=False,
     on_record=False,
+    on_click_button=False,
+    on_record_voice=False
 )
 MAX_AUDIO_LENGTH = 120
 def _update_audio(audio_bytes):
     origin_audio_array = bytes_to_array(audio_bytes)
     truncated_audio_array = origin_audio_array[: MAX_AUDIO_LENGTH*16000]
         st.rerun(scope="app")
+def bottom_input_section():
+    bottom_cols = st.columns([0.02, 0.98])
+    uploaded_file = bottom_cols[0].audio_input(
+        label="voice",
+        label_visibility="collapsed",
+        disabled=st.session_state.disprompt,
+        on_change=lambda: st.session_state.update(on_record_voice=True),
+        key='voice'
+    )
+    if uploaded_file and st.session_state.on_record_voice:
+        audio_bytes = uploaded_file.read()
+        audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
+        error_msg, warnings, completion = retrive_response(
+            "Write out the dialogue as text.", audio_base64, stream=False)
+        if error_msg:
+            st.toast(error_msg, icon="🚨")
+        for warning_msg in warnings:
+            st.toast(warning_msg, icon="❗")
+        st.session_state.update(
+            new_prompt = postprocess_voice_transcription(
+                completion.choices[0].message.content),
+            on_record_voice = False
+        )
+    if chat_input := bottom_cols[1].chat_input(
+        placeholder="Type Your Instruction Here",
+        disabled=st.session_state.disprompt,
+        on_submit=lambda: st.session_state.update(disprompt=True, messages=[])
+    ):
+        st.session_state.new_prompt = chat_input
+def conversation_section():
     for message in st.session_state.messages:
         with st.chat_message(message["role"]):
             if message.get("error"):
             if message.get("content"):
                 st.write(message["content"])
+    with st._bottom:
+        bottom_input_section()
     if one_time_prompt := st.session_state.new_prompt:
         st.session_state.update(new_prompt="", messages=[])
         with st.chat_message("assistant"):
             with st.spinner("Thinking..."):
+                error_msg, warnings, stream = retrive_response(
+                    one_time_prompt, st.session_state.audio_base64, stream=True)
+                response = ""
+                if error_msg:
+                    st.error(error_msg)
+                for warning_msg in warnings:
+                    st.warning(warning_msg)
+                if stream:
+                    response = st.write_stream(stream)
         st.session_state.messages.append({
             "role": "assistant",
             "error": error_msg,
         })
         st.session_state.disprompt=False
+        st.rerun(scope="app")

src/tunnel.py ADDED Viewed

	@@ -0,0 +1,60 @@

+import io
+import os
+import paramiko
+import streamlit as st
+from sshtunnel import SSHTunnelForwarder
+@st.cache_resource()
+def start_server():
+    server = SSHTunnelManager()
+    server.start()
+    return server
+class SSHTunnelManager:
+    def __init__(self):
+        pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))
+        self.server = SSHTunnelForwarder(
+            ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
+            ssh_username="ec2-user",
+            ssh_pkey=pkey,
+            local_bind_address=("127.0.0.1", int(os.getenv('LOCAL_PORT'))),
+            remote_bind_address=("127.0.0.1", 8000)
+        )
+        self._is_starting = False
+        self._is_running = False
+    def update_status(self):
+        if not self._is_starting:
+            self.server.check_tunnels()
+            self._is_running = list(self.server.tunnel_is_up.values())[0]
+        else:
+            self._is_running = False
+    def is_starting(self):
+        self.update_status()
+        return self._is_starting
+    def is_running(self):
+        self.update_status()
+        return self._is_running
+    def is_down(self):
+        self.update_status()
+        return (not self._is_running) and (not self._is_starting)
+    def start(self, *args, **kwargs):
+        if not self._is_starting:
+            self._is_starting = True
+            self.server.start(*args, **kwargs)
+            self._is_starting = False
+    def restart(self, *args, **kwargs):
+        if not self._is_starting:
+            self._is_starting = True
+            self.server.restart(*args, **kwargs)
+            self._is_starting = False

utils.py → src/utils.py RENAMED Viewed

@@ -1,16 +1,7 @@
 import io
-import os
-import re
-import time
 from scipy.io.wavfile import write
 import librosa
-import paramiko
-import streamlit as st
-from openai import OpenAI, APIConnectionError
-from sshtunnel import SSHTunnelForwarder
-local_port = int(os.getenv('LOCAL_PORT'))
 GENERAL_INSTRUCTIONS = [
@@ -77,156 +68,6 @@ AUDIO_SAMPLES_W_INSTRUCT = {
 }
-class NoAudioException(Exception):
-    pass
-class TunnelNotRunningException(Exception):
-    pass
-class SSHTunnelManager:
-    def __init__(self):
-        pkey = paramiko.RSAKey.from_private_key(io.StringIO(os.getenv('PRIVATE_KEY')))
-        self.server = SSHTunnelForwarder(
-            ssh_address_or_host=os.getenv('SERVER_DNS_NAME'),
-            ssh_username="ec2-user",
-            ssh_pkey=pkey,
-            local_bind_address=("127.0.0.1", local_port),
-            remote_bind_address=("127.0.0.1", 8000)
-        )
-        self._is_starting = False
-        self._is_running = False
-    def update_status(self):
-        if not self._is_starting:
-            self.server.check_tunnels()
-            self._is_running = list(self.server.tunnel_is_up.values())[0]
-        else:
-            self._is_running = False
-    def is_starting(self):
-        self.update_status()
-        return self._is_starting
-    def is_running(self):
-        self.update_status()
-        return self._is_running
-    def is_down(self):
-        self.update_status()
-        return (not self._is_running) and (not self._is_starting)
-    def start(self, *args, **kwargs):
-        if not self._is_starting:
-            self._is_starting = True
-            self.server.start(*args, **kwargs)
-            self._is_starting = False
-    def restart(self, *args, **kwargs):
-        if not self._is_starting:
-            self._is_starting = True
-            self.server.restart(*args, **kwargs)
-            self._is_starting = False
-@st.cache_resource()
-def start_server():
-    server = SSHTunnelManager()
-    server.start()
-    return server
-def load_model():
-    openai_api_key = os.getenv('API_KEY')
-    openai_api_base = f"http://localhost:{local_port}/v1"
-    client = OpenAI(
-        api_key=openai_api_key,
-        base_url=openai_api_base,
-    )
-    models = client.models.list()
-    model_name = models.data[0].id
-    return client, model_name
-def generate_response(text_input):
-    if not st.session_state.audio_base64:
-        raise NoAudioException("audio is empty.")
-    warnings = []
-    if re.search("tool|code|python|java|math|calculate", text_input):
-        warnings.append("WARNING: MERaLiON-AudioLLM is not intended for use in tool calling, math, and coding tasks.")
-    if re.search(r'[\u4e00-\u9fff]+', text_input):
-        warnings.append("NOTE: Please try to prompt in English for the best performance.")
-    try:
-        stream = st.session_state.client.chat.completions.create(
-            messages=[{
-                "role":
-                "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"Text instruction: {text_input}"
-                    },
-                    {
-                        "type": "audio_url",
-                        "audio_url": {
-                            "url": f"data:audio/ogg;base64,{st.session_state.audio_base64}"
-                        },
-                    },
-                ],
-            }],
-            model=st.session_state.model_name,
-            max_completion_tokens=1024,
-            temperature=st.session_state.temperature,
-            top_p=st.session_state.top_p,
-            extra_body={
-                "repetition_penalty": st.session_state.repetition_penalty,
-                "top_k": 50,
-                "length_penalty": 1.0
-            },
-            seed=42,
-            stream=True,
-        )
-    except APIConnectionError as e:
-        if not st.session_state.server.is_running():
-            raise TunnelNotRunningException()
-        raise e
-    return stream, warnings
-def retry_generate_response(prompt, retry=3):
-    response, warnings = "", []
-    try:
-        stream, warnings = generate_response(prompt)
-        for warning_msg in warnings:
-            st.warning(warning_msg)
-        response = st.write_stream(stream)
-    except TunnelNotRunningException as e:
-        if retry == 0:
-            raise e
-        st.warning(f"Internet connection is down. Trying to re-establish connection ({retry}).")
-        if st.session_state.server.is_down():
-            st.session_state.server.restart()
-        elif st.session_state.server.is_starting():
-            time.sleep(2)
-        return retry_generate_response(retry-1)
-    return response, warnings
 def bytes_to_array(audio_bytes):
     audio_array, _ = librosa.load(
         io.BytesIO(audio_bytes),
@@ -234,6 +75,7 @@ def bytes_to_array(audio_bytes):
     )
     return audio_array
 def array_to_bytes(audio_array):
     bytes_wav = bytes()
     byte_io = io.BytesIO(bytes_wav)

 import io
 from scipy.io.wavfile import write
 import librosa
 GENERAL_INSTRUCTIONS = [
 }
 def bytes_to_array(audio_bytes):
     audio_array, _ = librosa.load(
         io.BytesIO(audio_bytes),
     )
     return audio_array
 def array_to_bytes(audio_array):
     bytes_wav = bytes()
     byte_io = io.BytesIO(bytes_wav)

style/{sidebar_style.css → app_style.css} RENAMED Viewed

@@ -1,4 +1,4 @@
-.st-emotion-cache-1c7y2kd {
     flex-direction: row-reverse;
     text-align: right;
 }
@@ -21,10 +21,39 @@ div[data-testid="stFileUploaderDropzoneInstructions"]>div>span {
     display:none;
 }
-.st-emotion-cache-1aq2la2 {
     max-height: 3rem;
 }
 [class='stAudio'] {
     max-width: 500px !important;
     margin: auto !important;

+div[data-testid="stChatMessage"]:has(> div[data-testid="stChatMessageAvatarUser"]) {
     flex-direction: row-reverse;
     text-align: right;
 }
     display:none;
 }
+div[data-testid="stMainBlockContainer"] div[data-testid="stAudioInput"]>div {
     max-height: 3rem;
 }
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div {
+    background-color:transparent;
+    /* border:1px solid rgba(49, 51, 63, 0.2); */
+    max-height: 40px;
+    display: block;
+    padding: 0;
+    margin: auto;
+}
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:last-of-type {
+    display:none;
+}
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2) {
+    margin:auto;
+}
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:last-of-type {
+    display:none;
+}
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>div:nth-of-type(2)>span:only-of-type {
+    display:block;
+}
+div[data-testid="stBottomBlockContainer"] div[data-testid="stAudioInput"]>div>span {
+    display:none;
+}
 [class='stAudio'] {
     max-width: 500px !important;
     margin: auto !important;