"""Gradio chat demo that replies only when the user's turn looks finished.

An End-of-Utterance (EOU) detector scores the conversation (including the
message just typed). When the score reaches the configured threshold the
message is forwarded to the chat model and the answer is streamed back;
otherwise the user is asked to keep typing.
"""

import re
import string

import gradio as gr
import torch
from huggingface_hub import InferenceClient
from transformers import AutoModelForCausalLM, AutoTokenizer

# Inference client for the model that writes the replies.
client = InferenceClient("Qwen/Qwen2.5-3B-Instruct")

# Tokenizer and model used for EOU detection.
tokenizer = AutoTokenizer.from_pretrained("livekit/turn-detector")
model = AutoModelForCausalLM.from_pretrained("livekit/turn-detector")

# Token whose next-token probability is read as "the utterance is complete".
EOU_TOKEN = "<|im_end|>"

# Compiled once; normalize_text runs for every message on every request.
_PUNCTUATION_RE = re.compile(f"[{re.escape(string.punctuation)}]")
_WHITESPACE_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Lowercase *text*, drop ASCII punctuation and collapse whitespace.

    Args:
        text: Raw message text.

    Returns:
        The normalized text with no leading or trailing whitespace.
    """
    text = _PUNCTUATION_RE.sub("", text.lower())
    # Strip last: removing punctuation at the edges (e.g. "hello .") would
    # otherwise leave stray leading/trailing spaces behind.
    return _WHITESPACE_RE.sub(" ", text).strip()


def compute_eou_probability(chat_ctx: list[dict[str, str]], max_tokens: int = 512) -> float:
    """Return the probability that the conversation ends with a finished utterance.

    The contents of *chat_ctx* are normalized, joined into one sequence and fed
    to the detector model; the result is the probability the model assigns to
    ``<|im_end|>`` as the next token.

    Args:
        chat_ctx: Chat messages in order; only the ``"content"`` value of each
            entry is used. Entries with empty content are skipped.
        max_tokens: Maximum number of context tokens given to the detector.
            When the conversation is longer, the most recent tokens are kept.

    Returns:
        A float in ``[0, 1]``.

    Raises:
        ValueError: If *max_tokens* is not positive, or if the tokenizer has
            no ``<|im_end|>`` token.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer.")

    eou_token_id = tokenizer.convert_tokens_to_ids(EOU_TOKEN)
    if eou_token_id is None or eou_token_id == tokenizer.unk_token_id:
        raise ValueError("EOU token '<|im_end|>' not found in tokenizer vocabulary.")

    conversation = ["Assistant ready to help."]  # Fixed system line.
    for msg in chat_ctx:
        normalized_content = normalize_text(msg.get("content", "") or "")
        if normalized_content:
            conversation.append(normalized_content)

    # Encode the conversation as ONE sequence. Passing the list itself would
    # create a padded batch, and reading row 0 would score only the fixed
    # system line, never the user's text.
    # NOTE(review): turns are joined with newlines; if the detector expects
    # its chat template (tokenizer.apply_chat_template), switch to that —
    # confirm against the model card.
    inputs = tokenizer("\n".join(conversation), return_tensors="pt")

    # Keep the newest tokens: the end of the conversation is what decides
    # whether the current utterance is complete.
    inputs = {name: tensor[:, -max_tokens:] for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits[0, -1, :]
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    return probabilities[eou_token_id].item()


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    eou_threshold: float = 0.2,  # Default EOU threshold
):
    """Stream an assistant reply when the user's message looks complete.

    Args:
        message: The text the user just submitted.
        history: Previous ``(user, assistant)`` turns.
        system_message: System prompt sent to the chat model.
        max_tokens: Maximum number of new tokens to generate.
        temperature: Sampling temperature for generation.
        top_p: Nucleus-sampling probability mass for generation.
        eou_threshold: Minimum EOU probability required to answer.

    Yields:
        The reply accumulated so far while streaming, or a single
        "waiting" notice when the EOU probability is below the threshold.
    """
    messages = [{"role": "system", "content": system_message}]
    for turn in history:
        if turn[0]:
            messages.append({"role": "user", "content": turn[0]})
        if turn[1]:
            messages.append({"role": "assistant", "content": turn[1]})

    # The current message must be part of the scored context: it is the
    # utterance whose completeness is being judged.
    messages.append({"role": "user", "content": message})

    # The detector keeps its own context limit; `max_tokens` here is the
    # generation budget and can be as small as 1.
    eou_probability = compute_eou_probability(messages)
    print(eou_probability)

    if eou_probability < eou_threshold:
        # Let the user continue typing if the EOU probability is low.
        print("Waiting for user to finish... Please continue.")
        yield "Waiting for user to finish... Please continue."
        return

    response = ""
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # Stream chunks may carry no text (e.g. the final one).
            response += token
            yield response


# Gradio UI
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are helpful assistant", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
        gr.Slider(
            minimum=0.0, maximum=1.0, value=0.7, step=0.05, label="EOU Threshold"
        ),  # EOU threshold slider
    ],
)

if __name__ == "__main__":
    demo.launch()