Spaces:

joe-muller
/

livekit-turn-detector

Running

App Files Files Community

livekit-turn-detector / app.py

jtmuller

Update Space

00407e6 22 days ago

raw

history blame contribute delete

5.41 kB

	import sys
	import onnxruntime as ort
	import numpy as np
	import string

	# Transformers, HuggingFace Hub, and Gradio
	from transformers import AutoTokenizer
	import gradio as gr
	from huggingface_hub import InferenceClient

	# ------------------------------------------------
	# Turn Detector Configuration
	# ------------------------------------------------
	# Hugging Face Hub repo id that the turn-detector tokenizer is loaded from.
	HG_MODEL = "livekit/turn-detector" # or your HF model repo
	# ONNX model file opened by the ONNX Runtime inference session.
	ONNX_FILENAME = "model_quantized.onnx" # path to your ONNX file
	# Upper bound on the number of tokens fed to the model; it is passed to the
	# tokenizer as max_length together with truncation=True.
	MAX_HISTORY_TOKENS = 512
	# Punctuation characters that normalize_text deletes. The apostrophe is
	# removed from this set so it is kept in the text (e.g. in contractions).
	PUNCS = string.punctuation.replace("'", "")

	# ------------------------------------------------
	# Utility functions
	# ------------------------------------------------


	def softmax(logits: np.ndarray, axis=None) -> np.ndarray:
	    """Return the softmax of *logits*.

	    The maximum is subtracted before exponentiating so that large logits do
	    not overflow; the shift cancels out in the division.

	    Args:
	        logits: Array (or array-like) of raw scores.
	        axis: Axis along which to normalise. ``None`` (the default)
	            normalises over every element of the array, which is what this
	            helper did before the parameter existed.

	    Returns:
	        An array with the same shape as *logits* whose entries are
	        non-negative and sum to 1 along *axis*.
	    """
	    logits = np.asarray(logits)
	    # keepdims=True lets the reduced values broadcast back against the input.
	    shifted = logits - np.max(logits, axis=axis, keepdims=True)
	    exp_logits = np.exp(shifted)
	    return exp_logits / np.sum(exp_logits, axis=axis, keepdims=True)


	def normalize_text(text: str) -> str:
	    """Return *text* lowercased, without punctuation, and single-spaced.

	    Every character listed in ``PUNCS`` is deleted, and each run of
	    whitespace (including leading and trailing whitespace) collapses so that
	    the remaining words are separated by exactly one space.
	    """
	    # With two empty mappings, the third argument lists characters to delete.
	    delete_puncs = str.maketrans("", "", PUNCS)
	    without_puncs = text.translate(delete_puncs)
	    words = without_puncs.lower().split()
	    return " ".join(words)


	def calculate_eou(chat_ctx, session, tokenizer) -> float:
	    """Return the probability that the user has finished speaking.

	    Args:
	        chat_ctx: Conversation as a list of dicts with 'role' and 'content'
	            keys. Only 'user' and 'assistant' messages are scored.
	        session: Inference session; it is run with an ``input_ids`` input and
	            asked for its ``logits`` output.
	        tokenizer: Tokenizer used to encode the conversation text and to look
	            up the id of the ``<|im_end|>`` token.

	    Returns:
	        The softmax probability of ``<|im_end|>`` at the last position of the
	        encoded conversation, as a plain ``float``. ``0.0`` is returned when
	        no text is left to score after normalization.
	    """
	    # Normalize each user/assistant message; drop the ones that end up empty.
	    normalized_ctx = []
	    for msg in chat_ctx:
	        if msg["role"] not in ("user", "assistant"):
	            continue
	        content = normalize_text(msg["content"])
	        if content:
	            normalized_ctx.append(content)

	    # An empty or punctuation-only conversation leaves nothing to tokenize,
	    # so there is no last position to read logits from. Report "not finished"
	    # instead of failing inside the model call.
	    if not normalized_ctx:
	        return 0.0

	    # Join the messages into one input string.
	    text = " ".join(normalized_ctx)
	    inputs = tokenizer(
	        text,
	        return_tensors="np",
	        truncation=True,
	        max_length=MAX_HISTORY_TOKENS,
	    )
	    input_ids = np.array(inputs["input_ids"], dtype=np.int64)

	    # Run inference and keep the logits of the last position of the first
	    # batch item (assumes logits are laid out as batch, sequence, vocab).
	    outputs = session.run(["logits"], {"input_ids": input_ids})
	    logits = outputs[0][0, -1, :]
	    probs = softmax(logits)

	    # The literal must be exactly "<|im_end|>": with backslashes in it the
	    # text is not the special token and the last encoded id would be wrong.
	    eou_token_id = tokenizer.encode("<|im_end|>")[-1]
	    # Convert the NumPy scalar so the declared float return type holds.
	    return float(probs[eou_token_id])


	# ------------------------------------------------
	# Load ONNX session & tokenizer once
	# ------------------------------------------------
	print("Loading ONNX model session...")
	onnx_session = ort.InferenceSession(
	    ONNX_FILENAME,
	    providers=["CPUExecutionProvider"],
	)

	print("Loading tokenizer...")
	# The conversation is tokenized with truncation=True and the model's logits
	# are read at the last position, so the newest text must survive truncation.
	# The default truncates on the right, which would cut the newest text off;
	# truncating on the left drops the oldest text instead.
	turn_detector_tokenizer = AutoTokenizer.from_pretrained(
	    HG_MODEL,
	    truncation_side="left",
	)

	# ------------------------------------------------
	# HF InferenceClient for text generation (example)
	# ------------------------------------------------
	# Hugging Face Hub InferenceClient pointed at an example text-generation model.
	# NOTE(review): nothing else in this file, as shown, references `client`;
	# confirm it is still needed before removing it.
	client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
	# Adjust above to any other endpoint that suits your use case.

	# ------------------------------------------------
	# Gradio Chat Handler
	# ------------------------------------------------


	def respond(message, history, system_message, max_tokens, temperature, top_p):
	    """Reply to a chat message with its end-of-utterance (EOU) probability.

	    This function is called on each new user message in the ChatInterface.

	    Args:
	        message: The new user input.
	        history: Earlier turns of the chat. Currently not used: only the new
	            message is scored.
	        system_message: Text from the system Textbox. It is added to the
	            message list when it is not blank.
	        max_tokens: Slider value. Currently not used, because no text is
	            generated here.
	        temperature: Slider value. Currently not used.
	        top_p: Slider value. Currently not used.

	    Yields:
	        A single string of the form ``[EOU Probability: 0.1234]``.
	    """
	    # Build the messages in the OpenAI-style format:
	    # [{'role': 'system', 'content': ...}, {'role': 'user', 'content': ...}]
	    messages = []
	    # Guard against None as well as blank text before calling .strip().
	    if system_message and system_message.strip():
	        messages.append({"role": "system", "content": system_message})
	    messages.append({"role": "user", "content": message})

	    # Score the new user message with the turn detector.
	    eou_prob = calculate_eou(messages, onnx_session, turn_detector_tokenizer)

	    yield f"[EOU Probability: {eou_prob:.4f}]"


	# ------------------------------------------------
	# Gradio ChatInterface
	# ------------------------------------------------
	# This ChatInterface will have:
	# - A chat box
	# - A system message textbox
	# - 3 sliders for max_tokens, temperature, and top_p
	# The additional inputs are listed in the order in which `respond` receives
	# them after `message` and `history`.
	demo = gr.ChatInterface(
	    fn=respond,
	    additional_inputs=[
	        gr.Textbox(value="You are a friendly Chatbot.", label="System message", lines=2),
	        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
	        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
	        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
	    ],
	)

	# Start the web app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()