Spaces:

beyoru
/

qew

Running

App Files Files Community

qew / app.py

beyoru

Update app.py

4f81850 verified 14 days ago

raw

history blame

3.47 kB

	import gradio as gr
	from transformers import AutoTokenizer
	import onnxruntime as ort
	import numpy as np
	import string
	from huggingface_hub import InferenceClient
	import os

	# Initialize Qwen client.
	# The access token has to be passed by keyword: the first positional
	# parameter of InferenceClient is the model id, so a positional token would
	# be treated as a model name and never used for authentication.
	qwen_client = InferenceClient(token=os.environ.get("HF_TOKEN"))

	# Model and ONNX setup
	HG_MODEL = "livekit/turn-detector"  # Hub repo the tokenizer is loaded from
	ONNX_FILENAME = "model_quantized.onnx"  # path handed to onnxruntime below
	PUNCS = string.punctuation.replace("'", "")  # ASCII punctuation minus the apostrophe
	MAX_HISTORY = 4  # how many of the most recent entries are kept as context
	MAX_HISTORY_TOKENS = 512  # token budget for the turn-detector input
	EOU_THRESHOLD = 0.5  # end-of-utterance probability below which no reply is generated

	# Initialize ONNX model
	tokenizer = AutoTokenizer.from_pretrained(HG_MODEL)
	onnx_session = ort.InferenceSession(ONNX_FILENAME, providers=["CPUExecutionProvider"])

	def softmax(logits):
	    """Return the softmax of ``logits`` taken over all of its elements.

	    The global maximum is subtracted before exponentiating so that large
	    logits do not overflow; the shift does not change the result.
	    """
	    shifted = logits - np.max(logits)
	    weights = np.exp(shifted)
	    return weights / np.sum(weights)

	def normalize_text(text):
	    """Lower-case ``text``, delete punctuation and collapse whitespace.

	    Every character listed in ``PUNCS`` is removed, the remainder is
	    lower-cased, and runs of whitespace become single spaces with no
	    leading or trailing space.
	    """
	    deletion_table = str.maketrans("", "", PUNCS)
	    words = text.translate(deletion_table).lower().split()
	    return " ".join(words)

	def format_chat_ctx(chat_ctx):
	    """Render a conversation as the text prompt scored by the turn detector.

	    Only ``user`` and ``assistant`` messages are kept, their content is
	    normalized with ``normalize_text``, and messages whose normalized
	    content is empty are dropped. The result of the tokenizer's chat
	    template is then cut just before the last ``<|im_end|>`` marker, so the
	    final message is left without its end-of-turn token.

	    Args:
	        chat_ctx: Iterable of message dicts with ``"role"`` and
	            ``"content"`` keys. The dicts are not modified.

	    Returns:
	        The templated conversation text up to (not including) the last
	        ``<|im_end|>`` marker, or the full text if no marker is present.
	    """
	    new_chat_ctx = []
	    for msg in chat_ctx:
	        if msg["role"] not in ("user", "assistant"):
	            continue
	        content = normalize_text(msg["content"])
	        if content:
	            # Build a new dict rather than writing into ``msg``: the caller's
	            # message objects must keep their original, un-normalized text.
	            new_chat_ctx.append({**msg, "content": content})
	    convo_text = tokenizer.apply_chat_template(
	        new_chat_ctx, add_generation_prompt=False, add_special_tokens=False, tokenize=False
	    )
	    ix = convo_text.rfind("<|im_end|>")
	    # rfind returns -1 on a miss; slicing with it would drop the last character.
	    return convo_text[:ix] if ix != -1 else convo_text

	def calculate_eou(chat_ctx, session):
	    """Return the probability that the conversation's last message is complete.

	    The last ``MAX_HISTORY`` entries of ``chat_ctx`` are formatted with
	    ``format_chat_ctx``, tokenized, and run through ``session``. The logits
	    at the final position are turned into probabilities with ``softmax`` and
	    the probability assigned to the ``<|im_end|>`` token is returned.

	    Args:
	        chat_ctx: List of message dicts (``"role"`` / ``"content"``).
	        session: Object whose ``run(["logits"], {"input_ids": ...})`` returns
	            logits indexable as ``[batch, position, vocab]``.

	    Returns:
	        The end-of-utterance probability, or ``0.0`` when there are no
	        tokens to score.
	    """
	    formatted_text = format_chat_ctx(chat_ctx[-MAX_HISTORY:])
	    inputs = tokenizer(formatted_text, return_tensors="np")
	    input_ids = np.array(inputs["input_ids"], dtype=np.int64)
	    # When over budget keep the *most recent* tokens: the prediction is read
	    # from the last position, so the tail of the conversation must survive.
	    input_ids = input_ids[:, -MAX_HISTORY_TOKENS:]
	    if input_ids.shape[1] == 0:
	        # Nothing to score; report "not finished" instead of failing on [-1].
	        return 0.0
	    outputs = session.run(["logits"], {"input_ids": input_ids})
	    logits = outputs[0][0, -1, :]
	    probs = softmax(logits)
	    eou_token_id = tokenizer.encode("<|im_end|>")[-1]
	    return probs[eou_token_id]

	def respond(
	    message,
	    history: list[tuple[str, str]],
	    max_tokens=2048,
	    temperature=0.6,
	    top_p=0.95,
	):
	    """Stream a chat reply for ``message``, or ask the user to keep typing.

	    A message list is built from the system prompt (``CHARACTER_DESC``
	    environment variable, with a default), the last ``MAX_HISTORY``
	    ``(user, assistant)`` pairs of ``history`` and the new ``message``. If
	    the end-of-utterance probability of that conversation is below
	    ``EOU_THRESHOLD`` a single placeholder string is yielded and no model
	    call is made. Otherwise the chat completion is streamed.

	    Args:
	        message: The new user message.
	        history: Previous turns as ``(user_text, assistant_text)`` pairs.
	        max_tokens: Maximum number of tokens to generate.
	        temperature: Sampling temperature.
	        top_p: Nucleus-sampling probability mass.

	    Yields:
	        The response text accumulated so far (each yielded value replaces
	        the previous one in the chat UI), or the placeholder string.
	    """
	    messages = [{"role": "system", "content": os.environ.get("CHARACTER_DESC", "You are a helpful assistant.")}]

	    for val in history[-MAX_HISTORY:]:
	        if val[0]:
	            messages.append({"role": "user", "content": val[0]})
	        if val[1]:
	            messages.append({"role": "assistant", "content": val[1]})

	    messages.append({"role": "user", "content": message})

	    # Score copies of the messages so that any text normalization performed
	    # while computing the probability cannot alter the prompt sent below.
	    eou_prob = calculate_eou([dict(m) for m in messages], onnx_session)
	    if eou_prob < EOU_THRESHOLD:
	        yield "[Wait... Keep typing...]"
	        return

	    # Generate raw response without any processing
	    stream = qwen_client.chat.completions.create(
	        model="deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
	        messages=messages,
	        max_tokens=int(max_tokens),  # UI sliders may deliver a float
	        temperature=temperature,
	        top_p=top_p,
	        stream=True,
	    )

	    full_response = ""
	    for chunk in stream:
	        # Some stream chunks carry no choices or an empty/None delta.
	        if not chunk.choices:
	            continue
	        delta = chunk.choices[0].delta.content
	        if not delta:
	            continue
	        full_response += delta
	        # Yield the running text: the chat UI shows the latest yielded value
	        # as the whole message, so yielding only the delta would overwrite it.
	        yield full_response

	# Create Gradio interface
	# The additional inputs are passed to ``respond`` after (message, history),
	# in this order: max_tokens, temperature, top_p.
	demo = gr.ChatInterface(
	    respond,
	    additional_inputs=[
	        # step=1 keeps the token limit selectable in whole-number increments.
	        gr.Slider(1, 4096, value=256, step=1, label="Max Tokens"),
	        gr.Slider(0.1, 4.0, value=0.7, label="Temperature"),
	        gr.Slider(0.1, 1.0, value=0.95, label="Top-p"),
	    ],
	)

	if __name__ == "__main__":
	    demo.launch()