TTS_API / app.py
khurrameycon's picture
Update app.py
6f1334b verified
import asyncio
import os

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, Response
from fastapi.responses import FileResponse
from huggingface_hub import InferenceClient
from kokoro import KPipeline
def llm_chat_response(text: str) -> str:
    """Ask the hosted Llama model for a one-line reply to *text*.

    The Hugging Face token is read from the ``HF_TOKEN`` environment variable
    and a new ``InferenceClient`` is built on every call. Any exception raised
    by the client propagates to the caller.

    Args:
        text: The user's prompt.

    Returns:
        The ``content`` of the first choice in the chat-completion response.
    """
    hf_token = os.getenv("HF_TOKEN")
    client = InferenceClient(api_key=hf_token)

    # Put the brevity instruction on its own line. Appending it directly to
    # the user's text fused the prompt's last word with "describe"
    # (e.g. "Parisdescribe in one line only").
    prompt = f"{text}\ndescribe in one line only"

    messages = [
        {
            "role": "user",
            # Content is a list of typed parts; only a single text part is sent.
            "content": [{"type": "text", "text": prompt}],
        }
    ]

    completion = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=messages,
        max_tokens=500,
    )
    # Attribute access is the form shown in the huggingface_hub documentation.
    return completion.choices[0].message.content
# ASGI application object; the route decorator below registers against it.
app = FastAPI()

# Build the Kokoro pipeline a single time at import so every request reuses it.
# NOTE(review): lang_code "a" presumably selects American English - confirm
# against the kokoro documentation.
pipeline = KPipeline(lang_code="a")
def _float_to_pcm16le(samples):
    """Return *samples* (floats, nominally in [-1, 1]) as 16-bit LE PCM bytes."""
    # Clip first so out-of-range samples saturate instead of wrapping around
    # when cast to a 16-bit integer.
    clipped = np.clip(samples, -1, 1)
    # "<i2" pins the byte order to little-endian so the bytes always match the
    # X-Endianness header; plain np.int16 would follow the host's byte order.
    return (clipped * 32767).astype("<i2").tobytes()


@app.post("/generate")
async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
    """Generate speech for an LLM reply to *text* and return it as raw PCM.

    The prompt is sent to ``llm_chat_response``; the reply is fed to the
    module-level ``pipeline`` and the first segment that carries audio is
    returned. Later segments are not synthesized into the response.

    Args:
        text: Prompt forwarded to the LLM. Declared as a plain scalar
            parameter, which FastAPI treats as a query parameter.
        voice: Voice name passed through to the pipeline.
        speed: Speed value passed through to the pipeline.

    Returns:
        A ``Response`` whose body is 16-bit signed little-endian PCM, with the
        format described in ``X-*`` headers, or a 400 response with the body
        ``"No audio generated"`` when there is nothing to return.
    """
    # llm_chat_response is a blocking call; run it in the default executor so
    # the event loop keeps serving other requests while waiting on the LLM.
    loop = asyncio.get_running_loop()
    text_reply = await loop.run_in_executor(None, llm_chat_response, text)

    if text_reply:
        # NOTE(review): synthesis still runs on the event loop. Moving it to a
        # worker thread needs confirmation that the shared pipeline object is
        # safe to call concurrently.
        generator = pipeline(
            text_reply,
            voice=voice,
            speed=speed,
            split_pattern=r'\n+',
        )

        for _gs, _ps, audio in generator:
            if audio is None:
                # Segment produced no waveform; try the next one rather than
                # failing on the tensor conversion below.
                continue

            raw_audio = _float_to_pcm16le(audio.cpu().numpy())

            # Headerless PCM: the client needs these headers to interpret it.
            return Response(
                content=raw_audio,
                media_type="application/octet-stream",
                headers={
                    "Content-Disposition": 'attachment; filename="output.pcm"',
                    "X-Sample-Rate": "24000",
                    "X-Bits-Per-Sample": "16",
                    "X-Endianness": "little",
                },
            )

    return Response("No audio generated", status_code=400)