import os

import numpy as np
from fastapi import FastAPI, Response
from huggingface_hub import InferenceClient
from kokoro import KPipeline

def llm_chat_response(text):
    """Ask the LLM for a one-line description of the input text."""
    HF_TOKEN = os.getenv("HF_TOKEN")
    client = InferenceClient(api_key=HF_TOKEN)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    # Leading space keeps the instruction from fusing with the user's text
                    "text": text + " describe in one line only"
                }
                # To exercise the vision side of the model, append an image part:
                # {
                #     "type": "image_url",
                #     "image_url": {
                #         "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
                #     }
                # }
            ]
        }
    ]
    response_from_llama = client.chat.completions.create(
        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
        messages=messages,
        max_tokens=500
    )
    return response_from_llama.choices[0].message.content

app = FastAPI()

# Initialize the Kokoro pipeline once at startup ('a' selects American English)
pipeline = KPipeline(lang_code='a')

@app.post("/generate")
async def generate_audio(text: str, voice: str = "af_heart", speed: float = 1.0):
text_reply = llm_chat_response(text)
# Generate audio
generator = pipeline(
text_reply,
voice=voice,
speed=speed,
split_pattern=r'\n+'
)
    # Process only the first segment for demo
    for i, (gs, ps, audio) in enumerate(generator):
        # Convert the PyTorch tensor to a NumPy array
        audio_numpy = audio.cpu().numpy()

        # Convert to 16-bit PCM: clamp to [-1, 1], then scale to the int16 range
        audio_numpy = np.clip(audio_numpy, -1, 1)
        pcm_data = (audio_numpy * 32767).astype(np.int16)

        # Serialize to raw bytes (native, i.e. little-endian, byte order)
        raw_audio = pcm_data.tobytes()

        # Return raw PCM, describing its format in custom headers
        return Response(
            content=raw_audio,
            media_type="application/octet-stream",
            headers={
                "Content-Disposition": 'attachment; filename="output.pcm"',
                "X-Sample-Rate": "24000",
                "X-Bits-Per-Sample": "16",
                "X-Endianness": "little"
            }
        )

    return Response("No audio generated", status_code=400)