Spaces:
Sleeping
Sleeping
File size: 2,822 Bytes
a121edc 5152717 a121edc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
import os
import json
from pathlib import Path
from typing import List
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
import nls
class CosyVoiceSynthesizer:
    """Synthesize speech through the Aliyun NLS streaming TTS SDK.

    Credentials are read from the ``ALIYUN_ACCESS_KEY_ID``,
    ``ALIYUN_ACCESS_KEY_SECRET`` and ``ALIYUN_APP_KEY`` environment
    variables. An access token is requested as part of construction, so
    creating an instance performs a network call.
    """

    def __init__(self) -> None:
        self.access_key_id = os.environ.get('ALIYUN_ACCESS_KEY_ID')
        self.access_key_secret = os.environ.get('ALIYUN_ACCESS_KEY_SECRET')
        self.app_key = os.environ.get('ALIYUN_APP_KEY')
        self.setup_token()

    @staticmethod
    def _extract_token(response):
        """Return ``Token.Id`` from a JSON ``CreateToken`` response.

        Args:
            response: JSON document as ``str``, ``bytes`` or ``bytearray``.

        Raises:
            RuntimeError: If the decoded document has no ``Token.Id`` entry.
            ValueError: If ``response`` is not valid JSON (from ``json.loads``).
        """
        payload = json.loads(response)
        try:
            return payload['Token']['Id']
        except (KeyError, TypeError) as exc:
            # Fail here instead of leaving ``self.token`` unset, which would
            # only surface later as an AttributeError inside ``call``.
            raise RuntimeError(
                f'CreateToken response carries no Token.Id: {payload!r}'
            ) from exc

    def setup_token(self):
        """Request an access token and store it in ``self.token``.

        Issues a ``CreateToken`` POST (API version ``2019-02-28``) against
        ``nls-meta.cn-shanghai.aliyuncs.com``.

        Raises:
            RuntimeError: If the request fails, the response cannot be
                decoded, or the response does not contain a token.
        """
        client = AcsClient(self.access_key_id, self.access_key_secret,
                           'cn-shanghai')
        request = CommonRequest()
        request.set_method('POST')
        request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
        request.set_version('2019-02-28')
        request.set_action_name('CreateToken')
        try:
            response = client.do_action_with_exception(request)
            self.token = self._extract_token(response)
        except Exception as e:
            import traceback
            raise RuntimeError(
                f'Request token failed with error: {e}, with detail {traceback.format_exc()}'
            ) from e

    def call(self, save_file, transcript, voice="longyuan", sample_rate=16000):
        """Synthesize ``transcript`` and write the received audio to ``save_file``.

        Args:
            save_file: Path of the output file; opened in binary write mode,
                so an existing file is truncated.
            transcript: Text handed to the SDK in a single
                ``sendStreamInputTts`` call.
            voice: Voice name forwarded to ``startStreamInputTts``.
            sample_rate: Sample rate forwarded to ``startStreamInputTts``.

        The audio format requested from the service is ``wav``.

        Raises:
            Whatever the SDK constructor or its start/send/stop calls raise;
            the output file is closed before the exception propagates.
        """
        writer = open(save_file, "wb")

        def write_data(data, *args):
            # Drop chunks that arrive after the file has been closed rather
            # than raising ValueError from inside the SDK callback.
            if not writer.closed:
                writer.write(data)

        def raise_error(error, *args):
            # NOTE(review): this is raised from within the SDK's on_error
            # callback; whether it reaches the caller of ``call`` depends on
            # how the SDK dispatches callbacks - confirm.
            raise RuntimeError(
                f'Synthesizing speech failed with error: {error}')

        def close_file(*args):
            # ``close`` is idempotent, so a repeated on_close is harmless.
            writer.close()

        try:
            sdk = nls.NlsStreamInputTtsSynthesizer(
                url='wss://nls-gateway-cn-beijing.aliyuncs.com/ws/v1',
                token=self.token,
                appkey=self.app_key,
                on_data=write_data,
                on_error=raise_error,
                on_close=close_file,
            )
            sdk.startStreamInputTts(voice=voice, sample_rate=sample_rate, aformat='wav')
            sdk.sendStreamInputTts(transcript)
            sdk.stopStreamInputTts()
        except BaseException:
            # On success the file is closed by the on_close callback; on
            # failure that callback may never fire, so release the handle here.
            writer.close()
            raise
class CosyVoiceAgent:
    """Synthesize one WAV file per page using ``CosyVoiceSynthesizer``."""

    def __init__(self, config) -> None:
        # ``call`` reads ``config["call_cfg"]`` and forwards its entries as
        # keyword arguments to ``CosyVoiceSynthesizer.call``.
        self.config = config

    def call(self, pages: List, device: str, save_path: str):
        """Synthesize every entry of ``pages`` into ``save_path``.

        Page ``i`` (1-based) is written to ``<save_path>/p<i>.wav``. The
        output directory is created when it does not exist yet. When
        ``pages`` is empty no synthesizer is constructed and nothing is
        written.

        Args:
            pages: Transcripts, one per page.
            device: Not used by this method; kept for interface
                compatibility.
            save_path: Directory receiving the generated files.

        Returns:
            The dict ``{"modality": "speech"}``.
        """
        output_dir = Path(save_path)
        if pages:
            # Without this, opening the first output file fails on a fresh
            # directory.
            output_dir.mkdir(parents=True, exist_ok=True)
            synthesizer = CosyVoiceSynthesizer()
            call_cfg = self.config["call_cfg"]
            for page_number, page in enumerate(pages, start=1):
                synthesizer.call(
                    save_file=output_dir / f"p{page_number}.wav",
                    transcript=page,
                    **call_cfg
                )
        return {
            "modality": "speech"
        }
|