Xu Xuenan
Multi-GPUs
5152717
raw
history blame
2.82 kB
import os
import json
from pathlib import Path
from typing import List
from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
import nls
class CosyVoiceSynthesizer:
    """Speech synthesizer backed by the Aliyun NLS streaming-input TTS SDK.

    On construction the credentials are read from the environment variables
    ``ALIYUN_ACCESS_KEY_ID``, ``ALIYUN_ACCESS_KEY_SECRET`` and
    ``ALIYUN_APP_KEY``, and an access token is requested immediately via
    :meth:`setup_token`.
    """

    def __init__(self) -> None:
        self.access_key_id = os.environ.get('ALIYUN_ACCESS_KEY_ID')
        self.access_key_secret = os.environ.get('ALIYUN_ACCESS_KEY_SECRET')
        self.app_key = os.environ.get('ALIYUN_APP_KEY')
        self.setup_token()

    @staticmethod
    def _extract_token(payload):
        """Return the token id stored under ``payload['Token']['Id']``.

        Args:
            payload: The decoded JSON body of a ``CreateToken`` response.

        Returns:
            The value found at ``payload['Token']['Id']``.

        Raises:
            RuntimeError: If the payload has no ``Token`` mapping or that
                mapping has no ``Id`` entry.
        """
        token_info = payload.get('Token') if isinstance(payload, dict) else None
        if not isinstance(token_info, dict) or 'Id' not in token_info:
            # Fail here instead of leaving ``self.token`` unset, which would
            # only surface later as an AttributeError inside ``call``.
            raise RuntimeError(
                f'Request token failed: no token id in response {payload!r}')
        return token_info['Id']

    def setup_token(self):
        """Request an access token and store it in ``self.token``.

        Sends a ``CreateToken`` POST request (API version ``2019-02-28``) to
        ``nls-meta.cn-shanghai.aliyuncs.com`` using the stored access key
        pair.

        Raises:
            RuntimeError: If the request or the JSON decoding fails, or if
                the response does not contain a token id.
        """
        client = AcsClient(self.access_key_id, self.access_key_secret,
                           'cn-shanghai')
        request = CommonRequest()
        request.set_method('POST')
        request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
        request.set_version('2019-02-28')
        request.set_action_name('CreateToken')
        try:
            response = client.do_action_with_exception(request)
            payload = json.loads(response)
        except Exception as e:
            import traceback
            raise RuntimeError(
                f'Request token failed with error: {e}, with detail {traceback.format_exc()}'
            ) from e
        self.token = self._extract_token(payload)

    def call(self, save_file, transcript, voice="longyuan", sample_rate=16000):
        """Synthesize ``transcript`` and write the audio to ``save_file``.

        Args:
            save_file: Path of the output file; it is opened in binary write
                mode, so an existing file is truncated.
            transcript: Text passed to the synthesizer.
            voice: Voice name forwarded to ``startStreamInputTts``.
            sample_rate: Sample rate forwarded to ``startStreamInputTts``.

        Returns:
            The audio chunks received through the ``on_data`` callback by the
            time ``stopStreamInputTts`` returns, concatenated into ``bytes``.
            The same data is written to ``save_file``.

        Raises:
            RuntimeError: If the SDK reported an error through ``on_error``.
        """
        writer = open(save_file, "wb")
        chunks = []
        errors = []

        def write_data(data, *args):
            chunks.append(data)
            # The file may already have been closed on a failure path.
            if not writer.closed:
                writer.write(data)

        def raise_error(error, *args):
            # NOTE(review): the SDK may invoke callbacks on its own thread,
            # where this raise would not reach the caller -- confirm. The
            # error is also recorded so ``call`` can re-raise it itself.
            errors.append(error)
            raise RuntimeError(
                f'Synthesizing speech failed with error: {error}')

        def close_file(*args):
            writer.close()

        try:
            sdk = nls.NlsStreamInputTtsSynthesizer(
                url='wss://nls-gateway-cn-beijing.aliyuncs.com/ws/v1',
                token=self.token,
                appkey=self.app_key,
                on_data=write_data,
                on_error=raise_error,
                on_close=close_file,
            )
            sdk.startStreamInputTts(voice=voice, sample_rate=sample_rate,
                                    aformat='wav')
            sdk.sendStreamInputTts(transcript)
            sdk.stopStreamInputTts()
        except BaseException:
            # ``on_close`` may never fire when the SDK itself fails, so make
            # sure the file handle is not leaked.
            writer.close()
            raise
        if errors:
            writer.close()
            raise RuntimeError(
                f'Synthesizing speech failed with error: {errors[0]}')
        return b''.join(chunks)
class CosyVoiceAgent:
    """Agent that turns a list of page transcripts into one WAV file each."""

    def __init__(self, config) -> None:
        """Keep ``config``; its ``"call_cfg"`` entry is read by :meth:`call`."""
        self.config = config

    def call(self, pages: List, device: str, save_path: str):
        """Synthesize speech for every page into ``save_path``.

        Args:
            pages: Transcripts, one per page. Page ``i`` (1-based) is written
                to ``p{i}.wav`` inside ``save_path``.
            device: Accepted but not used by this method.
            save_path: Directory that receives the generated files.

        Returns:
            The dictionary ``{"modality": "speech"}``.
        """
        output_dir = Path(save_path)
        synthesizer = CosyVoiceSynthesizer()
        for page_number, text in enumerate(pages, start=1):
            wav_file = output_dir / f"p{page_number}.wav"
            # Extra keyword arguments for the synthesizer come from the config.
            synthesizer.call(
                save_file=wav_file,
                transcript=text,
                **self.config["call_cfg"],
            )
        result = {"modality": "speech"}
        return result