import json
import os
import traceback
from pathlib import Path
from typing import List

from aliyunsdkcore.client import AcsClient
from aliyunsdkcore.request import CommonRequest
import nls


class CosyVoiceSynthesizer:
    """Speech synthesizer backed by the Aliyun NLS stream-input TTS SDK.

    Credentials are read from the ``ALIYUN_ACCESS_KEY_ID``,
    ``ALIYUN_ACCESS_KEY_SECRET`` and ``ALIYUN_APP_KEY`` environment
    variables, and an access token is requested at construction time.
    """

    def __init__(self) -> None:
        self.access_key_id = os.environ.get('ALIYUN_ACCESS_KEY_ID')
        self.access_key_secret = os.environ.get('ALIYUN_ACCESS_KEY_SECRET')
        self.app_key = os.environ.get('ALIYUN_APP_KEY')
        self.setup_token()

    def setup_token(self):
        """Request an access token and store it on ``self.token``.

        Raises:
            RuntimeError: If the ``CreateToken`` request fails, the response
                is not valid JSON, or the response carries no ``Token.Id``.
        """
        client = AcsClient(
            self.access_key_id, self.access_key_secret, 'cn-shanghai'
        )
        request = CommonRequest()
        request.set_method('POST')
        request.set_domain('nls-meta.cn-shanghai.aliyuncs.com')
        request.set_version('2019-02-28')
        request.set_action_name('CreateToken')

        try:
            response = client.do_action_with_exception(request)
            jss = json.loads(response)
        except Exception as e:
            raise RuntimeError(
                f'Request token failed with error: {e}, with detail {traceback.format_exc()}'
            ) from e

        token = None
        if isinstance(jss, dict):
            token_info = jss.get('Token')
            if isinstance(token_info, dict):
                token = token_info.get('Id')
        if not token:
            # Fail here rather than leaving ``self.token`` undefined, which
            # would only surface later as an AttributeError in ``call``.
            raise RuntimeError(
                f'Request token failed: response contains no Token.Id: {jss!r}'
            )
        self.token = token

    def call(self, save_file, transcript, voice="longyuan", sample_rate=16000):
        """Synthesize ``transcript`` and write the audio to ``save_file``.

        Args:
            save_file: Path of the output file; opened in binary write mode.
            transcript: Text handed to the synthesizer in a single send.
            voice: Voice name forwarded to the SDK.
            sample_rate: Sample rate forwarded to the SDK.

        Raises:
            RuntimeError: If the SDK reported an error through its
                ``on_error`` callback.
        """
        writer = open(save_file, "wb")
        reported_errors = []

        def write_data(data, *args):
            # Ignore chunks that arrive after the file has been closed.
            if not writer.closed:
                writer.write(data)

        def raise_error(error, *args):
            # NOTE(review): the SDK may invoke callbacks on its own thread, in
            # which case this raise never reaches the caller - confirm. The
            # error is recorded so ``call`` can re-raise it after stopping.
            reported_errors.append(error)
            raise RuntimeError(
                f'Synthesizing speech failed with error: {error}')

        def close_file(*args):
            writer.close()

        try:
            sdk = nls.NlsStreamInputTtsSynthesizer(
                url='wss://nls-gateway-cn-beijing.aliyuncs.com/ws/v1',
                token=self.token,
                appkey=self.app_key,
                on_data=write_data,
                on_error=raise_error,
                on_close=close_file,
            )
            sdk.startStreamInputTts(
                voice=voice, sample_rate=sample_rate, aformat='wav'
            )
            sdk.sendStreamInputTts(transcript)
            sdk.stopStreamInputTts()
            if reported_errors:
                raise RuntimeError(
                    f'Synthesizing speech failed with error: {reported_errors[0]}')
        except BaseException:
            # On the success path the file is closed by the ``on_close``
            # callback; on failure close it here so the handle is not leaked.
            writer.close()
            raise


class CosyVoiceAgent:
    """Generates one speech file per page using ``CosyVoiceSynthesizer``."""

    def __init__(self, config) -> None:
        # ``config["call_cfg"]`` is expanded into keyword arguments of
        # ``CosyVoiceSynthesizer.call``.
        self.config = config

    def call(self, pages: List, device: str, save_path: str):
        """Synthesize every page into ``<save_path>/p<N>.wav`` (N from 1).

        Args:
            pages: Transcripts, one per output file.
            device: Unused by this agent.
            save_path: Output directory; created if it does not exist.

        Returns:
            The dict ``{"modality": "speech"}``.
        """
        save_path = Path(save_path)
        generation_agent = CosyVoiceSynthesizer()
        # Opening an output file inside a missing directory would fail.
        save_path.mkdir(parents=True, exist_ok=True)
        for idx, page in enumerate(pages, start=1):
            generation_agent.call(
                save_file=save_path / f"p{idx}.wav",
                transcript=page,
                **self.config["call_cfg"]
            )
        return {
            "modality": "speech"
        }