---
library_name: transformers
datasets:
- mesolitica/TTS
language:
- ms
---

# Malaysian Parler TTS Tiny V1

Finetuned https://huggingface.co/parler-tts/parler-tts-tiny-v1 on [Mesolitica/TTS](https://huggingface.co/datasets/mesolitica/TTS).

Source code at https://github.com/malaysia-ai/cooking/tree/main/parlertts

Wandb at https://wandb.ai/huseinzol05/malaysian-parler-tts-tiny-v1

## requirements

```bash
pip3 install git+https://github.com/malaysia-ai/async-parler-tts
```

## how to

```python
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from pypinyin import lazy_pinyin, Style
import soundfile as sf
import malaya
import jieba

# malaya normalizes the Malay text (numbers, abbreviations, etc.),
# jieba segments Chinese text before pinyin conversion.
normalizer = malaya.normalize.normalizer()
jieba.initialize()

def is_chinese(c):
    return (
        "\u3100" <= c <= "\u9fff"
    )

def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []
    custom_trans = str.maketrans(
        {";": ",", "“": '"', "”": '"', "‘": "'", "’": "'", ',': ', ', '!': '. ', '。': '. '}
    )
    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)
    return final_text_list

def normalize(text):
    converted = convert_char_to_pinyin(text.split())
    converted = [''.join(c) for c in converted]
    return ' '.join(converted).strip()

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1").to(device)
tokenizer = AutoTokenizer.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1")

speakers = [
    'Husein',
    'Shafiqah Idayu',
    'Anwar Ibrahim',
    'KP',
]

# Context switching between Malay, English, and Chinese is also supported.
prompt = 'Husein zolkepli sangat comel dan kacak suka makan cendol. 其形成的门店数字化营销、, AI, 数字化服务、数字化、用户数字化等数字化成熟体系'
# Normalize the Malay text with malaya, then convert Chinese characters to tone-numbered pinyin.
prompt = normalizer.normalize(prompt)
prompt = normalize(prompt['normalize'])

for s in speakers:
    # The speaker name is used directly as the voice description.
    description = s
    input_ids = tokenizer(description, return_tensors="pt").to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

    generation = model.generate(
        input_ids=input_ids.input_ids,
        attention_mask=input_ids.attention_mask,
        prompt_input_ids=prompt_input_ids.input_ids,
        prompt_attention_mask=prompt_input_ids.attention_mask,
    )
    audio_arr = generation.cpu()
    # Save one clip per speaker at 44.1 kHz.
    sf.write(f'{s}.mp3', audio_arr.numpy().squeeze(), 44100)
```
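
Before generation, the prompt goes through two text cleanups: malaya's rule-based normalizer for the Malay part, and the `convert_char_to_pinyin` / `normalize` helpers above for Chinese characters. Below is a minimal sketch of just the pinyin step, assuming only `pypinyin` is installed, to show what the tokenizer actually receives for Chinese text; the example string and printed output are illustrative, not from the model card.

```python
from pypinyin import lazy_pinyin, Style

# Spot check of the pinyin step used by `normalize` above:
# Chinese characters become tone-numbered pinyin (Style.TONE3) before the
# text is handed to the tokenizer, while Latin script passes through untouched.
print(lazy_pinyin("数字化营销", style=Style.TONE3, tone_sandhi=True))
# roughly ['shu4', 'zi4', 'hua4', 'ying2', 'xiao1']
```

The exact syllables can vary with the installed pypinyin version and its tone-sandhi rules, so treat the printed list as an example rather than a fixed reference.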