Malaysian Parler TTS Tiny V1

Finetuned from https://huggingface.co/parler-tts/parler-tts-tiny-v1 on the Mesolitica/TTS dataset.

Source code at https://github.com/malaysia-ai/cooking/tree/main/parlertts

Wandb at https://wandb.ai/huseinzol05/malaysian-parler-tts-tiny-v1

Requirements

pip3 install git+https://github.com/malaysia-ai/async-parler-tts

How to use

import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from pypinyin import lazy_pinyin, Style
import soundfile as sf
import malaya
import jieba

normalizer = malaya.normalize.normalizer()
jieba.initialize()

def is_chinese(c):
    return (
        "\u3100" <= c <= "\u9fff"
    )

def convert_char_to_pinyin(text_list, polyphone=True):
    final_text_list = []
    custom_trans = str.maketrans(
        {";": ",", "β€œ": '"', "”": '"', "β€˜": "'", "’": "'", ',': ', ', '!': '. ', '。': '. '}
    ) 

    for text in text_list:
        char_list = []
        text = text.translate(custom_trans)
        for seg in jieba.cut(text):
            seg_byte_len = len(bytes(seg, "UTF-8"))
            if seg_byte_len == len(seg):  # if pure alphabets and symbols
                if char_list and seg_byte_len > 1 and char_list[-1] not in " :'\"":
                    char_list.append(" ")
                char_list.extend(seg)
            elif polyphone and seg_byte_len == 3 * len(seg):  # if pure east asian characters
                seg_ = lazy_pinyin(seg, style=Style.TONE3, tone_sandhi=True)
                for i, c in enumerate(seg):
                    if is_chinese(c):
                        char_list.append(" ")
                    char_list.append(seg_[i])
            else:  # if mixed characters, alphabets and symbols
                for c in seg:
                    if ord(c) < 256:
                        char_list.extend(c)
                    elif is_chinese(c):
                        char_list.append(" ")
                        char_list.extend(lazy_pinyin(c, style=Style.TONE3, tone_sandhi=True))
                    else:
                        char_list.append(c)
        final_text_list.append(char_list)

    return final_text_list

def normalize(text):
    converted = convert_char_to_pinyin(text.split())
    converted = [''.join(c) for c in converted]
    return ' '.join(converted).strip()

device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1").to(device)
tokenizer = AutoTokenizer.from_pretrained("mesolitica/malaysian-parler-tts-mini-v1")

speakers = [
    'Husein',
    'Shafiqah Idayu',
    'Anwar Ibrahim',
    'KP'
]

# Also support context switching
prompt = 'Husein zolkepli sangat comel dan kacak suka makan cendol. ε…Άε½’ζˆηš„ι—¨εΊ—ζ•°ε­—εŒ–θ₯销、, AI, ζ•°ε­—εŒ–ζœεŠ‘γ€ζ•°ε­—εŒ–γ€η”¨ζˆ·ζ•°ε­—εŒ–η­‰ζ•°ε­—εŒ–ζˆη†Ÿδ½“η³»'
prompt = normalizer.normalize(prompt)
prompt = normalize(prompt['normalize'])

for s in speakers:
    description = s

    input_ids = tokenizer(description, return_tensors="pt").to(device)
    prompt_input_ids = tokenizer(prompt, return_tensors="pt").to(device)

    generation = model.generate(
      input_ids=input_ids.input_ids,
      attention_mask=input_ids.attention_mask,
      prompt_input_ids=prompt_input_ids.input_ids,
      prompt_attention_mask=prompt_input_ids.attention_mask,
    )

    audio_arr = generation.cpu()
    sf.write(f'{s}.mp3', audio_arr.numpy().squeeze(), 44100)
Downloads last month
34
Safetensors
Model size
317M params
Tensor type
F32
Β·
Inference Providers NEW
This model is not currently available via any of the supported Inference Providers.

Dataset used to train malaysia-ai/malaysian-parler-tts-tiny-v1