Spaces:
Running
Running
import logging | |
import regex as re | |
from tools.classify_language import classify_language, split_alpha_nonalpha | |
def check_is_none(item) -> bool:
    """Tell whether *item* should be treated as empty.

    Returns True when *item* is None, a whitespace-only string, or any
    object whose ``str()`` form is the empty string; False otherwise.
    """
    if item is None:
        return True
    rendered = str(item)
    # Only genuine strings count as "blank"; other objects are judged
    # solely by whether their string form is empty.
    if isinstance(item, str) and rendered.isspace():
        return True
    return rendered == ""
def markup_language(text: str, target_languages: list = None) -> str:
    """Surround each run of same-language text with ``[LANG]`` tags.

    The text is cut at punctuation, every non-blank piece is classified with
    ``classify_language``, and tags are inserted directly in front of a piece
    whenever the language starts or changes. On a change the previous
    language is closed and the new one opened (``[ZH][EN]``); the last
    language is closed at the very end. Punctuation is never moved.

    Args:
        text: Text to annotate.
        target_languages: Language codes forwarded to ``classify_language``.
            ``None`` (the default) is accepted and skips the extra
            alphabetic/non-alphabetic split.

    Returns:
        The annotated text. If no piece could be classified (empty or
        punctuation-only input) the text is returned unchanged.

    Raises:
        ValueError: If a piece produced by the splitters cannot be located
            in ``text`` (raised by ``str.index``).
    """
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        # The inner double quote must stay escaped: left bare it closes the
        # raw string and turns the rest of this fragment into a comment,
        # dropping these delimiters from the character class.
        r"\!?。\"#$%&'()*+,\-/:;<=>@\[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    # For these language combinations each piece is split further with
    # split_alpha_nonalpha (presumably separating Latin-letter runs from
    # other scripts -- confirm against tools.classify_language).
    # A missing language list cannot be sorted and never matches.
    if target_languages is not None and sorted(target_languages) in [
        ["en", "zh"],
        ["en", "ja"],
        ["en", "ja", "zh"],
    ]:
        sentences = [
            piece
            for sentence in sentences
            for piece in split_alpha_nonalpha(sentence)
        ]

    # Build the result from slices of the untouched input. Searching in a
    # string that already contains inserted tags could match a short piece
    # (e.g. "N") inside a tag such as "[EN]".
    pieces = []
    cursor = 0
    pre_lang = ""
    for sentence in sentences:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)
        start = text.index(sentence, cursor)

        # Delimiters (and skipped blank pieces) between the previous piece
        # and this one are copied through verbatim.
        pieces.append(text[cursor:start])
        if pre_lang == "":
            pieces.append(f"[{lang.upper()}]")
        elif pre_lang != lang:
            pieces.append(f"[{pre_lang.upper()}][{lang.upper()}]")
        pieces.append(sentence)

        cursor = start + len(sentence)
        pre_lang = lang

    pieces.append(text[cursor:])
    # Close the last open tag; with nothing classified there is no tag to
    # close, so no stray "[]" is emitted.
    if pre_lang:
        pieces.append(f"[{pre_lang.upper()}]")
    return "".join(pieces)
def split_by_language(text: str, target_languages: list = None) -> list:
    """Cut *text* into consecutive ``(segment, language)`` tuples.

    The text is split at punctuation, each non-blank piece is classified
    with ``classify_language``, and neighbouring pieces with the same
    language are merged. A segment runs from the start of its first piece up
    to the start of the first piece in a different language, so punctuation
    between two languages stays with the earlier segment and any leading
    punctuation belongs to the first segment.

    Args:
        text: Text to segment.
        target_languages: Language codes forwarded to ``classify_language``.
            ``None`` (the default) is accepted and skips the extra
            alphabetic/non-alphabetic split.

    Returns:
        A list of ``(segment, language)`` tuples covering the whole text.
        When no piece could be classified the result is ``[(text, "")]``.

    Raises:
        ValueError: If a piece produced by the splitters cannot be located
            in ``text`` (raised by ``str.index``).
    """
    pattern = (
        r"[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`"
        # The inner double quote must stay escaped: left bare it closes the
        # raw string and turns the rest of this fragment into a comment,
        # dropping these delimiters from the character class.
        r"\!?\。\"#$%&'()*+,\-/:;<=>@\[\]^_`{|}~⦅⦆「」、、〃》「」"
        r"『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+"
    )
    sentences = re.split(pattern, text)

    # For these language combinations each piece is split further with
    # split_alpha_nonalpha (presumably separating Latin-letter runs from
    # other scripts -- confirm against tools.classify_language).
    # A missing language list cannot be sorted and never matches.
    if target_languages is not None and sorted(target_languages) in [
        ["en", "zh"],
        ["en", "ja"],
        ["en", "ja", "zh"],
    ]:
        sentences = [
            piece
            for sentence in sentences
            for piece in split_alpha_nonalpha(sentence)
        ]

    sentences_list = []
    pre_lang = ""
    start = 0  # where the segment currently being collected begins
    end = 0  # scan position: just past the last piece that was located
    for sentence in sentences:
        if check_is_none(sentence):
            continue

        lang = classify_language(sentence, target_languages)

        # Jump to the start of this piece, skipping the delimiters before it.
        end = text.index(sentence, end)
        if pre_lang != "" and pre_lang != lang:
            # Language changed: flush everything collected so far.
            sentences_list.append((text[start:end], pre_lang))
            start = end
        end += len(sentence)
        pre_lang = lang

    # The final segment always extends to the end of the text.
    sentences_list.append((text[start:], pre_lang))
    return sentences_list
def sentence_split(text: str, max: int) -> list:
    """Break *text* into chunks, cutting only right after punctuation.

    A chunk is closed at the first punctuation run at which its length
    (punctuation included) has reached ``max`` characters.

    Args:
        text: Text to split.
        max: Minimum length a chunk must reach before it is cut.

    Returns:
        The chunks in order. Chunks closed at a punctuation run are
        stripped of surrounding whitespace; the trailing remainder, if any,
        is appended as-is.
    """
    pattern = r"[!(),—+\-.:;??。,、;:]+"
    chunks = []
    chunk_start = 0

    # Visit every punctuation run; each one is a candidate cut point.
    for delimiter in re.finditer(pattern, text):
        cut = delimiter.end()
        if cut - chunk_start >= max:
            chunks.append(text[chunk_start:cut].strip())
            chunk_start = cut

    # Keep whatever text is left after the last cut.
    if chunk_start < len(text):
        chunks.append(text[chunk_start:])
    return chunks
def sentence_split_and_markup(text, max=50, lang="auto", speaker_lang=None):
    """Split *text* into chunks and wrap each chunk in language tags.

    Args:
        text: Text to process.
        max: Chunk size threshold passed to ``sentence_split``; a value
            ``<= 0`` disables splitting and the whole text is one chunk.
        lang: ``"auto"`` tags each chunk via ``markup_language``, ``"mix"``
            returns the text untouched (presumably it already carries
            tags -- confirm with callers); any other value is used
            directly as the tag, upper-cased. Matching is case-insensitive.
        speaker_lang: Languages the speaker supports. It is forwarded to
            ``markup_language`` in auto mode; a single-entry list overrides
            ``lang``.

    Returns:
        A list of tagged chunks (or ``[text]`` in mix mode). Every entry is
        also written to the debug log.
    """
    # A speaker that supports exactly one language always uses it.
    # NOTE(review): the override is applied for every single-language
    # speaker, including "auto"/"mix" requests; only the log message is
    # conditional -- confirm this matches the intended scoping.
    if speaker_lang is not None and len(speaker_lang) == 1:
        explicit_request = lang.upper() not in ["AUTO", "MIX"]
        if explicit_request and lang.lower() != speaker_lang[0]:
            logging.debug(
                f'lang "{lang}" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}'
            )
        lang = speaker_lang[0]

    def _tag(chunk):
        # Auto mode detects languages per chunk; otherwise use the fixed tag.
        if lang.upper() == "AUTO":
            return markup_language(chunk, speaker_lang)
        return f"[{lang.upper()}]{chunk}[{lang.upper()}]"

    if lang.upper() == "MIX":
        sentences_list = [text]
    elif max <= 0:
        sentences_list = [_tag(text)]
    else:
        sentences_list = [
            _tag(chunk)
            for chunk in sentence_split(text, max)
            if not check_is_none(chunk)
        ]

    for entry in sentences_list:
        logging.debug(entry)
    return sentences_list
if __name__ == "__main__":
    # Demo 1: a Chinese paragraph run through tagging, splitting, and the
    # combined split-and-tag helper.
    chinese_sample = "这几天心里颇不宁静。今晚在院子里坐着乘凉,忽然想起日日走过的荷塘,在这满月的光里,总该另有一番样子吧。月亮渐渐地升高了,墙外马路上孩子们的欢笑,已经听不见了;妻在屋里拍着闰儿,迷迷糊糊地哼着眠歌。我悄悄地披了大衫,带上门出去。"
    print(markup_language(chinese_sample, target_languages=None))
    print(sentence_split(chinese_sample, max=50))
    print(
        sentence_split_and_markup(
            chinese_sample, max=50, lang="auto", speaker_lang=None
        )
    )

    # Demo 2: mixed Chinese / Japanese / English text segmented by language.
    mixed_sample = "你好,这是一段用来测试自动标注的文本。こんにちは,これは自動ラベリングのテスト用テキストです.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了GAN Duration predictor和transformer flow,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    print(split_by_language(mixed_sample, ["zh", "ja", "en"]))