Spaces:

Ailyth
/

Multi-voice-TTS-GPT-SoVITS

Running

App Files Files Community

Multi-voice-TTS-GPT-SoVITS / text /japanese.py

Ailyth

NEW-0216-050145

3c7a160 9 months ago

raw

history blame

5.98 kB

	# modified from https://github.com/CjangCjengh/vits/blob/main/text/japanese.py
	import re
	import sys

	import pyopenjtalk


	from text import symbols
	# Regular expression matching Japanese without punctuation marks:
	_japanese_characters = re.compile(
	r"[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
	)

	# Regular expression matching non-Japanese characters or punctuation marks:
	_japanese_marks = re.compile(
	r"[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]"
	)

	# List of (symbol, Japanese) pairs for marks:
	_symbols_to_japanese = [(re.compile("%s" % x[0]), x[1]) for x in [("％", "パーセント")]]


	# List of (consonant, sokuon) pairs:
	_real_sokuon = [
	(re.compile("%s" % x[0]), x[1])
	for x in [
	(r"Q([↑↓]*[kg])", r"k#\1"),
	(r"Q([↑↓]*[tdjʧ])", r"t#\1"),
	(r"Q([↑↓]*[sʃ])", r"s\1"),
	(r"Q([↑↓]*[pb])", r"p#\1"),
	]
	]

	# List of (consonant, hatsuon) pairs:
	_real_hatsuon = [
	(re.compile("%s" % x[0]), x[1])
	for x in [
	(r"N([↑↓]*[pbm])", r"m\1"),
	(r"N([↑↓]*[ʧʥj])", r"n^\1"),
	(r"N([↑↓]*[tdn])", r"n\1"),
	(r"N([↑↓]*[kg])", r"ŋ\1"),
	]
	]


	def post_replace_ph(ph):
	rep_map = {
	"：": ",",
	"；": ",",
	"，": ",",
	"。": ".",
	"！": "!",
	"？": "?",
	"\n": ".",
	"·": ",",
	"、": ",",
	"...": "…",
	}
	if ph in rep_map.keys():
	ph = rep_map[ph]
	if ph in symbols:
	return ph
	if ph not in symbols:
	ph = "UNK"
	return ph


	def symbols_to_japanese(text):
	for regex, replacement in _symbols_to_japanese:
	text = re.sub(regex, replacement, text)
	return text


	def preprocess_jap(text, with_prosody=False):
	"""Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html"""
	text = symbols_to_japanese(text)
	sentences = re.split(_japanese_marks, text)
	marks = re.findall(_japanese_marks, text)
	text = []
	for i, sentence in enumerate(sentences):
	if re.match(_japanese_characters, sentence):
	if with_prosody:
	text += pyopenjtalk_g2p_prosody(sentence)[1:-1]
	else:
	p = pyopenjtalk.g2p(sentence)
	text += p.split(" ")

	if i < len(marks):
	if marks[i] == " ":# 防止意外的UNK
	continue
	text += [marks[i].replace(" ", "")]
	return text


	def text_normalize(text):
	# todo: jap text normalize
	return text

	# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
	def pyopenjtalk_g2p_prosody(text, drop_unvoiced_vowels=True):
	"""Extract phoneme + prosoody symbol sequence from input full-context labels.

	The algorithm is based on `Prosodic features control by symbols as input of
	sequence-to-sequence acoustic modeling for neural TTS`_ with some r9y9's tweaks.

	Args:
	text (str): Input text.
	drop_unvoiced_vowels (bool): whether to drop unvoiced vowels.

	Returns:
	List[str]: List of phoneme + prosody symbols.

	Examples:
	>>> from espnet2.text.phoneme_tokenizer import pyopenjtalk_g2p_prosody
	>>> pyopenjtalk_g2p_prosody("こんにちは。")
	['^', 'k', 'o', '[', 'N', 'n', 'i', 'ch', 'i', 'w', 'a', '$']

	.. _`Prosodic features control by symbols as input of sequence-to-sequence acoustic
	modeling for neural TTS`: https://doi.org/10.1587/transinf.2020EDP7104

	"""
	labels = pyopenjtalk.make_label(pyopenjtalk.run_frontend(text))
	N = len(labels)

	phones = []
	for n in range(N):
	lab_curr = labels[n]

	# current phoneme
	p3 = re.search(r"\-(.*?)\+", lab_curr).group(1)
	# deal unvoiced vowels as normal vowels
	if drop_unvoiced_vowels and p3 in "AEIOU":
	p3 = p3.lower()

	# deal with sil at the beginning and the end of text
	if p3 == "sil":
	assert n == 0 or n == N - 1
	if n == 0:
	phones.append("^")
	elif n == N - 1:
	# check question form or not
	e3 = _numeric_feature_by_regex(r"!(\d+)_", lab_curr)
	if e3 == 0:
	phones.append("$")
	elif e3 == 1:
	phones.append("?")
	continue
	elif p3 == "pau":
	phones.append("_")
	continue
	else:
	phones.append(p3)

	# accent type and position info (forward or backward)
	a1 = _numeric_feature_by_regex(r"/A:([0-9\-]+)\+", lab_curr)
	a2 = _numeric_feature_by_regex(r"\+(\d+)\+", lab_curr)
	a3 = _numeric_feature_by_regex(r"\+(\d+)/", lab_curr)

	# number of mora in accent phrase
	f1 = _numeric_feature_by_regex(r"/F:(\d+)_", lab_curr)

	a2_next = _numeric_feature_by_regex(r"\+(\d+)\+", labels[n + 1])
	# accent phrase border
	if a3 == 1 and a2_next == 1 and p3 in "aeiouAEIOUNcl":
	phones.append("#")
	# pitch falling
	elif a1 == 0 and a2_next == a2 + 1 and a2 != f1:
	phones.append("]")
	# pitch rising
	elif a2 == 1 and a2_next == 2:
	phones.append("[")

	return phones

	# Copied from espnet https://github.com/espnet/espnet/blob/master/espnet2/text/phoneme_tokenizer.py
	def _numeric_feature_by_regex(regex, s):
	match = re.search(regex, s)
	if match is None:
	return -50
	return int(match.group(1))

	def g2p(norm_text, with_prosody=False):
	phones = preprocess_jap(norm_text, with_prosody)
	phones = [post_replace_ph(i) for i in phones]
	# todo: implement tones and word2ph
	return phones


	if __name__ == "__main__":
	phones = g2p("こんにちは, hello, AKITOです,よろしくお願いしますね！")
	print(phones)