Spaces:

kevinwang676
/

MuseV-test

No application file

App Files Files Community

MuseV-test / mmcm /music /music_map /lyric_process.py

kevinwang676

Upload folder using huggingface_hub

6755a2d verified 11 months ago

raw

history blame contribute delete

17.9 kB

	from genericpath import isfile
	import re
	import os

	from ...text.utils.read_text import read_xml2json


	# A very handy website for building and testing regular expressions:
	# https://regex101.com/r/cW8jA6/2


	# One or more consecutive characters in the range U+4E00-U+9FFF.
	CHINESE_PATTERN = r"[\u4e00-\u9fff]+"
	# A single character outside the range U+4E00-U+9FA5.
	# NOTE(review): the upper bound differs from CHINESE_PATTERN (U+9FA5 vs U+9FFF), so a
	# character in U+9FA6-U+9FFF matches both patterns -- confirm whether this is intended.
	NOT_CHINESE_PATTERN = r"[^\u4e00-\u9fa5]"
	# One or more consecutive ASCII letters.
	ENGLISH_CHARACHTER_PATTERN = r"[a-zA-Z]+"
	WORD_PATTERN = r"\w+" # for str patterns \w is Unicode-aware (it also matches CJK characters); it equals [a-zA-Z0-9_] only under re.ASCII.
	# One or more consecutive non-word characters (the complement of \w).
	NOT_WORD_PATTERN = r"\W+"


	def has_target_string(lyric: str, pattern: str) -> bool:
	    """Tell whether a regular expression matches anywhere in a lyric line.

	    Args:
	        lyric (str): text to scan.
	        pattern (str): regular expression describing the target string.

	    Returns:
	        bool: True when the pattern matches at least once, False otherwise.
	    """
	    return re.search(pattern, lyric) is not None


	def has_chinese_char(lyric: str) -> bool:
	    """Tell whether the text contains at least one character matched by CHINESE_PATTERN.

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when a Chinese character is present.
	    """
	    contains_chinese = has_target_string(lyric, pattern=CHINESE_PATTERN)
	    return contains_chinese


	def has_non_chinese_char(lyric: str) -> bool:
	    """Tell whether the text contains at least one character matched by NOT_CHINESE_PATTERN.

	    Reference implementation:
	    https://git.woa.com/innovative_tech/CopyrightGroup/LyricTools/blob/master/lyric_tools/dataProcess.py#L53

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when a non-Chinese character is present.
	    """
	    contains_non_chinese = has_target_string(lyric, pattern=NOT_CHINESE_PATTERN)
	    return contains_non_chinese


	def has_english_alphabet_char(lyric: str) -> bool:
	    """Tell whether the text contains at least one ASCII letter (a-z or A-Z).

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when an English alphabet character is present.
	    """
	    contains_letter = has_target_string(lyric, pattern=ENGLISH_CHARACHTER_PATTERN)
	    return contains_letter


	def check_is_lyric_row(lyric: str) -> bool:
	    """Tell whether a QRC row is an actual lyric line.

	    A row is rejected as soon as it matches any of the known non-lyric
	    markers below: metadata tags, credit lines whose characters are
	    interleaved with "(start,duration)" stamps, copyright notices, or any
	    row that contains a colon.

	    Args:
	        lyric (str): the row to classify.

	    Returns:
	        bool: True when none of the non-lyric markers is found.
	    """
	    non_lyric_patterns = (
	        # Metadata tags: title, artist, album, author of the file, offset.
	        r"\[ti[:：]?",
	        r"\[ar[:：]?",
	        r"\[al[:：]?",
	        r"\[by[:：]?",
	        r"\[offset[:：]?",
	        # Credits: lyricist / composer / singer / arranger / guitar ...
	        r"词[:：]?\(\d+,\d+\)[:：]?",
	        r"曲[:：]?\(\d+,\d+\)[:：]?",
	        r"作\(\d+,\d+\)词[:：]?",
	        r"作\(\d+,\d+\)曲[:：]?",
	        r"演\(\d+,\d+\)唱[:：]?",
	        r"编\(\d+,\d+\)曲[:：]?",
	        r"吉\(\d+,\d+\)他[:：]",
	        # Recording engineer / studio, vocal editing, mixing and mastering.
	        r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)师[:：]?",
	        r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)棚[:：]?",
	        r"Vocal\s+\(\d+,\d+\)edite[:：]?",
	        r"混\(\d+,\d+\)音\(\d+,\d+\)/\(\d+,\d+\)母\(\d+,\d+\)带[:：]?",
	        r"混\(\d+,\d+\)音",
	        r"和\(\d+,\d+\)声\(\d+,\d+\)编\(\d+,\d+\)写[:：]?",
	        # Rights holders, co-producers, producers.
	        r"词\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[:：]?",
	        r"曲\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[:：]?",
	        r"联\(\d+,\d+\)合\(\d+,\d+\)出\(\d+,\d+\)品[:：]?",
	        r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品",
	        r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品\(\d+,\d+\)监\(\d+,\d+\)制[:：]?",
	        r"制\(\d+,\d+\)作\(\d+,\d+\)人[:：]?",
	        # Legal notices and platform watermark.
	        r"不\(\d+,\d+\)得\(\d+,\d+\)翻\(\d+,\d+\)唱",
	        r"未\(\d+,\d+\)经\(\d+,\d+\)许\(\d+,\d+\)可",
	        r"酷\(\d+,\d+\)狗\(\d+,\d+\)音\(\d+,\d+\)乐",
	        # Any remaining row containing an ASCII or full-width colon.
	        r"[:：]",
	    )
	    # any() stops at the first marker that matches instead of running every search.
	    return not any(re.search(pattern, lyric) for pattern in non_lyric_patterns)


	def lyric2clip(lyric: str) -> dict:
	    """Convert one line of lyric into a single clip dictionary.

	    The Clip definition can be found at
	    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py

	    The first "start,duration" pair found in the line is taken as the line
	    timing and the last pair as the timing of the last word; each number is
	    divided by 1000 and rounded to 3 decimals (presumably milliseconds to
	    seconds). The clip duration ends at whichever of the two end times
	    comes first.

	    Args:
	        lyric (str): one lyric line, e.g. "[17316,1500]漫(17316,178)步(17494,174)".

	    Returns:
	        dict: the clip, with keys "time_start", "duration", "text",
	            "original_text", "timepoint_type" and "clips" (per-word clips).

	    Raises:
	        IndexError: if the line contains no "start,duration" pair.
	    """

	    def _scaled(pair: str, index: int) -> float:
	        # Pick one number out of "start,duration" and divide it by 1000.
	        return round(int(pair.split(",")[index]) / 1000, 3)

	    stamps = re.findall(r"\d+,\d+", lyric)
	    first_stamp, last_stamp = stamps[0], stamps[-1]

	    line_start = _scaled(first_stamp, 0)
	    line_end = line_start + _scaled(first_stamp, -1)
	    last_word_end = _scaled(last_stamp, 0) + _scaled(last_stamp, -1)
	    effective_duration = min(line_end, last_word_end) - line_start

	    # Remove the "[start,duration]" header, keep the per-word stamps for now.
	    body = re.sub(r"\[\d+,\d+\]", "", lyric)
	    # by yuuhong: split out every word with its start time and duration.
	    word_clips = get_words_with_timestamp(body)
	    plain_text = re.sub(r"\(\d+,\d+\)", "", body)

	    return {
	        "time_start": line_start,
	        "duration": effective_duration,
	        "text": plain_text,
	        "original_text": plain_text,
	        "timepoint_type": -1,
	        "clips": word_clips,
	    }


	# by yuuhong
	# Split one QRC lyric line into its individual words.
	# Example lyric: 漫(17316,178)步(17494,174)走(17668,193)在(17861,183) (18044,0)莎(18044,153)玛(18197,159)丽(18356,176)丹(18532,200)
	def get_words_with_timestamp(lyric):
	    """Split a lyric line into per-word clips.

	    The line is cut at every ")"; a chunk is kept only when it contains
	    exactly one "(" and the part after it starts with "digits,digits".

	    Args:
	        lyric (str): lyric text made of "word(start,duration)" items.

	    Returns:
	        list: one dict per word with keys "text", "time_start" and
	            "duration"; both numbers are divided by 1000 and rounded to 3
	            decimals.
	    """
	    word_clips = []
	    for chunk in lyric.split(")"):
	        if chunk.count("(") != 1:
	            continue
	        word, _, stamp = chunk.partition("(")
	        if re.match(r"\d+,\d+", stamp) is None:
	            # Not a valid timestamp.
	            continue
	        start_str, duration_str = stamp.split(",")[:2]
	        word_clips.append(
	            {
	                "text": word,
	                "time_start": round(int(start_str) / 1000, 3),
	                "duration": round(int(duration_str) / 1000, 3),
	            }
	        )
	    return word_clips


	def lyric2clips(lyric: str, th: float = 0.75) -> list:
	    """Convert one lyric line into at least one clip.

	    Only lines without English letters are split, and only at spaces. If
	    any resulting piece is not longer than ``th``, or a piece carries no
	    timestamp at all, the whole line is returned as a single clip instead.

	    Args:
	        lyric (str): such as
	            "[173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)".
	        th (float, optional): pieces whose duration is <= th make the whole
	            line fall back to a single clip. Defaults to 0.75.

	    Returns:
	        list: the clips of this line. Split clips have the keys
	            "time_start", "duration", "text", "original_text" and
	            "timepoint_type"; the fallback is ``[lyric2clip(lyric)]``.

	    Raises:
	        ValueError: if the line has spaces but its first piece contains no "]".
	    """
	    # Lines containing English letters are handled as a whole, never split.
	    if has_english_alphabet_char(lyric):
	        return [lyric2clip(lyric)]
	    pieces = lyric.split(" ")
	    if len(pieces) == 1:
	        return [lyric2clip(pieces[0])]

	    # Strip the "[start,duration]" line header from the first piece.
	    _, pieces[0] = pieces[0].split("]", 1)
	    # Consecutive spaces produce empty pieces; drop them all up front.
	    pieces = [piece for piece in pieces if piece]

	    # In the lyric xml a word is normally followed directly by its timestamp,
	    # so a space should come after the timestamp. Sometimes the space sits
	    # between the word and its timestamp and must be repaired:
	    #   wrong: [173247,3275]去(173247,403)吗 (173649,677)配(174326,189)吗 (174516,593)这(175108,279)
	    #   wrong: [46122,2082]以(46122,213)身(46335,260)...造(47692,341)化 (48033,172)
	    #   fixed: [173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)
	    for i in range(len(pieces) - 1):
	        # An empty piece here had its only timestamp moved to the previous piece.
	        if not pieces[i] or pieces[i].endswith(")"):
	            continue
	        moved = re.search(r"\(\d+,\d+\)", pieces[i + 1])
	        if moved is None:
	            continue
	        stamp = moved.group(0)
	        pieces[i] += stamp
	        # Plain-text removal of that exact stamp (first occurrence only); using
	        # the stamp as a regex would treat its parentheses as a group.
	        pieces[i + 1] = pieces[i + 1].replace(stamp, "", 1)
	    # A piece that consisted solely of a moved timestamp is now empty.
	    pieces = [piece for piece in pieces if piece]

	    line_text = re.sub(r"\[\d+,\d+\]", "", lyric)
	    line_text = re.sub(r"\(\d+,\d+\)", "", line_text)

	    clips = []
	    for piece in pieces:
	        stamps = re.findall(r"\d+,\d+", piece)
	        if not stamps:
	            # Cannot time this piece on its own: keep the line whole.
	            return [lyric2clip(lyric)]
	        first_word_start = round(int(stamps[0].split(",")[0]) / 1000, 3)
	        last_word_start = round(int(stamps[-1].split(",")[0]) / 1000, 3)
	        last_word_duration = round(int(stamps[-1].split(",")[-1]) / 1000, 3)
	        piece_duration = (last_word_start + last_word_duration) - first_word_start
	        if piece_duration <= th:
	            # One piece is too short: keep the line whole.
	            return [lyric2clip(lyric)]
	        piece_text = re.sub(r"\[\d+,\d+\]", "", piece)
	        piece_text = re.sub(r"\(\d+,\d+\)", "", piece_text)
	        clips.append(
	            {
	                "time_start": first_word_start,
	                "duration": piece_duration,
	                "text": piece_text,
	                # The full line text (not the piece text) is kept so that
	                # related clips share the lyric context for semantic continuity.
	                "original_text": line_text,
	                "timepoint_type": -1,
	            }
	        )
	    return clips or [lyric2clip(lyric)]


	def is_songname(lyric: str) -> bool:
	    """Tell whether the text carries a song-title tag ("[ti", optionally
	    followed by a colon), e.g. "[ti:霍元甲 (《霍元甲》电影主题曲)]".

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when the title tag is found anywhere in the text.
	    """
	    title_tag = re.search(r"\[ti[:：]?", lyric)
	    return title_tag is not None


	def get_songname(lyric: str) -> str:
	    """Extract the song title from a title row such as
	    "[ti:霍元甲 (《霍元甲》电影主题曲)]".

	    Everything from the first "(" onwards is discarded, then the closing
	    "]" (if still present) and surrounding whitespace are removed. The
	    "[ti" tag may be followed by an ASCII colon, a full-width colon, or
	    no colon at all.

	    Args:
	        lyric (str): QRC row containing the song title.

	    Returns:
	        str: the song title.
	    """
	    # Remove the tag by pattern rather than by a fixed 4-character slice so
	    # that a missing colon does not eat the first character of the title.
	    title = re.sub(r"^\[ti[:：]?", "", lyric, count=1)
	    title = title.split("(")[0]
	    # Only strip a real closing bracket; slicing off the last character
	    # unconditionally truncated titles like "[ti:Song(live)]" to "Son".
	    if title.endswith("]"):
	        title = title[:-1]
	    return title.strip()


	def is_album(lyric: str) -> bool:
	    """Tell whether the text carries an album tag ("[al", optionally
	    followed by a colon), e.g. "[al:霍元甲]".

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when the album tag is found anywhere in the text.
	    """
	    album_tag = re.search(r"\[al[:：]?", lyric)
	    return album_tag is not None


	def get_album(lyric: str) -> str:
	    """Extract the album name from a row shaped like "[al:霍元甲]".

	    The first four characters and the last character are dropped.

	    Args:
	        lyric (str): QRC row containing the album name.

	    Returns:
	        str: the album name.
	    """
	    tag_length = len("[al:")
	    return lyric[tag_length:-1]


	def is_singer(lyric: str) -> bool:
	    """Tell whether the text carries a singer tag ("[ar", optionally
	    followed by a colon), e.g. "[ar:周杰伦]".

	    Args:
	        lyric (str): text to scan.

	    Returns:
	        bool: True when the singer tag is found anywhere in the text.
	    """
	    singer_tag = re.search(r"\[ar[:：]?", lyric)
	    return singer_tag is not None


	def get_singer(lyric: str) -> str:
	    """Extract the singer name from a row shaped like "[ar:周杰伦]".

	    The first four characters and the last character are dropped.

	    Args:
	        lyric (str): QRC row containing the singer name.

	    Returns:
	        str: the singer name.
	    """
	    tag_length = len("[ar:")
	    return lyric[tag_length:-1]


	def lyric2musicinfo(lyric: dict) -> dict:
	    """Convert parsed QRC lyric content into a musicinfo dictionary.

	    See https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/media_info.py#L19
	    The result has this shape:
	        {
	            "meta_info": {...},
	            "sub_meata_info": {},
	            "clips": [clip, ...]
	        }

	    Args:
	        lyric (dict): nested mapping; the lyric text is read from
	            lyric["QrcInfos"]["LyricInfo"]["Lyric_1"]["@LyricContent"]
	            (presumably the output of read_xml2json -- confirm with caller).

	    Returns:
	        dict: the musicinfo. "clips" holds the output of lyric2clips for
	            every lyric row, and meta_info["lyric"] holds one lyric2clip
	            result per lyric row.
	    """
	    content = lyric["QrcInfos"]["LyricInfo"]["Lyric_1"]["@LyricContent"]
	    musicinfo = {
	        "meta_info": {
	            "mediaid": None,
	            "media_name": None,
	            "singer": None,
	        },
	        # NOTE(review): key is spelled "sub_meata_info"; kept as-is because
	        # consumers may already read it under this name.
	        "sub_meata_info": {},
	        "clips": [],
	    }
	    # Every row starts with "[", so split on it and put the bracket back.
	    rows = ["[" + segment.strip() for segment in re.split(r"\[", content)]
	    skip_next_row = False
	    line_clips = []
	    for line in rows:
	        if is_songname(line):
	            musicinfo["meta_info"]["media_name"] = get_songname(line)
	            continue
	        if is_singer(line):
	            musicinfo["meta_info"]["singer"] = get_singer(line)
	            continue
	        if is_album(line):
	            musicinfo["meta_info"]["album"] = get_album(line)
	            continue
	        is_lyric_row = check_is_lyric_row(line)
	        if skip_next_row:
	            skip_next_row = False
	            continue
	        # The row right after the "[offset:" row is the title row: skip it.
	        if re.search(r"\[offset[:：]", line):
	            skip_next_row = True
	        if is_lyric_row and re.match(r"\[\d+,\d+\]", line):
	            line_clips.append(lyric2clip(line))
	            musicinfo["clips"].extend(lyric2clips(line))
	    musicinfo["meta_info"]["lyric"] = line_clips
	    return musicinfo


	def lrc_timestr2time(time_str: str) -> float:
	    """Convert an lrc timestamp such as "00:00.00" (without the surrounding
	    brackets) into seconds.

	    The text is split at ":" and "." into minutes, seconds and a decimal
	    fraction of a second. The fraction is scaled by its number of digits,
	    so "01:02.5", "01:02.50" and "01:02.500" all mean 62.5 seconds.

	    Args:
	        time_str (str): timestamp text in "mm:ss.xx" or "mm:ss.xxx" form.

	    Returns:
	        float: the time in seconds, rounded to 3 decimals.

	    Raises:
	        ValueError: if the text does not split into exactly three numeric fields.
	    """
	    minutes, seconds, fraction = re.split(r"[:.]", time_str)
	    fraction = fraction.strip()
	    # Dividing by a fixed 1000 is only right for 3-digit fractions; the usual
	    # lrc form has 2 digits (hundredths), which would come out 10x too small.
	    fraction_seconds = float(fraction) / 10 ** len(fraction)
	    return round(float(minutes) * 60 + float(seconds) + fraction_seconds, 3)


	def get_lrc_line_time(text: str, time_pattern: str) -> float:
	    """Find the first timestamp in an lrc line and convert it to seconds.

	    Args:
	        text (str): an lrc line, e.g. "[00:00.00]some lyric text".
	        time_pattern (str): regular expression matching the timestamp text
	            (without the surrounding brackets).

	    Returns:
	        float: the result of lrc_timestr2time for the first match.

	    Raises:
	        AttributeError: if ``time_pattern`` does not match anywhere in ``text``.
	    """
	    matched = re.search(time_pattern, text)
	    return lrc_timestr2time(matched.group(0))


	def lrc_lyric2clip(lyric: str, time_pattern: str, duration: float) -> dict:
	    """Convert one lrc line into a clip dictionary.

	    The Clip definition can be found at
	    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py

	    Args:
	        lyric (str): an lrc line, e.g. "[00:00.00]some lyric text".
	        time_pattern (str): regular expression matching the timestamp text
	            (without the surrounding brackets).
	        duration (float): duration to store in the clip.

	    Returns:
	        dict: clip with keys "time_start", "duration", "text" and
	            "timepoint_type".
	    """
	    time_start = get_lrc_line_time(lyric, time_pattern=time_pattern)
	    # Remove every "[timestamp]" tag together with its brackets. This keeps
	    # time-like text inside the lyric itself and leaves no "[]" residue when
	    # a line carries several tags.
	    bracketed_pattern = r"\[(?:" + time_pattern + r")\]"
	    text, removed = re.subn(bracketed_pattern, "", lyric)
	    if not removed:
	        # No bracketed tag found: fall back to the previous behaviour of
	        # deleting the bare timestamp and dropping the first two characters.
	        text = re.sub(time_pattern, "", lyric)[2:]
	    return {
	        "time_start": time_start,
	        "duration": duration,
	        "text": text,
	        "timepoint_type": -1,
	    }


	def lrc2musicinfo(lyric: str, time_pattern: str = r"\d+:\d+\.\d+") -> dict:
	    """Convert lrc lyrics into a musicinfo dictionary.

	    Music info definition:
	    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py

	    Args:
	        lyric (str): path of an lrc file, or the lrc text itself, or an
	            already split list of lrc lines.
	        time_pattern (str, optional): regular expression matching the
	            timestamp text without brackets. Defaults to a
	            "digits:digits.digits" pattern.

	    Returns:
	        dict: the musicinfo; "clips" holds one clip per line that has a
	            timestamp. A clip lasts until the next line that has a
	            timestamp; when there is no such line its duration is 1.
	    """
	    if isinstance(lyric, str):
	        if os.path.isfile(lyric):
	            with open(lyric, "r") as f:
	                lines = [line.strip() for line in f]
	        else:
	            lines = lyric.split("\n")
	        # Forward time_pattern so that a caller-supplied pattern is honoured.
	        return lrc2musicinfo(lines, time_pattern=time_pattern)

	    musicinfo = {
	        "meta_info": {
	            "mediaid": None,
	            "media_name": None,
	            "singer": None,
	        },
	        # NOTE(review): key is spelled "sub_meata_info"; kept as-is because
	        # consumers may already read it under this name.
	        "sub_meata_info": {},
	        "clips": [],
	    }
	    # NOTE(review): this list is never filled, so meta_info["lyric"] is always
	    # empty for lrc input -- confirm whether it should mirror "clips".
	    lyric_clips = []
	    rows = len(lyric)
	    for i, line in enumerate(lyric):
	        if is_songname(line):
	            musicinfo["meta_info"]["media_name"] = line[4:-1]
	            continue
	        if is_singer(line):
	            musicinfo["meta_info"]["singer"] = line[4:-1]
	            continue
	        if is_album(line):
	            musicinfo["meta_info"]["album"] = line[4:-1]
	            continue
	        if re.search(time_pattern, line) is None:
	            continue
	        time_start = get_lrc_line_time(line, time_pattern=time_pattern)
	        # Look ahead for the next line that actually has a timestamp; the
	        # directly following line may be blank or untimed.
	        duration = 1
	        for j in range(i + 1, rows):
	            if re.search(time_pattern, lyric[j]) is not None:
	                next_time_start = get_lrc_line_time(
	                    lyric[j], time_pattern=time_pattern
	                )
	                duration = next_time_start - time_start
	                break
	        clip = lrc_lyric2clip(line, duration=duration, time_pattern=time_pattern)
	        musicinfo["clips"].append(clip)
	    musicinfo["meta_info"]["lyric"] = lyric_clips
	    return musicinfo


	def lyricfile2musicinfo(path: str) -> dict:
	    """Convert a lyric file into a musicinfo dictionary.

	    The file may be a QRC xml file or an lrc file; the format is chosen
	    from the file extension (case-insensitive).
	    TODO: support osu.

	    Music info definition:
	    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py

	    Args:
	        path (str): path of the lyric file.

	    Returns:
	        dict: the musicinfo, with meta_info["mediaid"] set to the file
	            name without its extension.

	    Raises:
	        ValueError: if the extension is neither ".xml" nor ".lrc".
	    """
	    # splitext copes with names that contain several dots or none at all.
	    filename, ext = os.path.splitext(os.path.basename(path))
	    ext = ext.lower()
	    if ext == ".xml":
	        musicinfo = lyric2musicinfo(read_xml2json(path))
	    elif ext == ".lrc":
	        musicinfo = lrc2musicinfo(path)
	    else:
	        raise ValueError(
	            "unsupported lyric file extension {!r}: {}".format(ext, path)
	        )
	    musicinfo["meta_info"]["mediaid"] = filename
	    return musicinfo