Spaces:
No application file
No application file
from genericpath import isfile | |
import re | |
import os | |
from ...text.utils.read_text import read_xml2json | |
# A very handy website for trying out regular expressions:
# https://regex101.com/r/cW8jA6/2
# One or more consecutive characters in the range U+4E00..U+9FFF.
CHINESE_PATTERN = r"[\u4e00-\u9fff]+"
# Any single character outside the range U+4E00..U+9FA5.
# NOTE(review): the upper bound here (U+9FA5) differs from CHINESE_PATTERN
# (U+9FFF) - confirm whether the mismatch is intentional.
NOT_CHINESE_PATTERN = r"[^\u4e00-\u9fa5]"
# One or more consecutive ASCII letters.
ENGLISH_CHARACHTER_PATTERN = r"[a-zA-Z]+"
# One or more word characters. For str patterns \w is Unicode-aware (it also
# matches CJK characters); it equals [a-zA-Z0-9_] only when re.ASCII is used.
WORD_PATTERN = r"\w+"
# One or more non-word characters (the complement of \w).
NOT_WORD_PATTERN = r"\W+"
def has_target_string(lyric: str, pattern: str) -> bool:
    """Report whether ``pattern`` matches anywhere in ``lyric``.

    Args:
        lyric (str): Text to scan, typically one lyric line.
        pattern (str): Regular expression describing the target string.

    Returns:
        bool: True if the pattern matches at least once, otherwise False.
    """
    # re.search stops at the first hit, so no list of every match is built
    # just to find out whether one exists.
    return re.search(pattern, lyric) is not None
def has_chinese_char(lyric: str) -> bool:
    """Report whether the text contains at least one Chinese character.

    "Chinese character" means whatever CHINESE_PATTERN matches.

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if CHINESE_PATTERN matches somewhere in ``lyric``.
    """
    contains_chinese = has_target_string(lyric, pattern=CHINESE_PATTERN)
    return contains_chinese
def has_non_chinese_char(lyric: str) -> bool:
    """Report whether the text contains at least one non-Chinese character.

    "Non-Chinese" means whatever NOT_CHINESE_PATTERN matches. The original
    author cites
    https://git.woa.com/innovative_tech/CopyrightGroup/LyricTools/blob/master/lyric_tools/dataProcess.py#L53
    as the reference for this check.

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if NOT_CHINESE_PATTERN matches somewhere in ``lyric``.
    """
    contains_other = has_target_string(lyric, pattern=NOT_CHINESE_PATTERN)
    return contains_other
def has_english_alphabet_char(lyric: str) -> bool:
    """Report whether the text contains at least one English letter.

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if ENGLISH_CHARACHTER_PATTERN matches somewhere in
        ``lyric``.
    """
    contains_letters = has_target_string(
        lyric, pattern=ENGLISH_CHARACHTER_PATTERN
    )
    return contains_letters
# Patterns that mark a QRC row as metadata / credits rather than sung lyrics.
# In credit rows every character is followed by its "(start,duration)" stamp,
# which is why the stamps are interleaved in the patterns.
# NOTE(review): the character class "[::]" lists ':' twice; possibly one of
# them was meant to be the full-width colon - confirm against upstream.
_NON_LYRIC_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # ID tags: title, artist, album, file author, offset.
        r"\[ti[::]?",
        r"\[ar[::]?",
        r"\[al[::]?",
        r"\[by[::]?",
        r"\[offset[::]?",
        # Credits (lyricist, composer, performer, arranger, ...).
        r"词[::]?\(\d+,\d+\)[::]?",
        r"曲[::]?\(\d+,\d+\)[::]?",
        r"作\(\d+,\d+\)词[::]?",
        r"作\(\d+,\d+\)曲[::]?",
        r"演\(\d+,\d+\)唱[::]?",
        r"编\(\d+,\d+\)曲[::]?",
        r"吉\(\d+,\d+\)他[::]",
        r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)师[::]?",
        r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)棚[::]?",
        r"Vocal\s+\(\d+,\d+\)edite[::]?",
        r"混\(\d+,\d+\)音\(\d+,\d+\)/\(\d+,\d+\)母\(\d+,\d+\)带[::]?",
        r"混\(\d+,\d+\)音",
        r"和\(\d+,\d+\)声\(\d+,\d+\)编\(\d+,\d+\)写[::]?",
        r"词\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[::]?",
        r"曲\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[::]?",
        r"联\(\d+,\d+\)合\(\d+,\d+\)出\(\d+,\d+\)品[::]?",
        r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品",
        r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品\(\d+,\d+\)监\(\d+,\d+\)制[::]?",
        r"制\(\d+,\d+\)作\(\d+,\d+\)人[::]?",
        # Copyright / platform notices.
        r"不\(\d+,\d+\)得\(\d+,\d+\)翻\(\d+,\d+\)唱",
        r"未\(\d+,\d+\)经\(\d+,\d+\)许\(\d+,\d+\)可",
        r"酷\(\d+,\d+\)狗\(\d+,\d+\)音\(\d+,\d+\)乐",
        # Any row containing a colon at all is treated as non-lyric.
        r"[::]",
    )
)


def check_is_lyric_row(lyric: str) -> bool:
    """Decide whether a row of QRC text is an actual lyric line.

    A row counts as a lyric unless it matches one of the known metadata,
    credit or notice patterns in ``_NON_LYRIC_PATTERNS``.

    Args:
        lyric (str): The row to classify.

    Returns:
        bool: True if no non-lyric pattern matches, otherwise False.
    """
    # any() stops at the first matching pattern instead of running them all.
    return not any(pattern.search(lyric) for pattern in _NON_LYRIC_PATTERNS)
def lyric2clip(lyric: str) -> dict:
    """Convert one QRC lyric line into a Clip dictionary.

    The first "start,duration" pair found in the line is taken as the line
    header and the last pair as the final word. The clip duration runs from
    the line start to whichever ends first: the line or its last word.
    The original author points to
    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py
    for the Clip definition.

    Args:
        lyric (str): One QRC line, e.g. "[17316,1400]漫(17316,178)步(17494,174)".

    Returns:
        dict: Clip dictionary with keys "time_start", "duration", "text",
        "original_text", "timepoint_type" and "clips" (the per-word clips).
        Times are the source integers divided by 1000, rounded to 3 places.
    """

    def _scaled(raw: str) -> float:
        # Source values are integers; scale by 1/1000 and keep 3 decimals.
        return round(int(raw) / 1000, 3)

    pairs = re.findall(r"(\d+),(\d+)", lyric)
    head_start, head_span = (_scaled(value) for value in pairs[0])
    tail_start, tail_span = (_scaled(value) for value in pairs[-1])
    head_end = head_start + head_span
    tail_end = tail_start + tail_span
    clip_span = min(head_end, tail_end) - head_start

    # Strip the "[start,duration]" header before extracting the words.
    without_header = re.sub(r"\[\d+,\d+\]", "", lyric)
    # Per-word start time, duration and text (originally added by yuuhong).
    word_clips = get_words_with_timestamp(without_header)
    plain_text = re.sub(r"\(\d+,\d+\)", "", without_header)

    return {
        "time_start": head_start,
        "duration": clip_span,
        "text": plain_text,
        "original_text": plain_text,
        "timepoint_type": -1,
        "clips": word_clips,
    }
# by yuuhong
def get_words_with_timestamp(lyric):
    """Split one QRC lyric line into per-word clips.

    Each word is followed by its "(start,duration)" stamp. Example input:
        漫(17316,178)步(17494,174)走(17668,193)在(17861,183) (18044,0)莎(18044,153)

    The text of a word is everything between the previous stamp and its own
    stamp, so words that themselves contain parentheses keep their text.
    Text after the last stamp carries no timing and is not returned.

    Args:
        lyric (str): Lyric line, normally with the "[start,duration]" line
            header already removed.

    Returns:
        list: One dict per stamp with keys "text", "time_start" and
        "duration"; times are the source integers divided by 1000 and
        rounded to 3 places. Empty if the line holds no stamp.
    """
    words_with_timestamp = []
    cursor = 0
    # The closing ")" may be missing only at the very end of the line; a stamp
    # followed by other garbage is ignored rather than crashing int().
    for stamp in re.finditer(r"\((\d+),(\d+)(?:\)|$)", lyric):
        clip = {
            "text": lyric[cursor:stamp.start()],
            "time_start": round(int(stamp.group(1)) / 1000, 3),
            "duration": round(int(stamp.group(2)) / 1000, 3),
        }
        words_with_timestamp.append(clip)
        cursor = stamp.end()
    return words_with_timestamp
def lyric2clips(lyric: str, th: float = 0.75) -> list:
    """Convert one QRC lyric line into one or more clips.

    Lines without English letters are split on spaces, one clip per fragment.
    The line is processed as a single clip instead when it contains English
    letters, contains no space, or any fragment lasts ``th`` or less.

    Args:
        lyric (str): One QRC line, such as
            [173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)
        th (float, optional): Fragment duration at or below which the line is
            not split. Compared against stamp values divided by 1000.
            Defaults to 0.75.

    Returns:
        list: Clip dictionaries. Clips produced by splitting carry
        "time_start", "duration", "text", "original_text" (the text of the
        whole line) and "timepoint_type"; whole-line clips come from
        ``lyric2clip``.
    """
    # Only lines without English letters are split on spaces.
    if has_english_alphabet_char(lyric):
        return [lyric2clip(lyric)]
    splited_lyric = lyric.split(" ")
    if len(splited_lyric) == 1:
        return [lyric2clip(splited_lyric[0])]

    # Drop the "[start,duration]" line header from the first fragment.
    _, splited_lyric[0] = splited_lyric[0].split("]", 1)

    # In QRC every character should be followed directly by its stamp, so a
    # space belongs after a stamp. Some files put the space between the
    # character and its stamp; move the stamp back to the preceding fragment.
    # Wrong: [173247,3275]去(173247,403)吗 (173649,677)配(174326,189)吗 (174516,593)这(175108,279)
    # Wrong: [46122,2082]以(46122,213)身(46335,260)淬(46595,209)炼(46804,268)天(47072,250)地(47322,370)造(47692,341)化 (48033,172)
    # Fixed: [173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)
    for i in range(len(splited_lyric)):
        if splited_lyric[i] == "":
            # The first empty fragment is removed and the repair pass ends.
            del splited_lyric[i]
            break
        if splited_lyric[i][-1] != ")":
            next_lyric_time_start = re.search(
                r"\(\d+,\d+\)", splited_lyric[i + 1]
            ).group(0)
            splited_lyric[i] += next_lyric_time_start
            # Remove exactly the stamp that was found. Treating its text as a
            # regex would also hit other stamps containing the same digits.
            remainder = splited_lyric[i + 1].replace(
                next_lyric_time_start, "", 1
            )
            # Empty "()" pairs are dropped as well, as before.
            splited_lyric[i + 1] = re.sub(r"\(\)", "", remainder)

    lyric_text = re.sub(r"\[\d+,\d+\]", "", lyric)
    lyric_text = re.sub(r"\(\d+,\d+\)", "", lyric_text)

    clips = []
    for sub_lyric in splited_lyric:
        sub_lyric_groups = re.findall(r"\d+,\d+", sub_lyric)
        sub_lyric_1st_word_time_start = round(
            int(sub_lyric_groups[0].split(",")[0]) / 1000, 3
        )
        sub_lyric_last_word_time_start = round(
            int(sub_lyric_groups[-1].split(",")[0]) / 1000, 3
        )
        sub_lyric_last_word_duration = round(
            int(sub_lyric_groups[-1].split(",")[-1]) / 1000, 3
        )
        sub_lyric_last_word_time_end = (
            sub_lyric_last_word_time_start + sub_lyric_last_word_duration
        )
        sub_lyric_duration = (
            sub_lyric_last_word_time_end - sub_lyric_1st_word_time_start
        )
        if sub_lyric_duration <= th:
            # One fragment is too short: give up splitting for this line.
            return [lyric2clip(lyric)]
        sub_lyric_text = re.sub(r"\[\d+,\d+\]", "", sub_lyric)
        sub_lyric_text = re.sub(r"\(\d+,\d+\)", "", sub_lyric_text)
        # "original_text" keeps the whole line rather than the fragment so
        # related clips stay semantically connected.
        clips.append(
            {
                "time_start": sub_lyric_1st_word_time_start,
                "duration": sub_lyric_duration,
                "text": sub_lyric_text,
                "original_text": lyric_text,
                "timepoint_type": -1,
            }
        )
    return clips
def is_songname(lyric: str) -> bool:
    """Report whether the text carries a song-title tag.

    Title rows contain "[ti", e.g. "[ti:霍元甲 (《霍元甲》电影主题曲)]".

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if the title tag pattern matches somewhere in ``lyric``.
    """
    title_tag = r"\[ti[::]?"
    return has_target_string(lyric, title_tag)
def get_songname(lyric: str) -> str:
    """Extract the song title from a title row.

    The input must look like "[ti:霍元甲 (《霍元甲》电影主题曲)]". Anything
    from the first "(" onwards is treated as a subtitle and discarded.

    Args:
        lyric (str): QRC row containing the title tag.

    Returns:
        str: The title, without the 4-character "[ti:" prefix, the closing
        "]" or trailing whitespace.
    """
    title = lyric.split("(")[0][4:]
    # Strip the closing bracket only when it is really there. A blind [:-1]
    # eats the last character of titles such as "[ti:霍元甲(电影版)]".
    if title.endswith("]"):
        title = title[:-1]
    return title.rstrip()
def is_album(lyric: str) -> bool:
    """Report whether the text carries an album tag.

    Album rows contain "[al", e.g. "[al:霍元甲]".

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if the album tag pattern matches somewhere in ``lyric``.
    """
    album_tag = r"\[al[::]?"
    return has_target_string(lyric, album_tag)
def get_album(lyric: str) -> str:
    """Extract the album name from an album row.

    The input must look like "[al:霍元甲]": the first four characters and the
    last character are removed unconditionally.

    Args:
        lyric (str): QRC row containing the album tag.

    Returns:
        str: The row without its first four characters and its last one.
    """
    tag_prefix_length = 4  # width of "[al:"
    return lyric[tag_prefix_length:-1]
def is_singer(lyric: str) -> bool:
    """Report whether the text carries an artist tag.

    Artist rows contain "[ar", e.g. "[ar:周杰伦]".

    Args:
        lyric (str): Text to inspect.

    Returns:
        bool: True if the artist tag pattern matches somewhere in ``lyric``.
    """
    artist_tag = r"\[ar[::]?"
    return has_target_string(lyric, artist_tag)
def get_singer(lyric: str) -> str:
    """Extract the singer name from an artist row.

    The input must look like "[ar:周杰伦]": the first four characters and the
    last character are removed unconditionally.

    Args:
        lyric (str): QRC row containing the artist tag.

    Returns:
        str: The row without its first four characters and its last one.
    """
    tag_prefix_length = 4  # width of "[ar:"
    return lyric[tag_prefix_length:-1]
def lyric2musicinfo(lyric: dict) -> dict:
    """Convert parsed QRC content into a musicinfo dictionary.

    The result has the shape
        {
            "meta_info": {...},
            "sub_meata_info": {},
            "clips": [clip, ...]
        }
    The original author refers to
    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/media_info.py#L19
    for the target structure.

    Args:
        lyric (dict): Parsed QRC document. The lyric text is read from
            lyric["QrcInfos"]["LyricInfo"]["Lyric_1"]["@LyricContent"].

    Returns:
        dict: musicinfo. "meta_info" holds "mediaid" (left as None here),
        "media_name", "singer", "album" (only when an album row exists) and
        "lyric" (one whole-line clip per lyric row). "clips" holds the
        possibly split clips from ``lyric2clips``.
    """
    lyrics = lyric["QrcInfos"]["LyricInfo"]["Lyric_1"]["@LyricContent"]
    musicinfo = {
        "meta_info": {
            "mediaid": None,
            "media_name": None,
            "singer": None,
        },
        # NOTE(review): key is spelled "sub_meata_info"; kept as is because
        # consumers of this dict are not visible from here.
        "sub_meata_info": {},
        "clips": [],
    }
    # Every row starts at a "[", so split there and restore the bracket.
    # The text before the first "[" becomes a bare "[" row that matches
    # nothing below.
    rows = ["[" + part.strip() for part in re.split(r"\[", lyrics)]
    skip_next_row = False
    lyric_clips = []
    for line in rows:
        if is_songname(line):
            musicinfo["meta_info"]["media_name"] = get_songname(line)
            continue
        if is_singer(line):
            musicinfo["meta_info"]["singer"] = get_singer(line)
            continue
        if is_album(line):
            musicinfo["meta_info"]["album"] = get_album(line)
            continue
        # The row right after the offset tag is the title row: drop it.
        if skip_next_row:
            skip_next_row = False
            continue
        if re.search(r"\[offset[::]", line):
            skip_next_row = True
        if check_is_lyric_row(line) and re.match(r"\[\d+,\d+\]", line):
            lyric_clips.append(lyric2clip(line))
            musicinfo["clips"].extend(lyric2clips(line))
    musicinfo["meta_info"]["lyric"] = lyric_clips
    return musicinfo
def lrc_timestr2time(time_str: str) -> float:
    """Convert an LRC time stamp such as "00:00.00" into seconds.

    The stamp has three numeric fields separated by ":" or ".": minutes,
    seconds and a fractional part. The fraction is scaled by its number of
    digits, so "01.5", "01.50" and "01.500" all mean 1.5 seconds.

    Args:
        time_str (str): Time stamp text without the surrounding brackets.

    Returns:
        float: Time in seconds, rounded to 3 decimal places.

    Raises:
        ValueError: If the text does not consist of three numeric fields.
    """
    minutes, seconds, fraction = re.split(r"[:.]", time_str)
    whole_seconds = float(minutes) * 60 + float(seconds)
    # Dividing by a fixed 1000 would read the common two-digit (hundredths)
    # form ".50" as 0.05 s.
    return round(whole_seconds + int(fraction) / 10 ** len(fraction), 3)
def get_lrc_line_time(text: str, time_pattern: str) -> float:
    """Extract the time stamp of an LRC line and convert it to seconds.

    Example line: "[00:00.00]本字幕由天琴实验室独家AI字幕技术生成".

    Args:
        text (str): One LRC line.
        time_pattern (str): Regular expression matching the time stamp text.

    Returns:
        float: Time of the first match, as converted by ``lrc_timestr2time``.

    Raises:
        AttributeError: If ``time_pattern`` does not match ``text``.
    """
    time_str = re.search(time_pattern, text).group(0)
    return lrc_timestr2time(time_str)
def lrc_lyric2clip(lyric: str, time_pattern: str, duration: float) -> dict:
    r"""Convert one LRC line into a Clip dictionary.

    The original author points to
    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py
    for the Clip definition.

    Args:
        lyric (str): LRC line such as
            "[00:00.00]本字幕由天琴实验室独家AI字幕技术生成".
        time_pattern (str): Regular expression for the time stamp text,
            e.g. r"\d+:\d+\.\d+".
        duration (float): Clip duration, stored unchanged.

    Returns:
        dict: Clip with keys "time_start", "duration", "text" and
        "timepoint_type".
    """
    time_start = get_lrc_line_time(lyric, time_pattern=time_pattern)
    text = re.sub(time_pattern, "", lyric)
    # Removing the stamp leaves "[]" at the start of the line; drop those two
    # characters. NOTE(review): assumes a single leading stamp - confirm.
    text = text[2:]
    return {
        "time_start": time_start,
        "duration": duration,
        "text": text,
        "timepoint_type": -1,
    }
def lrc2musicinfo(lyric: str, time_pattern: str = r"\d+:\d+\.\d+") -> dict:
    r"""Convert LRC lyrics into a musicinfo dictionary.

    The original author refers to
    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py
    for the target structure.

    Args:
        lyric (str): Path of an LRC file or the LRC text itself. A sequence
            of already split lines is accepted as well.
        time_pattern (str, optional): Regular expression for the LRC time
            stamp text. Defaults to r"\d+:\d+\.\d+".

    Returns:
        dict: musicinfo with "meta_info", "sub_meata_info" and "clips". Each
        timed line becomes one clip lasting until the next timed line; the
        last timed line gets a duration of 1.
    """
    if isinstance(lyric, str):
        if os.path.isfile(lyric):
            # NOTE(review): the file is read with the platform default
            # encoding - confirm the encoding the LRC files actually use.
            with open(lyric, "r") as f:
                lines = [line.strip() for line in f.readlines()]
        else:
            lines = lyric.split("\n")
        # Pass the pattern on; it used to be dropped here, so a custom
        # pattern was ignored for path and text input.
        return lrc2musicinfo(lines, time_pattern=time_pattern)

    musicinfo = {
        "meta_info": {
            "mediaid": None,
            "media_name": None,
            "singer": None,
        },
        # NOTE(review): key is spelled "sub_meata_info"; kept as is because
        # consumers of this dict are not visible from here.
        "sub_meata_info": {},
        "clips": [],
    }
    # Never filled in this function, so "lyric" in meta_info stays empty.
    lyric_clips = []
    rows = len(lyric)
    for i, line in enumerate(lyric):
        if is_songname(line):
            musicinfo["meta_info"]["media_name"] = line[4:-1]
            continue
        if is_singer(line):
            musicinfo["meta_info"]["singer"] = line[4:-1]
            continue
        if is_album(line):
            musicinfo["meta_info"]["album"] = line[4:-1]
            continue
        if re.search(time_pattern, line) is None:
            continue
        # The clip lasts until the next line that carries a time stamp.
        # Untimed lines in between (for example the empty string left by a
        # trailing newline) are skipped instead of crashing the lookup.
        next_time_start = next(
            (
                get_lrc_line_time(lyric[j], time_pattern=time_pattern)
                for j in range(i + 1, rows)
                if re.search(time_pattern, lyric[j])
            ),
            None,
        )
        if next_time_start is None:
            duration = 1
        else:
            time_start = get_lrc_line_time(line, time_pattern=time_pattern)
            duration = next_time_start - time_start
        clip = lrc_lyric2clip(
            line, duration=duration, time_pattern=time_pattern
        )
        musicinfo["clips"].append(clip)
    musicinfo["meta_info"]["lyric"] = lyric_clips
    return musicinfo
def lyricfile2musicinfo(path: str) -> dict:
    """Convert a lyric file into a musicinfo dictionary.

    QRC ".xml" files and ".lrc" files are supported.
    TODO: support osu.
    The original author refers to
    https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py
    for the target structure.

    Args:
        path (str): Path of the lyric file.

    Returns:
        dict: musicinfo whose meta_info "mediaid" is the file name without
        its extension.

    Raises:
        ValueError: If the file extension is neither "xml" nor "lrc".
    """
    # Split at the last dot only, so names such as "song.v2.lrc" work.
    filename, _, ext = os.path.basename(path).rpartition(".")
    ext = ext.lower()
    if ext == "xml":
        musicinfo = lyric2musicinfo(read_xml2json(path))
    elif ext == "lrc":
        musicinfo = lrc2musicinfo(path)
    else:
        # Fail clearly instead of hitting an unbound local variable below.
        raise ValueError(f"unsupported lyric file type: {path!r}")
    musicinfo["meta_info"]["mediaid"] = filename
    return musicinfo