Spaces:

kevinwang676
/

MuseV-test

No application file

File size: 17,898 Bytes

6755a2d

from genericpath import isfile
import re
import os

from ...text.utils.read_text import read_xml2json


# 一个正则表达式非常好用的网站
# https://regex101.com/r/cW8jA6/2


CHINESE_PATTERN = r"[\u4e00-\u9fff]+"
NOT_CHINESE_PATTERN = r"[^\u4e00-\u9fa5]"
ENGLISH_CHARACHTER_PATTERN = r"[a-zA-Z]+"
WORD_PATTERN = r"\w+"  # equal to [a-zA-Z0-9_].
NOT_WORD_PATTERN = r"\W+"


def has_target_string(lyric: str, pattern: str) -> bool:
    """本句歌词是否有目标字符串

    Args:
        lyric (str):
        pattern (str): 目标字符串的正则表达式式patteren

    Returns:
        bool: 有没有目标字符串
    """
    matched = re.findall(pattern, lyric)
    flag = len(matched) > 0
    return flag


def has_chinese_char(lyric: str) -> bool:
    """是否有中文字符

    Args:
        lyric (str):

    Returns:
        bool: 是否有中文字符
    """
    return has_target_string(lyric, CHINESE_PATTERN)


def has_non_chinese_char(lyric: str) -> bool:
    """是否有非中文字符，参考https://git.woa.com/innovative_tech/CopyrightGroup/LyricTools/blob/master/lyric_tools/dataProcess.py#L53

    Args:
        lyric (str):

    Returns:
        bool: 是否有中文字符
    """
    return has_target_string(lyric, NOT_CHINESE_PATTERN)


def has_english_alphabet_char(lyric: str) -> bool:
    """是否有英文字母表字符

    Args:
        lyric (str):

    Returns:
        bool:
    """
    return has_target_string(lyric, ENGLISH_CHARACHTER_PATTERN)


def check_is_lyric_row(lyric: str) -> bool:
    """该字符串是否是歌词

    Args:
        lyric (str): 待判断的字符串

    Returns:
        bool: 该字符串是否是歌词
    """
    is_not_lyric = [
        re.search(r"\[ti[:：]?", lyric),
        re.search(r"\[ar[:：]?", lyric),
        re.search(r"\[al[:：]?", lyric),
        re.search(r"\[by[:：]?", lyric),
        re.search(r"\[offset[:：]?", lyric),
        re.search(r"词[:：]?\(\d+,\d+\)[:：]?", lyric),
        re.search(r"曲[:：]?\(\d+,\d+\)[:：]?", lyric),
        re.search(r"作\(\d+,\d+\)词[:：]?", lyric),
        re.search(r"作\(\d+,\d+\)曲[:：]?", lyric),
        re.search(r"演\(\d+,\d+\)唱[:：]?", lyric),
        re.search(r"编\(\d+,\d+\)曲[:：]?", lyric),
        re.search(r"吉\(\d+,\d+\)他[:：]", lyric),
        re.search(r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)师[:：]?", lyric),
        re.search(r"人\(\d+,\d+\)声\(\d+,\d+\)录\(\d+,\d+\)音\(\d+,\d+\)棚[:：]?", lyric),
        re.search(r"Vocal\s+\(\d+,\d+\)edite[:：]?", lyric),
        re.search(r"混\(\d+,\d+\)音\(\d+,\d+\)/\(\d+,\d+\)母\(\d+,\d+\)带[:：]?", lyric),
        re.search(r"混\(\d+,\d+\)音", lyric),
        re.search(r"和\(\d+,\d+\)声\(\d+,\d+\)编\(\d+,\d+\)写[:：]?", lyric),
        re.search(
            r"词\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[:：]?", lyric
        ),
        re.search(
            r"曲\(\d+,\d+\)版\(\d+,\d+\)权\(\d+,\d+\)管\(\d+,\d+\)理\(\d+,\d+\)方[:：]?", lyric
        ),
        re.search(r"联\(\d+,\d+\)合\(\d+,\d+\)出\(\d+,\d+\)品[:：]?", lyric),
        re.search(r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品", lyric),
        re.search(
            r"录\(\d+,\d+\)音\(\d+,\d+\)作\(\d+,\d+\)品\(\d+,\d+\)监\(\d+,\d+\)制[:：]?", lyric
        ),
        re.search(r"制\(\d+,\d+\)作\(\d+,\d+\)人[:：]?", lyric),
        re.search(r"制\(\d+,\d+\)作\(\d+,\d+\)人[:：]?", lyric),
        re.search(r"不\(\d+,\d+\)得\(\d+,\d+\)翻\(\d+,\d+\)唱", lyric),
        re.search(r"未\(\d+,\d+\)经\(\d+,\d+\)许\(\d+,\d+\)可", lyric),
        re.search(r"酷\(\d+,\d+\)狗\(\d+,\d+\)音\(\d+,\d+\)乐", lyric),
        re.search(r"[:：]", lyric),
    ]
    is_not_lyric = [x is not None for x in is_not_lyric]
    is_not_lyric = any(is_not_lyric)
    is_lyric = not is_not_lyric
    return is_lyric


def lyric2clip(lyric: str) -> dict:
    """convert a line of lyric into a clip
    Clip定义可以参考 https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py
    Args:
        lyric (str): _description_

    Returns:
        dict: 转化成Clip 字典
    """
    time_str_groups = re.findall(r"\d+,\d+", lyric)
    line_time_start = round(int(time_str_groups[0].split(",")[0]) / 1000, 3)
    line_duration = round(int(time_str_groups[0].split(",")[-1]) / 1000, 3)
    line_end_time = line_time_start + line_duration
    last_word_time_start = round(int(time_str_groups[-1].split(",")[0]) / 1000, 3)
    last_word_duration = round(int(time_str_groups[-1].split(",")[-1]) / 1000, 3)
    last_word_end_time = last_word_time_start + last_word_duration
    actual_duration = min(line_end_time, last_word_end_time) - line_time_start
    lyric = re.sub(r"\[\d+,\d+\]", "", lyric)

    # by yuuhong: 把每个字的起始时间点、结束时间点、具体的字拆分出来
    words_with_timestamp = get_words_with_timestamp(lyric)

    lyric = re.sub(r"\(\d+,\d+\)", "", lyric)
    dct = {
        "time_start": line_time_start,
        "duration": actual_duration,
        "text": lyric,
        "original_text": lyric,
        "timepoint_type": -1,
        "clips": words_with_timestamp,
    }
    return dct


# by yuuhong
# 把一句QRC中的每个字拆分出来
# lyric示例：漫(17316,178)步(17494,174)走(17668,193)在(17861,183) (18044,0)莎(18044,153)玛(18197,159)丽(18356,176)丹(18532,200)
def get_words_with_timestamp(lyric):
    words_with_timestamp = []
    elements = lyric.split(")")
    for element in elements:
        sub_elements = element.split("(")
        if len(sub_elements) != 2:
            continue
        text = sub_elements[0]
        timestamp = sub_elements[1]
        if re.match(r"\d+,\d+", timestamp):
            # 有效时间戳
            time_start_str = timestamp.split(",")[0]
            time_start = round(int(time_start_str) / 1000, 3)
            duration_str = timestamp.split(",")[1]
            duration = round(int(duration_str) / 1000, 3)
            clip = {"text": text, "time_start": time_start, "duration": duration}
            words_with_timestamp.append(clip)
    return words_with_timestamp


def lyric2clips(lyric: str, th: float = 0.75) -> list:
    """将一句歌词转换为至少1个的clip。拆分主要是针对中文空格拆分，如果拆分后片段过短，也会整句处理。
    Args:
        lyric (str): such as [173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)
        th (float, optional): 后面如果拆分后片段过短，也会整句处理. Defaults to 1.0.

    Returns:
        list: 歌词Clip序列
    """
    # 目前只对中文的一句歌词按照空格拆分，如果是英文空格则整句处理
    # 后面如果拆分后片段过短，也会整句处理
    if has_english_alphabet_char(lyric):
        return [lyric2clip(lyric)]
    splited_lyric = lyric.split(" ")
    if len(splited_lyric) == 1:
        return [lyric2clip(splited_lyric[0])]
    line_time_str, sub_lyric = re.split(r"]", splited_lyric[0])
    line_time_groups = re.findall(r"\d+,\d+", line_time_str)
    line_time_start = round(int(line_time_groups[0].split(",")[0]) / 1000, 3)
    line_duration = round(int(line_time_groups[0].split(",")[-1]) / 1000, 3)
    splited_lyric[0] = sub_lyric
    # 歌词xml都是歌词仅跟着时间，如果有空格 空格也应该是在时间后面，但有时候空格却在字后面、在时间前，因此需要修正
    # 错误的：[173247,3275]去(173247,403)吗 (173649,677)配(174326,189)吗 (174516,593)这(175108,279)
    # 错误的：[46122,2082]以(46122,213)身(46335,260)淬(46595,209)炼(46804,268)天(47072,250)地(47322,370)造(47692,341)化 (48033,172)
    # 修正成：[173247,3275]去(173247,403)吗(173649,677) 配(174326,189)吗(174516,593) 这(175108,279)
    for i in range(len(splited_lyric)):
        if splited_lyric[i] == "":
            del splited_lyric[i]
            break
        if splited_lyric[i][-1] != ")":
            next_lyric_time_start = re.search(
                r"\(\d+,\d+\)", splited_lyric[i + 1]
            ).group(0)
            splited_lyric[i] += next_lyric_time_start
            splited_lyric[i + 1] = re.sub(
                next_lyric_time_start, "", splited_lyric[i + 1]
            )
            splited_lyric[i + 1] = re.sub("\(\)", "", splited_lyric[i + 1])
    lyric_text = re.sub(r"\[\d+,\d+\]", "", lyric)
    lyric_text = re.sub(r"\(\d+,\d+\)", "", lyric_text)
    clips = []
    has_short_clip = False
    for sub_lyric in splited_lyric:
        sub_lyric_groups = re.findall(r"\d+,\d+", sub_lyric)
        sub_lyric_1st_word_time_start = round(
            int(sub_lyric_groups[0].split(",")[0]) / 1000, 3
        )
        sub_lyric_last_word_time_start = round(
            int(sub_lyric_groups[-1].split(",")[0]) / 1000, 3
        )
        sub_lyric_last_word_duration = round(
            int(sub_lyric_groups[-1].split(",")[-1]) / 1000, 3
        )
        sub_lyric_last_word_time_end = (
            sub_lyric_last_word_time_start + sub_lyric_last_word_duration
        )
        sub_lyric_duration = (
            sub_lyric_last_word_time_end - sub_lyric_1st_word_time_start
        )
        if sub_lyric_duration <= th:
            has_short_clip = True
            break
        sub_lyric_text = re.sub(r"\[\d+,\d+\]", "", sub_lyric)
        sub_lyric_text = re.sub(r"\(\d+,\d+\)", "", sub_lyric_text)
        # 使用原始lyric，而不是sub_lyric_text 主要是保留相关clip的歌词信息，便于语义连续
        dct = {
            "time_start": sub_lyric_1st_word_time_start,
            "duration": sub_lyric_duration,
            "text": sub_lyric_text,
            "original_text": lyric_text,
            "timepoint_type": -1,
        }
        clips.append(dct)
    if has_short_clip:
        clips = [lyric2clip(lyric)]
    return clips


def is_songname(lyric: str) -> bool:
    """是否是歌名，歌名文本含有ti, 如[ti:霍元甲 (《霍元甲》电影主题曲)]

    Args:
        lyric (str):

    Returns:
        bool:
    """
    return has_target_string(lyric, r"\[ti[:：]?")


def get_songname(lyric: str) -> str:
    """获取文本中的歌名，输入必须类似[ti:霍元甲 (《霍元甲》电影主题曲)]

    Args:
        lyric (str): 含有歌名的QRC文本行

    Returns:
        str: 歌名
    """
    return lyric.split("(")[0][4:-1]


def is_album(lyric: str) -> bool:
    """是否含有专辑名，文本必须类似[al:霍元甲]

    Args:
        lyric (str): _description_

    Returns:
        bool: _description_
    """

    return has_target_string(lyric, r"\[al[:：]?")


def get_album(lyric: str) -> str:
    """提取专辑名，文本必须类似[al:霍元甲]


    Args:
        lyric (str): 含有专辑名的QRC文本行

    Returns:
        str: 专辑名
    """
    return lyric[4:-1]


def is_singer(lyric: str) -> bool:
    """是否有歌手名，目标文本类似 [ar:周杰伦]

    Args:
        lyric (str): _description_

    Returns:
        bool: _description_
    """
    return has_target_string(lyric, r"\[ar[:：]?")


def get_singer(lyric: str) -> str:
    """提取歌手信息，文本必须类似[ar:周杰伦]

    Args:
        lyric (str): 含有歌手名的QRC文本行

    Returns:
        str: 歌手名
    """
    return lyric[4:-1]


def lyric2musicinfo(lyric: str) -> dict:
    """convert lyric content from str into musicinfo, a dict
    参考https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/media_info.py#L19
    {
        "meta_info": {},
        "sub_meta_info": [],
        "clips": [
            clip
        ]
    }

    Args:
        lyric (str): 来自QRC的歌词字符串

    Returns:
        musicinfo: 音乐谱面字典，https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/media_info.py#L19
    """
    lyrics = lyric["QrcInfos"]["LyricInfo"]["Lyric_1"]["@LyricContent"]
    musicinfo = {
        "meta_info": {
            "mediaid": None,
            "media_name": None,
            "singer": None,
        },
        "sub_meata_info": {},
        "clips": [],
    }
    # lyrics = [line.strip() for line in re.split(r"[\t\n\s+]", lyrics)]
    lyrics = ["[" + line.strip() for line in re.split(r"\[", lyrics)]
    next_is_title_row = False
    lyric_clips = []
    for line in lyrics:
        if is_songname(line):
            musicinfo["meta_info"]["media_name"] = get_songname(line)
            continue
        if is_singer(line):
            musicinfo["meta_info"]["singer"] = get_singer(line)
            continue
        if is_album(line):
            musicinfo["meta_info"]["album"] = get_album(line)
            continue
        is_lyric_row = check_is_lyric_row(line)
        if next_is_title_row:
            next_is_title_row = False
            continue
        # remove tille row
        if not next_is_title_row and re.search(r"\[offset[:：]", line):
            next_is_title_row = True
        if is_lyric_row and re.match(r"\[\d+,\d+\]", line):
            lyric_clip = lyric2clip(line)
            lyric_clips.append(lyric_clip)
            clips = lyric2clips(line)
            musicinfo["clips"].extend(clips)
    musicinfo["meta_info"]["lyric"] = lyric_clips
    return musicinfo


def lrc_timestr2time(time_str: str) -> float:
    """提取lrc中的时间戳文本，类似[00:00.00]，转化成秒的浮点数

    Args:
        time_str (str):

    Returns:
        float: 时间浮点数
    """
    m, s, ms = (float(x) for x in re.split(r"[:.]", time_str))
    return round((m * 60 + s + ms / 1000), 3)


def get_lrc_line_time(text: str, time_pattern: str) -> str:
    """提取lrc中的时间字符串, 类似 \"[00:00.00]本字幕由天琴实验室独家AI字幕技术生成\"

    Args:
        text (str): 输入文本
        time_pattern (str): 时间字符串正则表达式

    Returns:
        str: 符合正则表达式的时间信息文本
    """
    time_str = re.search(time_pattern, text).group(0)
    return lrc_timestr2time(time_str)


def lrc_lyric2clip(lyric: str, time_pattern: str, duration: float) -> dict:
    """将一行lrc文本字符串转化为Clip 字典

    Args:
        lyric (str):  类似 \"[00:00.00]本字幕由天琴实验室独家AI字幕技术生成\"
        time_pattern (str): 时间字符串正则表达式，类似 r"\d+:\d+\.\d+"
        duration (float): clip的时长信息，

    Returns:
        dict: 转化后Clip
            Clip定义可以参考 https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/media/clip.py
    """
    time_str = get_lrc_line_time(lyric, time_pattern=time_pattern)
    text = re.sub(time_pattern, "", lyric)
    text = text[2:]
    clip = {
        "time_start": time_str,
        "duration": duration,
        "text": text,
        "timepoint_type": -1,
    }
    return clip


def lrc2musicinfo(lyric: str, time_pattern: str = "\d+:\d+\.\d+") -> dict:
    """将lrc转化为音乐谱面

    Args:
        lyric (str): lrc文本路径
        time_pattern (str, optional): lrc时间戳字符串正则表达式. Defaults to "\d+:\d+\.\d+".

    Returns:
        dict: 生成的音乐谱面字典，定义可参考 https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py
    """
    if isinstance(lyric, str):
        if os.path.isfile(lyric):
            with open(lyric, "r") as f:
                lyric = [line.strip() for line in f.readlines()]
            return lrc2musicinfo(lyric)
        else:
            lyric = lyric.split("\n")
            return lrc2musicinfo(lyric)
    else:
        musicinfo = {
            "meta_info": {
                "mediaid": None,
                "media_name": None,
                "singer": None,
            },
            "sub_meata_info": {},
            "clips": [],
        }
        # lyrics = [line.strip() for line in re.split(r"[\t\n\s+]", lyrics)]
        lyric_clips = []
        rows = len(lyric)
        for i, line in enumerate(lyric):
            if is_songname(line):
                musicinfo["meta_info"]["media_name"] = line[4:-1]
                continue
            if is_singer(line):
                musicinfo["meta_info"]["singer"] = line[4:-1]
                continue
            if is_album(line):
                musicinfo["meta_info"]["album"] = line[4:-1]
                continue
            if len(re.findall(time_pattern, line)) > 0:
                if i < rows - 1:
                    time_start = get_lrc_line_time(line, time_pattern=time_pattern)
                    next_line_time_start = get_lrc_line_time(
                        lyric[i + 1], time_pattern=time_pattern
                    )
                    duration = next_line_time_start - time_start
                else:
                    duration = 1
                clip = lrc_lyric2clip(
                    line, duration=duration, time_pattern=time_pattern
                )
                musicinfo["clips"].append(clip)
        musicinfo["meta_info"]["lyric"] = lyric_clips
        return musicinfo


def lyricfile2musicinfo(path: str) -> dict:
    """将歌词文件转化为音乐谱面，歌词文件可以是QRC的xml文件、也可以是lrc对应的lrc文件
        TODO： 待支持osu

    Args:
        path (str): 歌词文件路径

    Returns:
        dict: 音乐谱面字典，定义可参考 https://git.woa.com/innovative_tech/VideoMashup/blob/master/videomashup/music/music_info.py
    """

    filename, ext = os.path.basename(path).split(".")
    if ext == "xml":
        lyric = read_xml2json(path)
        musicinfo = lyric2musicinfo(lyric)
    elif ext == "lrc":
        musicinfo = lrc2musicinfo(path)
    musicinfo["meta_info"]["mediaid"] = filename
    return musicinfo