File size: 2,767 Bytes
5422b18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re

def is_japanese(string):
    """Return True if *string* contains at least one kana character.

    The check covers U+3040-U+30FF (Hiragana and Katakana) and
    U+31F0-U+31FF (Katakana Phonetic Extensions), i.e. the same kana
    ranges that is_mixed_language matches. Both range ends are inclusive,
    so U+30FF is recognised as well. Ideographs in U+4E00-U+9FFF are not
    tested here.

    Args:
        string: Text to inspect; may be empty.

    Returns:
        True when any character falls in one of the kana ranges,
        otherwise False.
    """
    return any(
        '\u3040' <= ch <= '\u30ff' or '\u31f0' <= ch <= '\u31ff'
        for ch in string
    )

def is_chinese(string):
    """Return True if any character of *string* lies in U+4E00-U+9FFF.

    Args:
        string: Text to inspect; may be empty.

    Returns:
        True when at least one character is inside the range (both ends
        inclusive), otherwise False.
    """
    return any('\u4e00' <= char <= '\u9fff' for char in string)

def replace_quotes(text):
    """Replace CJK quote marks, corner brackets and parentheses with '"'.

    Characters replaced: curly double/single quotes (U+201C, U+201D,
    U+2018, U+2019), corner brackets (U+300E, U+300F, U+300C, U+300D),
    ASCII parentheses, and full-width parentheses (U+FF08, U+FF09).

    Args:
        text: Input string.

    Returns:
        A copy of *text* in which every listed character is replaced by
        an ASCII double quote.
    """
    # Written with \u escapes so the character class stays intact even if
    # the file passes through a tool that width-normalises punctuation.
    quote_like = (
        r'[\u201c\u201d\u2018\u2019'    # curly double / single quotes
        r'\u300e\u300f\u300c\u300d'     # CJK corner brackets
        r'()\uff08\uff09]'              # ASCII and full-width parentheses
    )
    return re.sub(quote_like, '"', text)

def extrac(text):
    """Split *text* into cleaned sentence fragments.

    Processing steps:
      1. Quote-like marks are normalised to '"' (replace_quotes) and
         anything that looks like an HTML tag is removed.
      2. The text is cut after every newline or sentence terminator
         (ideographic full stop, '.', '!', '?', and the full-width
         forms of '!' and '?'); the terminator stays attached to the
         sentence it ends. Text after the last terminator is kept as a
         final sentence.
      3. A sentence longer than 20 characters, or one for which
         is_mixed_language returns True, is cut further with
         split_mixed_language.
      4. '"', '<' and '>' are deleted from each fragment, surrounding
         whitespace is stripped, and fragments that end up empty are
         dropped.

    Args:
        text: Raw input string; may be empty.

    Returns:
        List of non-empty strings in their original order.
    """
    max_plain_length = 20  # longer sentences always go through the extra split

    text = replace_quotes(text)
    text = re.sub(r"<[^>]*>", "", text)  # drop HTML-like tags

    # With a single capturing group, re.split returns
    # [body, mark, body, mark, ..., tail]: even indexes are text, odd
    # indexes are the terminators, and the list length is always odd.
    pieces = re.split(r'([\n。!?.\uff01\uff1f])', text)
    sentences = [body + mark for body, mark in zip(pieces[0::2], pieces[1::2])]
    # The tail (text after the last terminator) is handled exactly like
    # every other sentence instead of being appended unprocessed.
    sentences.append(pieces[-1])

    fragments = []
    for sentence in sentences:
        if len(sentence) > max_plain_length or is_mixed_language(sentence):
            fragments.extend(split_mixed_language(sentence))
        else:
            fragments.append(sentence)

    # Clean first and filter afterwards: a fragment consisting only of
    # quotes, angle brackets or whitespace becomes empty after cleaning
    # and must not reach the caller.
    unwanted = str.maketrans('', '', '"<>')
    cleaned = (fragment.translate(unwanted).strip() for fragment in fragments)
    return [fragment for fragment in cleaned if fragment]

def is_mixed_language(sentence):
    """Report whether *sentence* mixes at least two character groups.

    The three groups checked are: characters in U+4E00-U+9FFF,
    characters in U+3040-U+30FF or U+31F0-U+31FF, and ASCII letters.

    Args:
        sentence: Text to inspect; may be empty.

    Returns:
        True when characters from two or more groups occur in
        *sentence*, otherwise False.
    """
    group_patterns = (
        r'[\u4e00-\u9fff]',
        r'[\u3040-\u30ff\u31f0-\u31ff]',
        r'[a-zA-Z]',
    )
    groups_found = 0
    for pattern in group_patterns:
        if re.search(pattern, sentence):
            groups_found += 1
    return groups_found >= 2

def split_mixed_language(sentence):
    """Cut *sentence* at quote boundaries and return the stripped parts.

    A cut is made at two kinds of zero-width positions:
      * between a sentence terminator (ideographic full stop, '.', '!',
        '?', or full-width '!' / '?') and a following '"';
      * between a '"' and a following character in U+4E00-U+9FFF,
        U+3040-U+30FF, U+31F0-U+31FF, or an ASCII letter.

    Splitting on zero-width matches relies on the re.split behaviour of
    Python 3.7 and later.

    Args:
        sentence: Text to split; may be empty.

    Returns:
        List of whitespace-stripped, non-empty parts in original order.
    """
    # Full-width '!' and '?' are written as escapes so the terminator set
    # survives tools that width-normalise punctuation.
    boundary = (
        r'(?<=[。!?.\uff01\uff1f])(?=")'
        r'|(?<=")(?=[\u4e00-\u9fff\u3040-\u30ff\u31f0-\u31ff]|[a-zA-Z])'
    )
    parts = (part.strip() for part in re.split(boundary, sentence))
    return [part for part in parts if part]

if __name__ == "__main__":
    # Manual smoke test: run the splitter on a sample that mixes Chinese,
    # Japanese and English, then print the resulting fragments.
    sample_text = "你好,这是一段用来测试自动标注的文本。こんにちは,これは「自動ラベリングのテスト用テキスト」です.Hello, this is a piece of text to test autotagging.你好!今天我们要介绍VITS项目,其重点是使用了“GAN Duration predictor”和“transformer flow”,并且接入了Bert模型来提升韵律。Bert embedding会在稍后介绍。"
    fragments = extrac(sample_text)
    print(fragments)