# Punctuation treated as the end of a text chunk.
SYMBOL_SPLITS = {
    # CJK / full-width terminators.
    "。",
    "\uff1f",  # full-width question mark (U+FF1F)
    "\uff01",  # full-width exclamation mark (U+FF01)
    # Two-character ellipsis: a membership test on a single character can
    # never match this entry; the one-character "…" below covers that case.
    "……",
    # ASCII terminators and the single-character ellipsis.
    ".",
    "?",
    "!",
    "~",
    "…",
}
def make_text_chunk(original_text, strat_index, max_len=5, max_try=5000):
    """Cut the next chunk out of ``original_text``, ending at a split symbol.

    Scans forward from ``strat_index`` one character at a time until a
    character contained in ``SYMBOL_SPLITS`` is found, and returns the text
    from ``strat_index`` up to and including that character.

    Args:
        original_text: The text to cut a chunk from.
        strat_index: Index in ``original_text`` where the chunk starts.
        max_len: Unused; kept so existing callers remain compatible.
        max_try: Maximum number of characters that may be scanned from
            ``strat_index`` without meeting a split symbol.

    Returns:
        A tuple ``(end_index, chunk)``. ``end_index`` is the index just past
        the split symbol and ``chunk`` is
        ``original_text[strat_index:end_index]``. When the rest of the text
        contains no split symbol (including when ``strat_index`` is at or
        past the end of the text), ``(0, "")`` is returned.

    Raises:
        ValueError: If more than ``max_try`` characters were scanned from
            ``strat_index`` without finding a split symbol while text still
            remains.
    """
    text_len = len(original_text)
    end_index = strat_index

    # The loop condition also covers empty text and a start index at or past
    # the end, which would otherwise raise IndexError on the first lookup.
    while end_index < text_len:
        if original_text[end_index] in SYMBOL_SPLITS:
            # Include the split symbol itself in the chunk.
            end_index += 1
            return end_index, original_text[strat_index:end_index]

        end_index += 1
        # Count tries relative to the start index so that chunks beginning
        # deep inside a long text are not rejected on their first character.
        if end_index < text_len and end_index - strat_index > max_try:
            # Something is wrong: no split symbol within the allowed range.
            raise ValueError("Reach max try")

    # Text too short: no split symbol found before the end.
    return 0, ""