Spaces:
Sleeping
Sleeping
File size: 4,792 Bytes
09cf842 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 |
import json
import os
import re
from os.path import join as join_path
from text_split import split_by_sentences
class RUAccent:
    """Dictionary-based stress (accent) placement for Russian text.

    Two JSON dictionaries are read from ``<workdir>/dictionaries``:

    * ``accents.json``   -- maps a lowercase word to its stressed form.
    * ``omographs.json`` -- maps a lowercase word to its possible stressed
      variants (homographs); such words are reported but left untouched.

    NOTE(review): the tokenising regex treats ``+`` as part of a word, so the
    stress marker is presumably ``+`` -- confirm against the dictionaries.
    """

    # Lowercase Russian vowels; used to count the vowels in a word.
    vowels = "аеёиоуыэюя"

    # A token is either a word (optionally containing ``+``) or a run of
    # punctuation. The first alternative can match the empty string, so
    # empty matches are filtered out in ``split_by_words``.
    _WORD_RE = re.compile(r"\w*(?:\+\w+)*|[^\w\s]+")

    # Characters that must not be preceded by a space in the final output.
    _PUNCTUATION = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"

    def __init__(self):
        self.omographs = None
        self.accents = None
        # No "yo" dictionary is loaded at the moment; an empty mapping keeps
        # ``_process_yo`` a safe no-op instead of raising AttributeError.
        self.yo_words = {}
        self.workdir = os.getcwd()

    def _load_dictionary(self, filename):
        """Read one JSON dictionary from ``<workdir>/dictionaries``."""
        path = join_path(self.workdir, "dictionaries", filename)
        # Context manager guarantees the handle is closed even if parsing fails.
        with open(path, encoding='utf-8') as file:
            return json.load(file)

    def load(self, custom_accent=None, custom_omographs=None):
        """Load the dictionaries and overlay optional user-supplied entries.

        Args:
            custom_accent: extra ``word -> stressed word`` entries; they
                override entries from ``accents.json``.
            custom_omographs: extra ``word -> variants`` entries; they
                override entries from ``omographs.json``.

        Raises:
            OSError: if a dictionary file cannot be opened.
            json.JSONDecodeError: if a dictionary file is not valid JSON.
        """
        if custom_omographs is None:
            custom_omographs = {}
        if custom_accent is None:
            custom_accent = {}

        self.omographs = self._load_dictionary("omographs.json")
        self.omographs.update(custom_omographs)

        self.accents = self._load_dictionary("accents.json")
        self.accents.update(custom_accent)

    def split_by_words(self, string):
        """Lowercase ``string`` and split it into word and punctuation tokens.

        Whitespace is dropped; a run of punctuation becomes a single token.
        """
        return [token for token in self._WORD_RE.findall(string.lower()) if token]

    def process_all(self, text):
        """Run the whole pipeline over ``text``.

        The text is split into sentences; each sentence is tokenised, its
        homographs are collected, known words are replaced by their stressed
        forms, and the tokens are joined back into a string.

        Args:
            text: the input string.

        Returns:
            A tuple ``(accented_sentence, omographs_list, unknown_list)``:
            accented_sentence -- list of processed sentences (strings);
            omographs_list -- list of ``"word: variants"`` strings, one per
                homograph occurrence;
            unknown_list -- words with more than one vowel that are missing
                from the accent dictionary.
        """
        accented_sentence = []
        found_omographs = []
        unknown_list = []

        for sentence in split_by_sentences(text):
            tokens = self.split_by_words(sentence)

            sentence_omographs = self._process_omographs(tokens)
            found_omographs.extend(sentence_omographs)

            stressed_tokens, unknown_words = self._process_accent(
                tokens, sentence_omographs
            )
            unknown_list.extend(unknown_words)

            joined = " ".join(stressed_tokens)
            accented_sentence.append(self.delete_spaces_before_punc(joined))

        omographs_list = [
            f"{key}: {value}"
            for elem in found_omographs
            for key, value in elem.items()
        ]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        """Return a copy of ``text`` with tokens replaced via ``self.yo_words``.

        Tokens missing from the mapping are kept unchanged.
        """
        return [self.yo_words.get(word, word) for word in text]

    def _process_omographs(self, text):
        """Collect homographs among the tokens.

        Returns:
            A list with one ``{word: variants}`` dict per token that has a
            non-empty entry in ``self.omographs`` (repeats are kept).
        """
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append({word: variants})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        """Replace known words with their stressed forms.

        Args:
            text: list of tokens; it is not modified.
            founded_omographs: list of ``{word: variants}`` dicts as returned
                by ``_process_omographs``; these words are left unstressed
                because the right variant cannot be chosen here.

        Returns:
            ``(tokens, unknown_words)`` -- the new token list and the words
            with more than one vowel that have no accent-dictionary entry.
        """
        # Built once: the original rebuilt this list for every token.
        omograph_words = {
            next(iter(entry)) for entry in founded_omographs if entry
        }

        processed = []
        unknown_words = []
        for word in text:
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # Single-vowel words need no stress mark, so only words with
                # several vowels are reported as unknown.
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                processed.append(word)
            elif word in omograph_words:
                processed.append(word)
            else:
                processed.append(stressed_word)
        return processed, unknown_words

    def delete_spaces_before_punc(self, text):
        """Remove the single space that precedes each punctuation character."""
        for char in self._PUNCTUATION:
            text = text.replace(" " + char, char)
        return text
# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
# accented_sentences, omographs, unknown_words = ru_accent.process_all(text_to_process)
#
# print(accented_sentences)
|