Spaces:

Shakhovak
/

RU_accent_flask

Sleeping

File size: 4,792 Bytes

09cf842

import json
import os
import re
from os.path import join as join_path

from text_split import split_by_sentences


class RUAccent:
    vowels = "аеёиоуыэюя"
    def __init__(self):
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()


    def load(self, custom_accent=None, custom_omographs=None):

        if custom_omographs is None:
            custom_omographs = {}

        if custom_accent is None:
            custom_accent = {}

        self.omographs = json.load(open(join_path(self.workdir, "dictionaries", "omographs.json"), encoding='utf-8'))

        self.omographs.update(custom_omographs)

        self.accents = json.load(open(join_path(self.workdir, "dictionaries", "accents.json"), encoding='utf-8'))

        self.accents.update(custom_accent)

        # self.yo_words = json.load(open("dictionaries/yo_words.json"), encoding='utf-8')

    def split_by_words(self, string):
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [res for res in result if res]

    def process_all(self, text):
        """Ядро всей программы. Тут текст проходит через ряд функций,
        где по итогу получается строка с проставленными ударениями
        Input:
        text: string

        Output:
        accented_sentence: list[string]
        omographs_list: list[string]
        unknown_list: list[string]
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        sentences = split_by_sentences(text)
        outputs = []
        for sentence in sentences:
            text = self.split_by_words(sentence)
            # processed_text = self._process_yo(text)

            # processed_text = self._process_omographs(text)
            founded_omographs = self._process_omographs(text)
            omographs_list.extend(founded_omographs)

            processed_text, unknown_words = self._process_accent(text, founded_omographs)
            unknown_list.extend(unknown_words)

            processed_text = " ".join(processed_text)
            processed_text = self.delete_spaces_before_punc(processed_text)
            # outputs.append(processed_text)

            accented_sentence.append(processed_text)
            # " ".join(outputs)

        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        splitted_text = text

        for i, word in enumerate(splitted_text):
            splitted_text[i] = self.yo_words.get(word, word)
        return splitted_text

    def _process_omographs(self, text):
        splitted_text = text

        founded_omographs = []
        for i, word in enumerate(splitted_text):
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append(
                    {word: variants}
                )


        # for omograph in founded_omographs:
        #     splitted_text[omograph["position"]] = f"<w>{splitted_text[omograph['position']]}</w>"
        #     cls = omograph["variants"][0]  # Just take the first variant from the dictionary
        #     splitted_text[omograph["position"]] = cls
        # return splitted_text
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        splitted_text = text
        unknown_words = []
        for i, word in enumerate(splitted_text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # if len(word) > 4:
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                splitted_text[i] = word

            elif stressed_word != word and word in [list(d.keys())[0] for d in founded_omographs]:
                splitted_text[i] = word

            else:
                splitted_text[i] = stressed_word




            # stressed_word = self.accents.get(word, word)
            # splitted_text[i] = stressed_word

        return splitted_text, unknown_words

    def delete_spaces_before_punc(self, text):
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка. Наверно я не буду ругаться с нига из-за этого сучонка"
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)