shakhovak committed · Commit 09cf842 · 1 Parent(s): 18cd63e
added files

Files changed:
- Dockerfile +11 -0
- requirements.txt +11 -0
- ruaccent.py +142 -0
- text_split.py +134 -0
- web_interface.py +44 -0
Dockerfile
ADDED
@@ -0,0 +1,11 @@
FROM python:3.9-alpine

WORKDIR /code

COPY ./requirements.txt /code/requirements.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

COPY . .

# The app is copied directly into /code (the WORKDIR), so the gunicorn
# module path is web_interface:app, not code.web_interface:app.
CMD ["gunicorn", "-b", "0.0.0.0:7860", "web_interface:app"]
requirements.txt
ADDED
@@ -0,0 +1,11 @@
blinker==1.7.0
click==8.1.7
colorama==0.4.6
Flask==3.0.0
importlib-metadata==7.0.0
itsdangerous==2.1.2
Jinja2==3.1.2
MarkupSafe==2.1.3
Werkzeug==3.0.1
zipp==3.17.0
gunicorn==20.1.0
ruaccent.py
ADDED
@@ -0,0 +1,142 @@
import json
import os
import re
from os.path import join as join_path

from text_split import split_by_sentences


class RUAccent:
    vowels = "аеёиоуыэюя"

    def __init__(self):
        self.omographs = None
        self.accents = None
        self.workdir = os.getcwd()

    def load(self, custom_accent=None, custom_omographs=None):
        if custom_omographs is None:
            custom_omographs = {}

        if custom_accent is None:
            custom_accent = {}

        with open(join_path(self.workdir, "dictionaries", "omographs.json"), encoding="utf-8") as f:
            self.omographs = json.load(f)
        self.omographs.update(custom_omographs)

        with open(join_path(self.workdir, "dictionaries", "accents.json"), encoding="utf-8") as f:
            self.accents = json.load(f)
        self.accents.update(custom_accent)

        # self.yo_words = json.load(open("dictionaries/yo_words.json", encoding="utf-8"))

    def split_by_words(self, string):
        result = re.findall(r"\w*(?:\+\w+)*|[^\w\s]+", string.lower())
        return [res for res in result if res]

    def process_all(self, text):
        """The core of the whole program: the text passes through a series
        of functions and ends up as a string with stress marks placed.

        Input:
            text: string

        Output:
            accented_sentence: list[string]
            omographs_list: list[string]
            unknown_list: list[string]
        """
        accented_sentence = []
        omographs_list = []
        unknown_list = []

        sentences = split_by_sentences(text)
        for sentence in sentences:
            words = self.split_by_words(sentence)
            # words = self._process_yo(words)  # yo restoration is currently disabled

            founded_omographs = self._process_omographs(words)
            omographs_list.extend(founded_omographs)

            processed_text, unknown_words = self._process_accent(words, founded_omographs)
            unknown_list.extend(unknown_words)

            processed_text = " ".join(processed_text)
            processed_text = self.delete_spaces_before_punc(processed_text)

            accented_sentence.append(processed_text)

        omographs_list = [f"{key}: {value}" for elem in omographs_list for key, value in elem.items()]
        return accented_sentence, omographs_list, unknown_list

    def _process_yo(self, text):
        # Requires self.yo_words, which load() does not currently populate.
        splitted_text = text

        for i, word in enumerate(splitted_text):
            splitted_text[i] = self.yo_words.get(word, word)
        return splitted_text

    def _process_omographs(self, text):
        # Collect every word that has several possible stress variants.
        # An earlier approach substituted the first variant in place; now the
        # variants are only reported so the user can disambiguate them.
        founded_omographs = []
        for word in text:
            variants = self.omographs.get(word)
            if variants:
                founded_omographs.append({word: variants})
        return founded_omographs

    def _process_accent(self, text, founded_omographs):
        splitted_text = text
        unknown_words = []
        omograph_words = [list(d.keys())[0] for d in founded_omographs]
        for i, word in enumerate(splitted_text):
            stressed_word = self.accents.get(word, word)
            if stressed_word == word:
                # A word with more than one vowel that the dictionary cannot
                # stress is reported as unknown.
                if sum(word.count(vowel) for vowel in RUAccent.vowels) > 1:
                    unknown_words.append(word)
                splitted_text[i] = word

            elif word in omograph_words:
                # Leave homographs unstressed; they need disambiguation.
                splitted_text[i] = word

            else:
                splitted_text[i] = stressed_word

        return splitted_text, unknown_words

    def delete_spaces_before_punc(self, text):
        punc = "!\"#$%&'()*,./:;<=>?@[\\]^_`{|}~"
        for char in punc:
            text = text.replace(" " + char, char)
        return text


# # Example usage:
# ru_accent = RUAccent()
# ru_accent.load()
#
# text_to_process = "В этом замке совершенно нет ни одного замка."
# processed_text = ru_accent.process_all(text_to_process)
#
# print(processed_text)
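For reference, a minimal runnable version of the commented example above. This sketch assumes dictionaries/omographs.json and dictionaries/accents.json exist under the working directory; the commit itself does not add them.

    from ruaccent import RUAccent

    ru_accent = RUAccent()
    ru_accent.load()

    # "замке"/"замка" is a homograph pair, so it should appear in the
    # omographs list instead of being stressed automatically.
    accented, omographs, unknown = ru_accent.process_all(
        "В этом замке совершенно нет ни одного замка."
    )
    print(accented)   # sentences with "+" stress marks where the dictionary matched
    print(omographs)  # "word: [variants]" entries needing manual disambiguation
    print(unknown)    # multi-vowel words absent from accents.json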
text_split.py
ADDED
@@ -0,0 +1,134 @@
import re
import logging
from typing import Set, Tuple, List

SENTENCE_SPLITTER = re.compile(r'[^\.?!…]+[\.?!…]*["»“]*')

LAST_WORD_PATTERN = re.compile(r'(?:\b|\d)([a-zа-я]+)\.$', re.IGNORECASE)
FIRST_WORD_PATTERN = re.compile(r'^\W*(\w+)')
ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN = re.compile(r'(\d|\W|\b)([a-zA-Z])\.$')
HAS_DOT_INSIDE_PATTERN = re.compile(r'[\w]+\.[\w]+\.$', re.IGNORECASE)
INITIALS_PATTERN = re.compile(r'(\W|\b)([A-ZА-Я]{1})\.$')
ONLY_RUS_CONSONANTS_PATTERN = re.compile(r'^[бвгджзйклмнпрстфхцчшщ]{1,4}$', re.IGNORECASE)
STARTS_WITH_EMPTYNESS_PATTERN = re.compile(r'^\s+')
ENDS_WITH_EMOTION_PATTERN = re.compile(r'[!?…]|\.{2,}\s?[)"«»,“]?$')
STARTS_WITH_LOWER_PATTERN = re.compile(r'^\s*[–-—-("«]?\s*[a-zа-я]')
STARTS_WITH_DIGIT_PATTERN = re.compile(r'^\s*\d')
NUMERATION_PATTERN = re.compile(r'^\W*[IVXMCL\d]+\.$')
PAIRED_SHORTENING_IN_THE_END_PATTERN = re.compile(r'\b(\w+)\. (\w+)\.\W*$')

# Verdicts returned by is_sentence_end.
JOIN = 0
MAYBE = 1
SPLIT = 2

JOINING_SHORTENINGS = {
    'mr', 'mrs', 'ms', 'dr', 'vs', 'англ', 'итал', 'греч', 'евр', 'араб', 'яп', 'слав', 'кит',
    'тел', 'св', 'ул', 'устар', 'им', 'г', 'см', 'д', 'стр', 'корп', 'пл', 'пер', 'сокр', 'рис'
}

SHORTENINGS = {
    'co', 'corp', 'inc', 'авт', 'адм', 'барр', 'внутр', 'га', 'дифф', 'дол', 'долл', 'зав', 'зам', 'искл',
    'коп', 'корп', 'куб', 'лат', 'мин', 'о', 'обл', 'обр', 'прим', 'проц', 'р', 'ред', 'руб', 'рус', 'русск',
    'сан', 'сек', 'тыс', 'эт', 'яз', 'гос', 'мн', 'жен', 'муж', 'накл', 'повел', 'букв', 'шутл', 'ед'
}

PAIRED_SHORTENINGS = {('и', 'о'), ('т', 'е'), ('т', 'п'), ('у', 'е'), ('н', 'э')}


def split_sentences(text: str) -> List[str]:
    return [x.strip() for x in SENTENCE_SPLITTER.findall(text)]


def is_sentence_end(left: str, right: str,
                    shortenings: Set[str],
                    joining_shortenings: Set[str],
                    paired_shortenings: Set[Tuple[str, str]]) -> int:
    if not STARTS_WITH_EMPTYNESS_PATTERN.match(right):
        return JOIN

    if HAS_DOT_INSIDE_PATTERN.search(left):
        return JOIN

    left_last_word = LAST_WORD_PATTERN.search(left)
    lw = ' '
    if left_last_word:
        lw = left_last_word.group(1)

    if lw.lower() in joining_shortenings:
        return JOIN

    if ONLY_RUS_CONSONANTS_PATTERN.search(lw) and lw[-1].islower():
        return MAYBE

    pse = PAIRED_SHORTENING_IN_THE_END_PATTERN.search(left)
    if pse:
        s1, s2 = pse.groups()
        if (s1, s2) in paired_shortenings:
            return MAYBE

    right_first_word = FIRST_WORD_PATTERN.match(right)
    if right_first_word:
        rw = right_first_word.group(1)
        if (lw, rw) in paired_shortenings:
            return MAYBE

    if ENDS_WITH_EMOTION_PATTERN.search(left) and STARTS_WITH_LOWER_PATTERN.match(right):
        return JOIN

    initials = INITIALS_PATTERN.search(left)
    if initials:
        border, _ = initials.groups()
        if (border or ' ') not in "°'":
            return JOIN

    if lw.lower() in shortenings:
        return MAYBE

    last_letter = ENDS_WITH_ONE_LETTER_LAT_AND_DOT_PATTERN.search(left)
    if last_letter:
        border, _ = last_letter.groups()
        if (border or ' ') not in "°'":
            return MAYBE
    if NUMERATION_PATTERN.match(left):
        return JOIN
    return SPLIT


def split_by_sentences(text: str,
                       shortenings: Set[str] = SHORTENINGS,
                       joining_shortenings: Set[str] = JOINING_SHORTENINGS,
                       paired_shortenings: Set[Tuple[str, str]] = PAIRED_SHORTENINGS) -> List[str]:
    sentences = []
    sents = split_sentences(text)
    si = 0
    processed_index = 0
    sent_start = 0
    while si < len(sents):
        s = sents[si]
        span_start = text[processed_index:].index(s) + processed_index
        span_end = span_start + len(s)
        processed_index += len(s)

        si += 1

        send = is_sentence_end(text[sent_start: span_end], text[span_end:],
                               shortenings, joining_shortenings, paired_shortenings)
        if send == JOIN:
            continue

        if send == MAYBE:
            if STARTS_WITH_LOWER_PATTERN.match(text[span_end:]):
                continue
            if STARTS_WITH_DIGIT_PATTERN.match(text[span_end:]):
                continue

        if not text[sent_start: span_end].strip():
            logging.warning("Empty sentence span in: %s", text)
        sentences.append(text[sent_start: span_end].strip())
        sent_start = span_end
        processed_index = span_end

    if sent_start != len(text) and text[sent_start:].strip():
        sentences.append(text[sent_start:].strip())
    return sentences
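A quick sanity check of the splitter. This is a sketch; the expected output follows from the sets above ('рис' and 'стр' are in JOINING_SHORTENINGS, so their trailing dots do not end a sentence):

    from text_split import split_by_sentences

    print(split_by_sentences("Смотри рис. 5 на стр. 12. Это важно! Правда."))
    # Expected: ['Смотри рис. 5 на стр. 12.', 'Это важно!', 'Правда.']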
web_interface.py
ADDED
@@ -0,0 +1,44 @@
from flask import Flask, render_template, request, send_file

from ruaccent import RUAccent

app = Flask(__name__)

ru_accent = RUAccent()
ru_accent.load()


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/process', methods=['POST'])
def process():
    input_text = request.form['input_text']
    accented, omographs, unknown = ru_accent.process_all(input_text)

    # Write the three result files: accented text, homographs, unknown words.
    with open('accented_text.txt', 'w', encoding="utf-8") as file:
        file.write(" ".join(accented))

    with open('omographs.txt', 'w', encoding="utf-8") as file:
        file.write("\n".join(omographs))

    with open('unknown.txt', 'w', encoding="utf-8") as file:
        file.write("\n".join(unknown))

    return render_template('result.html')


@app.route('/download/<file_name>')
def download(file_name):
    # NOTE: file_name comes straight from the URL; restrict it to the three
    # known result files before exposing this beyond a demo.
    return send_file(file_name, as_attachment=True, download_name=file_name)


if __name__ == '__main__':
    app.run(debug=True, port=5001)