Spaces:
Sleeping
Sleeping
Montazerh82
committed on
Commit
•
4a0869b
1
Parent(s):
07b83b9
add pipeline
Browse files- app.py +18 -0
- normalizer.py +96 -0
- requirements.txt +3 -0
app.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from transformers import pipeline
|
3 |
+
from normalizer import cleaning
|
4 |
+
|
5 |
+
# Fill-mask pipeline for the HooshvareLab/albert-fa-zwnj-base-v2 checkpoint.
# Built once at import time and reused by every greet() call.
pipe = pipeline("fill-mask", model="HooshvareLab/albert-fa-zwnj-base-v2")
|
6 |
+
|
7 |
+
|
8 |
+
def greet(text):
    """Normalize ``text`` and return the fill-mask predictions for it.

    Args:
        text: Raw user input from the Gradio textbox. The fill-mask pipeline
            expects it to contain the model's mask token.

    Returns:
        Whatever ``pipe`` returns for the normalized text.
    """
    import re  # local import: this module does not import ``re`` at file level

    text = cleaning(text)

    # cleaning() lower-cases its input, so a mask token such as "[MASK]" comes
    # back as "[mask]" and would no longer be recognised by the pipeline.
    # Restore the tokenizer's exact spelling before running the model.
    mask_token = getattr(getattr(pipe, "tokenizer", None), "mask_token", None)
    if mask_token:
        # A callable replacement avoids backslash interpretation in the token.
        text = re.sub(re.escape(mask_token), lambda _match: mask_token, text,
                      flags=re.IGNORECASE)

    return pipe(text)
|
12 |
+
|
13 |
+
|
14 |
+
# Gradio UI: a single text box feeding greet(), a single text box for its
# result, with the flagging button disabled.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label='input text'),
    outputs=gr.Textbox(label="Normalized text:"),
    allow_flagging='never',
)


# Start serving the interface as soon as this module is executed.
demo.launch()
|
normalizer.py
ADDED
@@ -0,0 +1,96 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import hazm
|
2 |
+
from cleantext import clean
|
3 |
+
import re
|
4 |
+
|
5 |
+
|
6 |
+
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag-like span removed.

    The match is non-greedy, so each ``<`` is paired with the nearest
    following ``>``. Text without such a pair is returned unchanged.

    Args:
        raw_html: String that may contain HTML/XML-style tags.

    Returns:
        The input string with all ``<...>`` spans deleted.
    """
    # DOTALL lets "." cross line breaks, so a tag that is split over several
    # lines is removed as a whole instead of being left in the output.
    return re.sub(r'<.*?>', '', raw_html, flags=re.DOTALL)
|
10 |
+
|
11 |
+
|
12 |
+
# Shared hazm normalizer, created once at import time and reused by cleaning().
normalizer = hazm.Normalizer()
|
13 |
+
# Runs of characters that cleaning() deletes: emoji, pictographs, assorted
# symbols, joiners/variation selectors and bidi isolate controls.
#
# The zero-width non-joiner (\u200c) is not listed individually (it was
# commented out in the original list), so it passes through untouched --
# presumably because the text relies on it; confirm before adding it back.
#
# NOTE(review): several ranges overlap. "\U000024C2-\U0001F251" alone spans
# everything from U+24C2 to U+1F251 (which includes CJK and the Arabic
# presentation-form blocks), and "\U00010000-\U0010ffff" covers every
# supplementary-plane code point. The set is kept exactly as it was; narrow
# it only after confirming that this breadth is not relied upon.
wierd_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    "\u2068"
    "\u2067"
    "]+"
)
|
36 |
+
|
37 |
+
|
38 |
+
def cleaning(text):
    """Normalize a piece of text and return the cleaned string.

    Steps, in order:
      1. trim surrounding whitespace;
      2. run ``clean`` with ``extra_spaces=True`` and ``lowercase=True``;
      3. remove ``<...>`` tags via ``cleanhtml``;
      4. apply the shared hazm ``normalizer``;
      5. delete every run matched by ``wierd_pattern``;
      6. drop ``#`` characters and collapse whitespace runs to one space;
      7. replace "ة" with "ه";
      8. trim again, since the removals above can expose a space at the edges.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 extra_spaces=True,
                 lowercase=True
                 )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    text = normalizer.normalize(text)

    # removing weird patterns
    text = wierd_pattern.sub('', text)

    # removing hashtags (plain literal, so no regex is needed)
    text = text.replace("#", "")

    # collapsing whitespace; raw string so "\s" is not an invalid str escape
    text = re.sub(r"\s+", " ", text)

    # replace some characters
    text = text.replace("ة", "ه")

    # deleting characters at either end can leave a stray space behind
    return text.strip()
|
83 |
+
|
84 |
+
|
85 |
+
# with open('./ghavanins.txt', encoding="utf-8") as fp:
|
86 |
+
# current_content = fp.read()
|
87 |
+
|
88 |
+
# current_content = cleaning(current_content)
|
89 |
+
|
90 |
+
|
91 |
+
# with open('./ghavanins2.txt', 'wb') as f:
|
92 |
+
# f.write(current_content.encode('utf-8', 'ignore'))
|
93 |
+
|
94 |
+
if __name__ == "__main__":
    # Manual smoke test: normalize one sample paragraph and print the result.
    sample = ' نامة شمارة 237441 /121 مورخ 21 /05 /1402 «مركز مشاوران حقوقي و تدوين مقررات بازار سرمايه» متضمن پيشنهاد مديريت نظارت بر كارگزاران و جمعبندي كميتة تدوين مقررات دربارة پيشنويس «دستورالعمل بهرهبرداري از زيرساخت شناسايي، ثبت، نگهداري و گزارشگري عمليات حسابداري شركتهاي كارگزاري» مطرح و به شرح پيوست تصويب شد كه دستورالعمل و ضوابط موضوع تبصره 2 ماده 8 آن، در دو مرحله به شرح زير اجرايي ميشود'
    print(cleaning(sample))
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
transformers[sentencepiece, torch]
|
2 |
+
hazm
|
3 |
+
cleantext
|