Montazerh82 committed on
Commit
4a0869b
1 Parent(s): 07b83b9

add pipeline

Browse files
Files changed (3) hide show
  1. app.py +18 -0
  2. normalizer.py +96 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ from normalizer import cleaning
4
+
5
# Fill-mask pipeline, constructed once when the module is loaded and then
# called from ``greet`` for every submitted text.
pipe = pipeline(
    "fill-mask",
    model="HooshvareLab/albert-fa-zwnj-base-v2",
)
6
+
7
+
8
def greet(text):
    """Clean *text* and feed it to the fill-mask pipeline.

    Args:
        text: The string submitted through the UI textbox.

    Returns:
        Whatever ``pipe`` returns for the cleaned text.
    """
    # NOTE(review): ``cleaning`` asks its cleaner to lowercase the input;
    # confirm the model's mask token is still recognized afterwards.
    return pipe(cleaning(text))
12
+
13
+
14
# One input textbox wired to ``greet``; the function's return value is shown
# in the output textbox.  Flagging is disabled.
demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(label='input text'),
    outputs=gr.Textbox(label="Normalized text:"),
    allow_flagging='never',
)

# Start serving as soon as the module is executed.
demo.launch()
normalizer.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hazm
2
+ from cleantext import clean
3
+ import re
4
+
5
+
6
# Non-greedy "<...>" matcher, compiled once.  DOTALL is required so that a
# tag broken across several lines is still matched ('.' skips newlines
# otherwise and such tags would be left in the text).
_HTML_TAG = re.compile(r'<.*?>', re.DOTALL)


def cleanhtml(raw_html):
    """Strip HTML/XML-style tags from *raw_html*.

    Every span that starts at a ``<`` and ends at the nearest following
    ``>`` is deleted, including spans that contain line breaks.  Text outside
    such spans is returned unchanged; entities (``&amp;`` etc.) are not
    decoded.

    Args:
        raw_html: Text that may contain markup.

    Returns:
        The input with all ``<...>`` spans removed.
    """
    return _HTML_TAG.sub('', raw_html)
10
+
11
+
12
# Shared hazm normalizer, instantiated once at import time with the library's
# default options and reused by ``cleaning``.
normalizer = hazm.Normalizer()
13
# Characters removed from the text: emoji / pictographs, the zero-width
# joiner, the emoji variation selector and the bidi isolate controls.
# ZWNJ (U+200C) is intentionally left out of the class so it is preserved.
#
# Do not widen this with a catch-all range such as U+24C2..U+1F251: that span
# also contains every CJK character and the Arabic Presentation Forms
# (U+FB50..U+FEFF), so real letters would be deleted.  Only the individual
# emoji code points from that region are listed below.
wierd_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\u24C2"                 # circled latin capital M
    "\u25A0-\u25FF"          # geometric shapes (emoji squares / triangles)
    "\u303D"                 # part alternation mark
    "\u3297\u3299"           # circled ideographs used as emoji
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"  # every supplementary-plane code point
    "\u200d"                 # zero-width joiner
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"                 # variation selector-16
    "\u2069"                 # pop directional isolate
    "\u2066"                 # left-to-right isolate
    # "\u200c"               # ZWNJ: kept in the text on purpose
    "\u2068"                 # first-strong isolate
    "\u2067"                 # right-to-left isolate
    "]+",
    flags=re.UNICODE,
)
36
+
37
+
38
def cleaning(text):
    """Run the full text-normalization chain on *text*.

    Steps, in order:
      1. trim surrounding whitespace;
      2. ``clean`` with ``extra_spaces`` and ``lowercase`` enabled;
      3. remove ``<...>`` markup via ``cleanhtml``;
      4. apply the shared hazm ``normalizer``;
      5. delete everything matched by ``wierd_pattern``;
      6. delete ``#`` characters and collapse whitespace runs to one space;
      7. replace teh marbuta (ة) with heh (ه).

    Args:
        text: The string to normalize.

    Returns:
        The normalized string, without leading or trailing whitespace.
    """
    text = clean(text.strip(), extra_spaces=True, lowercase=True)
    text = cleanhtml(text)
    text = normalizer.normalize(text)
    text = wierd_pattern.sub('', text)

    # Drop the hashtag marker itself; the tag's text stays.
    text = text.replace("#", "")

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # Strip again because removing tags or emoji at either end of the text
    # can leave a stray space behind.
    text = re.sub(r"\s+", " ", text).strip()

    # Literal one-character substitution, no regex needed.
    return text.replace("ة", "ه")
83
+
84
+
85
+ # with open('./ghavanins.txt', encoding="utf-8") as fp:
86
+ # current_content = fp.read()
87
+
88
+ # current_content = cleaning(current_content)
89
+
90
+
91
+ # with open('./ghavanins2.txt', 'wb') as f:
92
+ # f.write(current_content.encode('utf-8', 'ignore'))
93
+
94
if __name__ == "__main__":
    # Manual smoke test: print the cleaned form of a fixed sample sentence.
    sample = ' نامة شمارة 237441 /121 مورخ 21 /05 /1402 «مركز مشاوران حقوقي و تدوين مقررات بازار سرمايه» متضمن پيشنهاد مديريت نظارت بر كارگزاران و جمع‌بندي كميتة تدوين مقررات دربارة پيش‌نويس «دستورالعمل بهره‌برداري از زيرساخت شناسايي، ثبت، نگهداري و گزارشگري عمليات حسابداري شركت‌هاي كارگزاري» مطرح و به شرح پيوست تصويب شد كه دستورالعمل و ضوابط موضوع تبصره 2 ماده 8 آن، در دو مرحله به شرح زير اجرايي مي‌شود'
    print(cleaning(sample))
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers[sentencepiece, torch]
2
+ hazm
3
+ cleantext