Duplicate from malmarjeh/arabic-text-summarization
Co-authored-by: Mohammad Bani Almarjeh <[email protected]>
- .gitattributes +27 -0
- README.md +14 -0
- __pycache__/preprocess.cpython-310.pyc +0 -0
- __pycache__/summarize.cpython-310.pyc +0 -0
- app.py +103 -0
- preprocess.py +383 -0
- requirements.txt +10 -0
- summarize.py +150 -0
.gitattributes
ADDED
@@ -0,0 +1,27 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,14 @@
---
title: Arabic Text Summarization
emoji: 👀
colorFrom: indigo
colorTo: blue
sdk: streamlit
sdk_version: 1.10.0
app_file: app.py
pinned: false
license: mpl-2.0
duplicated_from: malmarjeh/arabic-text-summarization
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/preprocess.cpython-310.pyc
ADDED
Binary file (11.2 kB).
__pycache__/summarize.cpython-310.pyc
ADDED
Binary file (3.43 kB).
app.py
ADDED
@@ -0,0 +1,103 @@
from urllib.parse import unquote

import arabic_reshaper
import streamlit as st
from bidi.algorithm import get_display

st.set_page_config(
    page_title="Arabic Text Summarization",
    page_icon="📖",
    initial_sidebar_state="expanded"
    # layout="wide"
)

from summarize import get_results

rtl = lambda w: get_display(f"{arabic_reshaper.reshape(w)}")

st.title("تَلْخِيصُ اَلنُّصُوصِ بِاللُّغَةِ اَلْعَرَبِيَّةِ")

st.markdown(
    """
    <style>
    @import url(https://fonts.googleapis.com/earlyaccess/scheherazade.css);
    section.main {
        background-color: beige;
    }
    .stMarkdown h1, .main .element-container.css-o7ulmj.e1tzin5v3 {
        text-align: right;
    }
    .stMarkdown div.css-nlntq9.e16nr0p33 {
        font-weight: bold;
    }
    textarea {
        direction: rtl;
        height: 140px;
    }
    .stTextArea .css-qrbaxs {
        float: right;
        font-size: 23px;
    }
    h1 {
        font-family: 'Scheherazade', serif;
    }

    .main div.css-nlntq9.e16nr0p33 > p {
        direction: rtl;
    }
    .main .stMarkdown div.css-nlntq9 p {
        font-size: 22px;
    }
    .main .stMarkdown div.css-nlntq9 {
        direction: rtl;
    }
    .main p, .main div, .main input, .main label {
        text-align: right;
        direction: rtl;
    }
    .main div>h1>div {
        left: 0;
    }
    .main button {
        font-size: 22px;
    }
    </style>
    """,
    unsafe_allow_html=True,
)

st.sidebar.write("Arabic Text Summarization")
st.sidebar.write("Contact: [email protected]")
st.sidebar.write("\n")

model_selected = st.sidebar.selectbox(
    'Select a model',
    ('T5', 'BERT2BERT', 'GPT-2', 'mBERT2mBERT', 'Transformer'))
st.sidebar.write("\n")
num_beams = st.sidebar.slider(
    "Number of beams", min_value=1, max_value=10, value=3, step=1
)

length_pe_slider_disabled = False
if model_selected == "GPT-2":
    length_pe_slider_disabled = True

st.sidebar.write("\n")
length_penalty = st.sidebar.slider(
    "Length penalty", min_value=0.1, max_value=3.0, value=1.0, step=0.1, disabled=length_pe_slider_disabled
)

txt = """يجري علماء في بريطانيا تجربة لاختبار فعالية عقار إيبوبروفين لمساعدة المصابين بفيروس كورونا. وذكرت هيئة الإذاعة البريطانية "بي بي سي" أن فريق مشترك من أطباء مستشفيات "جاي" و"سانت توماس" و"كينغز كوليدج" في لندن يعتقد أن إيبوبروفين، وهو مضاد للالتهابات ومسكن للألم، يمكن أن يعالج صعوبات التنفس.
ويأمل العلماء أن يساعد هذا العلاج المنخفض التكلفة المرضى في الاستغناء عن أجهزة التنفس الصناعي. وذكرت أنه خلال فترة الاختبار، سيحصل نصف المرضى على إيبوبروفين بالإضافة إلى الرعاية المعتادة، حيث سيتم استخدام تركيبة خاصة من إيبوبروفين بدلا من الأقراص العادية التي قد يشتريها الناس عادة."""
text = st.text_area("أدخل نص ليتم تلخيصه", value=txt)

run_query = st.button("لخّص")
if run_query:
    # https://discuss.streamlit.io/t/showing-a-gif-while-st-spinner-runs/5084
    with st.spinner("جاري التلخيص ..."):
        result = get_results(text, model_selected, num_beams, length_penalty)
    if len(result) > 0:
        st.write(result)
    else:
        st.write("")
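
The only coupling between this UI and the models is the call to summarize.get_results at the bottom of app.py. As a minimal sketch (not part of the Space itself), the same entry point could be exercised from a plain script, assuming summarize.py is importable and its checkpoints can be downloaded:

# Hypothetical smoke test for the summarization backend; bypasses the Streamlit UI.
# Importing summarize triggers the (slow) model downloads defined in that module.
from summarize import get_results

sample = "نص عربي قصير للتجربة."  # any short Arabic input
summary = get_results(sample, "BERT2BERT", num_beams=3, length_penalty=1.0)
print(summary)
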
preprocess.py
ADDED
@@ -0,0 +1,383 @@
import html
import logging
import re

import pyarabic.araby as araby

ACCEPTED_MODELS = [
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]

SEGMENTED_MODELS = [
    "bert-base-arabert",
    "bert-base-arabertv2",
    "bert-large-arabertv2",
]


class ArabertPreprocessor:
    """
    A Preprocessor class that cleans and preprocesses text for all models in the AraBERT repo.
    It can also unprocess the text output of the generated text.

    Args:

        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj:`bool`): don't remove emojis while preprocessing. Defaults to False.

        remove_html_markup(:obj:`bool`): Whether to remove HTML artifacts; should be set to False when preprocessing TyDi QA. Defaults to True.

        replace_urls_emails_mentions(:obj:`bool`): Whether to replace URLs, emails and mentions with special tokens. Defaults to True.

        strip_tashkeel(:obj:`bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).

        strip_tatweel(:obj:`bool`): remove tatweel '\\u0640'.

        insert_white_spaces(:obj:`bool`): insert whitespace before and after all non-Arabic digits, English digits, Arabic and English alphabet characters and the 2 brackets, then insert whitespace between words and numbers or numbers and words.

        remove_elongation(:obj:`bool`): replace repetitions of more than 2 non-digit characters with 2 of this character.


    Returns:

        ArabertPreprocessor: the preprocessor class.

    Example:

        from preprocess import ArabertPreprocessor

        arabert_prep = ArabertPreprocessor("aubmindlab/bert-base-arabertv2")

        arabert_prep.preprocess("SOME ARABIC TEXT")
    """

    def __init__(
        self,
        model_name,
        keep_emojis=False,
        remove_html_markup=True,
        replace_urls_emails_mentions=True,
        strip_tashkeel=True,
        strip_tatweel=True,
        insert_white_spaces=True,
        remove_elongation=True,
    ):
        """
        model_name (:obj:`str`): model name from the HuggingFace Models page without the aubmindlab tag. Defaults to "bert-base-arabertv02". Current accepted models are:

            - :obj:`"bert-base-arabertv01"`: No farasa segmentation.
            - :obj:`"bert-base-arabert"`: with farasa segmentation.
            - :obj:`"bert-base-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-base-arabertv2"`: with farasa segmentation.
            - :obj:`"bert-large-arabertv02"`: No farasa segmentation.
            - :obj:`"bert-large-arabertv2"`: with farasa segmentation.
            - :obj:`"araelectra-base"`: No farasa segmentation.
            - :obj:`"araelectra-base-discriminator"`: No farasa segmentation.
            - :obj:`"araelectra-base-generator"`: No farasa segmentation.
            - :obj:`"aragpt2-base"`: No farasa segmentation.
            - :obj:`"aragpt2-medium"`: No farasa segmentation.
            - :obj:`"aragpt2-large"`: No farasa segmentation.
            - :obj:`"aragpt2-mega"`: No farasa segmentation.

        keep_emojis(:obj:`bool`): don't remove emojis while preprocessing. Defaults to False.

        remove_html_markup(:obj:`bool`): Whether to remove HTML artifacts; should be set to False when preprocessing TyDi QA. Defaults to True.

        replace_urls_emails_mentions(:obj:`bool`): Whether to replace URLs, emails and mentions with special tokens. Defaults to True.

        strip_tashkeel(:obj:`bool`): remove diacritics (FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA).

        strip_tatweel(:obj:`bool`): remove tatweel '\\u0640'.

        insert_white_spaces(:obj:`bool`): insert whitespace before and after all non-Arabic digits, English digits, Arabic and English alphabet characters and the 2 brackets, then insert whitespace between words and numbers or numbers and words.

        remove_elongation(:obj:`bool`): replace repetitions of more than 2 non-digit characters with 2 of this character.
        """
        model_name = model_name.replace("aubmindlab/", "")

        if model_name not in ACCEPTED_MODELS:
            logging.warning(
                "Model provided is not in the accepted model list. Assuming you don't want Farasa Segmentation"
            )
            self.model_name = "bert-base-arabertv02"
        else:
            self.model_name = model_name

        self.keep_emojis = keep_emojis

        self.remove_html_markup = remove_html_markup
        self.replace_urls_emails_mentions = replace_urls_emails_mentions
        self.strip_tashkeel = strip_tashkeel
        self.strip_tatweel = strip_tatweel
        self.insert_white_spaces = insert_white_spaces
        self.remove_elongation = remove_elongation

    def preprocess(self, text):
        """
        Preprocess takes an input text line and applies the same preprocessing used in AraBERT
        pretraining.

        Args:

            text (:obj:`str`): input text string

        Returns:

            string: A preprocessed string depending on which model was selected
        """

        text = str(text)
        text = html.unescape(text)
        if self.strip_tashkeel:
            text = araby.strip_tashkeel(text)
        if self.strip_tatweel:
            text = araby.strip_tatweel(text)

        if self.replace_urls_emails_mentions:
            # replace all possible URLs
            for reg in url_regexes:
                text = re.sub(reg, " [رابط] ", text)
            # replace emails with [بريد]
            for reg in email_regexes:
                text = re.sub(reg, " [بريد] ", text)
            # replace mentions with [مستخدم]
            text = re.sub(user_mention_regex, " [مستخدم] ", text)

        if self.remove_html_markup:
            # remove html line breaks
            text = re.sub("<br />", " ", text)
            # remove html markup
            text = re.sub("</?[^>]+>", " ", text)

        # remove repeated characters >2
        if self.remove_elongation:
            text = self._remove_elongation(text)

        # insert whitespace before and after all non Arabic digits or English digits and alphabet and the 2 brackets
        if self.insert_white_spaces:
            text = re.sub(
                "([^0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669a-zA-Z\[\]])",
                r" \1 ",
                text,
            )

            # insert whitespace between words and numbers or numbers and words
            text = re.sub(
                "(\d+)([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)", r" \1 \2 ", text
            )
            text = re.sub(
                "([\u0621-\u063A\u0641-\u064A\u0660-\u066C]+)(\d+)", r" \1 \2 ", text
            )

        text = re.sub(rejected_chars_regex, " ", text)

        # remove extra spaces
        text = " ".join(text.replace("\uFE0F", "").split())

        # All the other models don't require Farasa Segmentation
        return text

    def unpreprocess(self, text, desegment=True):
        """Re-formats the text to a classic format where punctuations, brackets, parenthesis are not separated by whitespaces.
        The objective is to make the generated text of any model appear natural and not preprocessed.

        Args:
            text (str): input text to be un-preprocessed
            desegment (bool, optional): whether or not to remove farasa pre-segmentation before. Defaults to True.

        Returns:
            str: The unpreprocessed (and possibly Farasa-desegmented) text.
        """

        # removes the spaces around quotation marks ex: i " ate " an apple --> i "ate" an apple
        # https://stackoverflow.com/a/53436792/5381220
        text = re.sub(white_spaced_double_quotation_regex, '"' + r"\1" + '"', text)
        text = re.sub(white_spaced_single_quotation_regex, "'" + r"\1" + "'", text)
        text = re.sub(white_spaced_back_quotation_regex, "\`" + r"\1" + "\`", text)
        text = re.sub(white_spaced_em_dash, "\—" + r"\1" + "\—", text)

        # during generation, sometimes the models don't put a space after the dot, this handles it
        text = text.replace(".", " . ")
        text = " ".join(text.split())

        # handle decimals
        text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
        text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

        text = re.sub(left_and_right_spaced_chars, r"\1", text)
        text = re.sub(left_spaced_chars, r"\1", text)
        text = re.sub(right_spaced_chars, r"\1", text)

        return text

    def _remove_elongation(self, text):
        """
        :param text: the input text to remove elongation
        :return: delongated text
        """
        # loop over the number of times the regex matched the text
        for index_ in range(len(re.findall(regex_tatweel, text))):
            elongation = re.search(regex_tatweel, text)
            if elongation:
                elongation_pattern = elongation.group()
                elongation_replacement = elongation_pattern[0]
                elongation_pattern = re.escape(elongation_pattern)
                text = re.sub(
                    elongation_pattern, elongation_replacement, text, flags=re.MULTILINE
                )
            else:
                break
        return text

    def _remove_redundant_punct(self, text):
        text_ = text
        result = re.search(redundant_punct_pattern, text)
        dif = 0
        while result:
            sub = result.group()
            sub = sorted(set(sub), key=sub.index)
            sub = " " + "".join(list(sub)) + " "
            text = "".join(
                (text[: result.span()[0] + dif], sub, text[result.span()[1] + dif :])
            )
            text_ = "".join(
                (text_[: result.span()[0]], text_[result.span()[1] :])
            ).strip()
            dif = abs(len(text) - len(text_))
            result = re.search(redundant_punct_pattern, text_)
        text = re.sub(r"\s+", " ", text)
        return text.strip()


prefix_list = [
    "ال",
    "و",
    "ف",
    "ب",
    "ك",
    "ل",
    "لل",
    "\u0627\u0644",
    "\u0648",
    "\u0641",
    "\u0628",
    "\u0643",
    "\u0644",
    "\u0644\u0644",
    "س",
]
suffix_list = [
    "ه",
    "ها",
    "ك",
    "ي",
    "هما",
    "كما",
    "نا",
    "كم",
    "هم",
    "هن",
    "كن",
    "ا",
    "ان",
    "ين",
    "ون",
    "وا",
    "ات",
    "ت",
    "ن",
    "ة",
    "\u0647",
    "\u0647\u0627",
    "\u0643",
    "\u064a",
    "\u0647\u0645\u0627",
    "\u0643\u0645\u0627",
    "\u0646\u0627",
    "\u0643\u0645",
    "\u0647\u0645",
    "\u0647\u0646",
    "\u0643\u0646",
    "\u0627",
    "\u0627\u0646",
    "\u064a\u0646",
    "\u0648\u0646",
    "\u0648\u0627",
    "\u0627\u062a",
    "\u062a",
    "\u0646",
    "\u0629",
]
other_tokens = ["[رابط]", "[مستخدم]", "[بريد]"]

# the never_split list is used with the transformers library
prefix_symbols = [x + "+" for x in prefix_list]
suffix_symblos = ["+" + x for x in suffix_list]
never_split_tokens = list(set(prefix_symbols + suffix_symblos + other_tokens))

url_regexes = [
    r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
    r"@(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS",
    r"http[s]?://[a-zA-Z0-9_\-./~\?=%&]+",
    r"www[a-zA-Z0-9_\-?=%&/.~]+",
    r"[a-zA-Z]+\.com",
    r"(?=http)[^\s]+",
    r"(?=www)[^\s]+",
    r"://",
]
user_mention_regex = r"@[\w\d]+"
email_regexes = [r"[\w-]+@([\w-]+\.)+[\w-]+", r"\S+@\S+"]
redundant_punct_pattern = (
    r"([!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ【»؛\s+«–…‘]{2,})"
)
regex_tatweel = r"(\D)\1{2,}"
rejected_chars_regex = r"[^0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘]"

regex_url_step1 = r"(?=http)[^\s]+"
regex_url_step2 = r"(?=www)[^\s]+"
regex_url = r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
regex_mention = r"@[\w\d]+"
regex_email = r"\S+@\S+"

chars_regex = r"0-9\u0621-\u063A\u0640-\u066C\u0671-\u0674a-zA-Z\[\]!\"#\$%\'\(\)\*\+,\.:;\-<=·>?@\[\\\]\^_ـ`{\|}~—٪’،؟`୍“؛”ۚ»؛\s+«–…‘"

white_spaced_double_quotation_regex = r'\"\s+([^"]+)\s+\"'
white_spaced_single_quotation_regex = r"\'\s+([^']+)\s+\'"
white_spaced_back_quotation_regex = r"\`\s+([^`]+)\s+\`"
white_spaced_em_dash = r"\—\s+([^—]+)\s+\—"

left_spaced_chars = r" ([\]!#\$%\),\.:;\?}٪’،؟”؛…»·])"
right_spaced_chars = r"([\[\(\{“«‘*\~]) "
left_and_right_spaced_chars = r" ([\+\-\<\=\>\@\\\^\_\|\–]) "
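
To illustrate the two directions described in the docstrings above (special-token substitution on the way in, whitespace repair on the way out), a minimal sketch using the class exactly as defined in this file might look like the following; the outputs in the comments are indicative only and depend on the regexes above:

# Hypothetical usage sketch for ArabertPreprocessor from this file.
from preprocess import ArabertPreprocessor

prep = ArabertPreprocessor(model_name="bert-base-arabertv02")

raw = "زوروا موقعنا https://example.com أو راسلوا @user"
clean = prep.preprocess(raw)  # URLs become [رابط], mentions become [مستخدم], punctuation gets spaced out

restored = prep.unpreprocess('قال : " مرحبا " .')  # roughly re-attaches punctuation and quotes: قال: "مرحبا".
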
requirements.txt
ADDED
@@ -0,0 +1,10 @@
altair
pandas
streamlit
transformers[sentencepiece]
tokenizers
arabic-reshaper==2.1.3
python-bidi==0.4.2
PyArabic
torch
codetiming==1.3.0
summarize.py
ADDED
@@ -0,0 +1,150 @@
import logging
import os
import re
from functools import lru_cache
from urllib.parse import unquote

import streamlit as st
from codetiming import Timer
from transformers import pipeline
from preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers import GPT2TokenizerFast, BertTokenizer
import tokenizers

logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

logger.info("Loading models...")
reader_time = Timer("loading", text="Time: {:.2f}", logger=logging.info)
reader_time.start()
#####

@st.cache(ttl=24*3600, hash_funcs={AutoModelForSeq2SeqLM: lambda _: None})
def load_seq2seqLM_model(model_path):  # This function is not used
    return AutoModelForSeq2SeqLM.from_pretrained(model_path)

@st.cache(ttl=24*3600, hash_funcs={AutoModelForCausalLM: lambda _: None})
def load_casualLM_model(model_path):
    return AutoModelForCausalLM.from_pretrained(model_path)

@st.cache(ttl=24*3600, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def load_autotokenizer_model(tokenizer_path):
    return AutoTokenizer.from_pretrained(tokenizer_path)

@st.cache(ttl=24*3600, hash_funcs={BertTokenizer: lambda _: None})
def load_berttokenizer_model(tokenizer_path):
    return BertTokenizer.from_pretrained(tokenizer_path)

@st.cache(ttl=24*3600, hash_funcs={GPT2TokenizerFast: lambda _: None})
def load_gpt2tokenizer_model(tokenizer_path):
    return GPT2TokenizerFast.from_pretrained(tokenizer_path)

@st.cache(ttl=24*3600, allow_output_mutation=True, hash_funcs={pipeline: lambda _: None, tokenizers.Tokenizer: lambda _: None})
def load_generation_pipeline(model_path):
    if model_path == "malmarjeh/mbert2mbert-arabic-text-summarization":
        tokenizer = load_berttokenizer_model(model_path)
    else:
        tokenizer = load_autotokenizer_model(model_path)
    # model = load_seq2seqLM_model(model_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    return pipeline("text2text-generation", model=model, tokenizer=tokenizer)

@st.cache(ttl=24*3600, hash_funcs={ArabertPreprocessor: lambda _: None})
def load_preprocessor():
    return ArabertPreprocessor(model_name="")

tokenizer = load_autotokenizer_model("malmarjeh/bert2bert")
generation_pipeline = load_generation_pipeline("malmarjeh/bert2bert")
logger.info("BERT2BERT is loaded")

tokenizer_mbert = load_berttokenizer_model("malmarjeh/mbert2mbert-arabic-text-summarization")
generation_pipeline_mbert = load_generation_pipeline("malmarjeh/mbert2mbert-arabic-text-summarization")
logger.info("mBERT2mBERT is loaded")

tokenizer_t5 = load_autotokenizer_model("malmarjeh/t5-arabic-text-summarization")
generation_pipeline_t5 = load_generation_pipeline("malmarjeh/t5-arabic-text-summarization")
logger.info("T5 is loaded")

tokenizer_transformer = load_autotokenizer_model("malmarjeh/transformer")
generation_pipeline_transformer = load_generation_pipeline("malmarjeh/transformer")
logger.info("Transformer is loaded")

tokenizer_gpt2 = load_gpt2tokenizer_model("aubmindlab/aragpt2-base")
model_gpt2 = load_casualLM_model("malmarjeh/gpt2")
logger.info("GPT-2 is loaded")

reader_time.stop()

preprocessor = load_preprocessor()

logger.info("Finished loading the models...")
logger.info(f"Time spent loading: {reader_time.last}")

@lru_cache(maxsize=200)
def get_results(text, model_selected, num_beams, length_penalty):
    logger.info("\n=================================================================")
    logger.info(f"Text: {text}")
    logger.info(f"model_selected: {model_selected}")
    logger.info(f"length_penalty: {length_penalty}")
    reader_time = Timer("summarize", text="Time: {:.2f}", logger=logging.info)
    reader_time.start()
    if model_selected == 'GPT-2':
        number_of_tokens_limit = 80
    else:
        number_of_tokens_limit = 150
    text = preprocessor.preprocess(text)
    logger.info(f"input length: {len(text.split())}")
    text = ' '.join(text.split()[:number_of_tokens_limit])

    if model_selected == 'Transformer':
        result = generation_pipeline_transformer(text,
            pad_token_id=tokenizer_transformer.eos_token_id,
            num_beams=num_beams,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=length_penalty,
            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info('Transformer')
    elif model_selected == 'GPT-2':
        text_processed = '\n النص: ' + text + ' \n الملخص: \n '
        tokenizer_gpt2.add_special_tokens({'pad_token': '<pad>'})
        text_tokens = tokenizer_gpt2.batch_encode_plus([text_processed], return_tensors='pt', padding='max_length', max_length=100)
        output_ = model_gpt2.generate(input_ids=text_tokens['input_ids'], repetition_penalty=3.0, num_beams=num_beams, max_length=140, pad_token_id=2, eos_token_id=0, bos_token_id=10611)
        result = tokenizer_gpt2.decode(output_[0][100:], skip_special_tokens=True).strip()
        logger.info('GPT-2')
    elif model_selected == 'mBERT2mBERT':
        result = generation_pipeline_mbert(text,
            pad_token_id=tokenizer_mbert.eos_token_id,
            num_beams=num_beams,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=length_penalty,
            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info('mBERT')
    elif model_selected == 'T5':
        result = generation_pipeline_t5(text,
            pad_token_id=tokenizer_t5.eos_token_id,
            num_beams=num_beams,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=length_penalty,
            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info('t5')
    elif model_selected == 'BERT2BERT':
        result = generation_pipeline(text,
            pad_token_id=tokenizer.eos_token_id,
            num_beams=num_beams,
            repetition_penalty=3.0,
            max_length=200,
            length_penalty=length_penalty,
            no_repeat_ngram_size=3)[0]['generated_text']
        logger.info('bert2bert')
    else:
        result = "الرجاء اختيار نموذج"

    reader_time.stop()
    logger.info(f"Time spent summarizing: {reader_time.last}")

    return result


if __name__ == "__main__":
    results_dict = ""
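
The caching pattern used throughout summarize.py (the legacy st.cache decorator with hash_funcs mapping unhashable model and tokenizer types to a constant) is what keeps the heavyweight checkpoints from being reloaded on every Streamlit rerun. A minimal, self-contained sketch of the same idea, using a placeholder checkpoint name rather than one of this Space's models, could be:

# Sketch of the st.cache + hash_funcs pattern used above (older Streamlit cache API).
# "my-org/my-model" is a hypothetical checkpoint, not part of this Space.
import streamlit as st
from transformers import AutoModelForSeq2SeqLM

@st.cache(ttl=24 * 3600, allow_output_mutation=True,
          hash_funcs={AutoModelForSeq2SeqLM: lambda _: None})
def load_model(path: str):
    # hash_funcs tells Streamlit to treat every model object as identical when hashing,
    # so the cache key is effectively just the string argument and the large model
    # object itself never has to be hashed.
    return AutoModelForSeq2SeqLM.from_pretrained(path)

model = load_model("my-org/my-model")
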