|
import streamlit as st |
|
import awesome_streamlit as ast |
|
from .preprocess import ( |
|
ArabertPreprocessor, |
|
white_spaced_back_quotation_regex, |
|
white_spaced_double_quotation_regex, |
|
white_spaced_em_dash, |
|
white_spaced_single_quotation_regex, |
|
left_and_right_spaced_chars, |
|
left_spaced_chars, |
|
right_spaced_chars, |
|
) |
|
import re |
|
|
|
# Options offered by the sidebar model selector, in display order.
# NOTE: the page pre-selects the entry at index 3, so keep the ordering stable
# when adding or removing models.
MODELS_to_SELECT = [
    # Literal string "None": the page shows the manual filter checkboxes
    # only when this entry is selected.
    "None",
    # AraBERT
    "bert-base-arabertv01",
    "bert-base-arabert",
    "bert-base-arabertv02",
    "bert-base-arabertv2",
    "bert-large-arabertv02",
    "bert-large-arabertv2",
    # AraELECTRA
    "araelectra-base",
    "araelectra-base-discriminator",
    "araelectra-base-generator",
    "araelectra-base-artydiqa",
    # AraGPT2
    "aragpt2-base",
    "aragpt2-medium",
    "aragpt2-large",
    "aragpt2-mega",
]
|
|
|
|
|
def unpreprocess(text: str) -> str:
    """Re-format text so punctuation, brackets and quotes hug their content.

    The objective is to make the generated text of any model appear natural
    and not preprocessed.

    Args:
        text (:obj:`str`): input text to be un-preprocessed. Farasa
            segmentation markers are always removed first via ``desegment``.

    Returns:
        str: The un-preprocessed, Farasa-desegmented text.
    """
    text = desegment(text)

    # Drop the spaces just inside paired delimiters,
    # e.g. ``i " ate " an apple`` -> ``i "ate" an apple``.
    # The templates use the bare delimiter: in an ``re.sub`` replacement a
    # backslash in front of a non-letter is kept literally, so escaping the
    # back-quote / em dash there would leak a stray backslash into the output.
    text = re.sub(white_spaced_double_quotation_regex, r'"\1"', text)
    text = re.sub(white_spaced_single_quotation_regex, r"'\1'", text)
    text = re.sub(white_spaced_back_quotation_regex, r"`\1`", text)
    # Em-dash spans need their own pattern (previously the back-quote pattern
    # was applied a second time here, so em dashes were never handled).
    # NOTE(review): assumes ``white_spaced_em_dash`` captures the inner text
    # as group 1 like the three patterns above - confirm in ``.preprocess``.
    text = re.sub(white_spaced_em_dash, r"—\1—", text)

    # Pad every period with spaces, then collapse all whitespace runs to a
    # single space so the digit patterns below see a canonical "1 . 5" form.
    text = text.replace(".", " . ")
    text = " ".join(text.split())

    # Re-attach separators that sit between two numbers: "1 . 5" -> "1.5"
    # and "1 , 000" -> "1,000".
    text = re.sub(r"(\d+) \. (\d+)", r"\1.\2", text)
    text = re.sub(r"(\d+) \, (\d+)", r"\1,\2", text)

    # Each match of these project-defined patterns is replaced by its first
    # capture group, i.e. the character is kept and the matched spacing
    # around it is dropped.
    text = re.sub(left_and_right_spaced_chars, r"\1", text)
    text = re.sub(left_spaced_chars, r"\1", text)
    text = re.sub(right_spaced_chars, r"\1", text)

    return text
|
|
|
|
|
def desegment(text: str) -> str:
    """Undo Farasa segmentation on a whole sentence.

    Use this function if sentence tokenization was done using
    `from arabert.preprocess_arabert import preprocess` with Farasa enabled.
    AraBERT segmentation using Farasa adds a space after the '+' of a prefix
    and a space before the '+' of a suffix.

    Example:
        >>> desegment('ال+ دراس +ات')
        'الدراسات'

    Args:
        text: Farasa-segmented text.

    Returns:
        The text with every segmented word merged back into a single word.
    """
    # Glue each '+' marker to its neighbour so that every segmented word
    # becomes one space-free token such as "ال+دراس+ات".
    glued = text.replace("+ ", "+").replace(" +", "+")
    # Split on single spaces (not arbitrary whitespace) so the original
    # spacing between words is reproduced by the join.
    return " ".join(map(_desegmentword, glued.split(" ")))
|
|
|
|
|
def _desegmentword(orig_word: str) -> str: |
|
""" |
|
Word segmentor that takes a Farasa Segmented Word and removes the '+' signs |
|
|
|
Example: |
|
>>> _desegmentword("ال+يومي+ة") |
|
اليومية |
|
""" |
|
word = orig_word.replace("ل+ال+", "لل") |
|
if "ال+ال" not in orig_word: |
|
word = word.replace("ل+ال", "لل") |
|
word = word.replace("+", "") |
|
word = word.replace("للل", "لل") |
|
return word |
|
|
|
|
|
def write():
    """Render the "Arabic Text Pre-Processor" Streamlit page.

    The page has two parts:

    * a text box plus a "Run Pre-Processor" button that runs
      ``ArabertPreprocessor`` on the input, configured either from the model
      picked in the sidebar or, when the "None" entry is picked, from the
      individual sidebar checkboxes;
    * a second text box plus a "Run Un-Pre-Processor" button that passes its
      content through ``unpreprocess``.
    """
    st.markdown(
        """
        <h1 style="text-align:left;">Arabic Text Pre-Processor</h1>
        """,
        unsafe_allow_html=True,
    )
    # Page-wide CSS override that right-aligns text elements.
    st.markdown(
        """
        <style>
        p, div, input, label {
        text-align: right;
        }
        </style>
        """,
        unsafe_allow_html=True,
    )
    raw_text = st.text_input(
        "Text to Pre-Process",
        value="ولن نبالغ إذا قلنا: إن 'هاتف' أو 'كمبيوتر المكتب' في زمننا هذا ضروري",
    )

    sidebar = st.sidebar
    sidebar.title("Model Selector")
    chosen_model = sidebar.selectbox(
        """Select None to enable further filters""", options=MODELS_to_SELECT, index=3
    )

    # The manual filters are only offered (and only defined) for "None".
    manual_mode = chosen_model == "None"
    if manual_mode:
        # Widgets are created in display order, which differs from the
        # order in which the values are handed to the pre-processor below.
        keep_emojis = sidebar.checkbox("Keep emojis", False)
        remove_html = sidebar.checkbox("Remove html markup", True)
        strip_tashkeel = sidebar.checkbox("Strip tashkeel", True)
        replace_urls = sidebar.checkbox("Replace urls and emails", True)
        strip_tatweel = sidebar.checkbox("Strip tatweel", True)
        insert_spaces = sidebar.checkbox("Insert white spaces", True)
        drop_repetition = sidebar.checkbox("Remove non-digit repetition", True)
        slash_to_dash = sidebar.checkbox("Replace slash with dash", None)
        map_hindi_digits = sidebar.checkbox("Map hindi numbers to arabic", None)
        farasa_segmentation = sidebar.checkbox("Apply farasa segmentation", None)

    processed = None
    if st.button("Run Pre-Processor"):
        if manual_mode:
            # Passed positionally, in the order the constructor is called
            # with in the original code.
            preprocessor = ArabertPreprocessor(
                chosen_model,
                keep_emojis,
                remove_html,
                replace_urls,
                strip_tashkeel,
                strip_tatweel,
                insert_spaces,
                drop_repetition,
                slash_to_dash,
                map_hindi_digits,
                farasa_segmentation,
            )
        else:
            preprocessor = ArabertPreprocessor(model_name=chosen_model)
        processed = preprocessor._preprocess_v3(raw_text)
        st.write(processed)

    st.write("-----")
    # Pre-fill the second box with the fresh result when there is one,
    # otherwise with a canned segmented sample.
    sample_segmented = "و+ لن نبالغ إذا قل +نا : إن ' هاتف ' أو ' كمبيوتر ال+ مكتب ' في زمن +نا هذا ضروري"
    text_to_restore = st.text_input(
        "Text to Undo the Pre-Processing",
        value=processed or sample_segmented,
    )

    if st.button("Run Un-Pre-Processor"):
        st.write(unpreprocess(text_to_restore))
|
|