Spaces:
Build error
Build error
File size: 10,590 Bytes
28f4a08 f564c9e a262acc f564c9e 28f4a08 a262acc 28f4a08 a262acc 28f4a08 a262acc 28f4a08 a262acc 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 81ab8b5 a399ef8 28f4a08 f564c9e e1980f6 f564c9e 81ab8b5 f564c9e 40eb027 81ab8b5 7089c59 f564c9e 81ab8b5 f564c9e 28f4a08 f564c9e 28f4a08 f564c9e 28f4a08 7089c59 28f4a08 f564c9e 40eb027 f564c9e 28f4a08 f564c9e 28f4a08 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
import streamlit as st
import spacy
from streamlit_echarts import st_echarts
from annotated_text import annotated_text
# Entries for the app's top-right menu.
# NOTE(review): these URLs and the "About" text look like the placeholder
# values from Streamlit's documentation example - confirm or replace.
_PAGE_MENU_ITEMS = {
    "Get Help": "https://www.extremelycoolapp.com/help",
    "Report a bug": "https://www.extremelycoolapp.com/bug",
    "About": "# This is a header. This is an *extremely* cool app!",
}

# Page-level settings: browser tab title/icon, wide layout, sidebar open.
st.set_page_config(
    page_title="LeetSpeak-NER",
    page_icon=":mega:",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items=_PAGE_MENU_ITEMS,
)
@st.cache(show_spinner=False, allow_output_mutation=True, suppress_st_warning=True)
def load_models():
    """Load the English and Spanish spaCy pipelines for the chosen trade-off.

    Reads the module-level ``selected_for`` setting and loads the matching
    pair of pipelines from ``./spacy-models``.

    NOTE(review): in this file ``selected_for`` is only assigned by a sidebar
    radio that is currently commented out; re-enable it before calling this.

    Returns:
        dict: ``{"English": <pipeline>, "Spanish": <pipeline>}``.

    Raises:
        ValueError: if ``selected_for`` is neither "Accuracy" nor
            "Efficiency" (previously this surfaced as an unbound-local error).
    """
    model_paths = {
        "Accuracy": {
            "Spanish": "./spacy-models/output_full_ES_roberta-base-bne/model-best",
            "English": "./spacy-models/output_full_EN_roberta_base/model-best",
        },
        "Efficiency": {
            "Spanish": "./spacy-models/toy_output_es_blank/model-best",
            "English": "./spacy-models/toy_output_en_blank/model-best/",
        },
    }
    paths = model_paths.get(selected_for)
    if paths is None:
        raise ValueError(
            f"selected_for must be one of {sorted(model_paths)}, got {selected_for!r}"
        )
    # Same load order as before: Spanish first, then English.
    spanish_model = spacy.load(paths["Spanish"])
    english_model = spacy.load(paths["English"])
    return {"English": english_model, "Spanish": spanish_model}
@st.cache(show_spinner=True, allow_output_mutation=True, suppress_st_warning=True)
def load_xx_model():
    """Load and return the ``xx_LeetSpeakNER_mstsb_mpnet`` spaCy pipeline.

    Wrapped in ``st.cache`` with ``show_spinner=True``.
    """
    multilingual_pipeline = spacy.load("xx_LeetSpeakNER_mstsb_mpnet")
    return multilingual_pipeline
# Entity labels handled below: 'INV_CAMO', 'LEETSPEAK', 'MIX', 'PUNCT_CAMO'
def process_text(doc, selected_multi_ner):
    """Turn a processed document into the argument list for ``annotated_text``.

    Args:
        doc: iterable of tokens exposing ``text`` and ``ent_type_``.
        selected_multi_ner: ``"Yes"`` to tag each recognised entity with its
            own label and colour; any other value tags every recognised
            entity with the single label ``"CAMOUFLAGE"``.

    Returns:
        list: one item per token, in order. Tokens whose ``ent_type_`` is one
        of the four handled labels become ``(text, label, colour)`` tuples;
        every other token becomes its text padded with one space on each side.
    """
    # One colour per entity label, used when the breakdown is requested.
    colour_by_label = {
        "INV_CAMO": "#faa",
        "LEETSPEAK": "#fda",
        "MIX": "#afa",
        "PUNCT_CAMO": "#aaaaff",
    }
    # The mode does not change while iterating, so decide it once.
    break_down = selected_multi_ner == "Yes"

    tokens = []
    for token in doc:
        label = token.ent_type_
        if label not in colour_by_label:
            # Plain token (no entity, or a label this app does not highlight).
            tokens.append(" " + token.text + " ")
        elif break_down:
            tokens.append((token.text, label, colour_by_label[label]))
        else:
            tokens.append((token.text, "CAMOUFLAGE", "#ffd5aa"))
    return tokens
# Sidebar controls.
# Only the multilingual pipeline is offered at the moment. The per-language
# selector and the Efficiency/Accuracy toggle (which fed load_models()) are
# kept below, disabled.
# selected_language = st.sidebar.selectbox("Select a language", options=["English", "Spanish"])
# selected_for = st.sidebar.radio('Select for:', ['Efficiency', 'Accuracy'])
# models = load_models()
# selected_model = models[selected_language]
selected_language = st.sidebar.selectbox(
    "Select a language",
    options=["Multilingual"],
)
selected_multi_ner = st.sidebar.radio(
    "Do you want to break down the Entities detected by type of leetspeak?",
    options=["Yes", "No"],
)
selected_model = load_xx_model()
import base64
# File name of the app logo; it is embedded in the page header below.
LOGO_IMAGE = "LeetSpeak-NER-cropped.png"

# CSS for the two header images (classes "logo-img" and "logo-img-2").
_LOGO_CSS = """
<style>
.logo-img {
margin-top: auto;
margin-left: 30%;
width: 30%;
}
.logo-img-2 {
margin-top: 10%;
margin-left: 20%;
width: 35%;
}
</style>
"""
st.markdown(_LOGO_CSS, unsafe_allow_html=True)
def _png_data_uri(image_path):
    """Return the file at ``image_path`` as a base64 ``data:image/png`` URI.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the bytes have been read.
    """
    with open(image_path, "rb") as image_file:
        encoded = base64.b64encode(image_file.read()).decode()
    return f"data:image/png;base64,{encoded}"


# Header row: two equal-width columns, one inlined logo image in each.
col1, col2 = st.columns([2, 2])
with col1:
    st.markdown(
        f"""
<img class="logo-img" src="{_png_data_uri(LOGO_IMAGE)}">
""",
        unsafe_allow_html=True,
    )
with col2:
    st.markdown(
        f"""
<img class="logo-img-2" src="{_png_data_uri("aida_logo.png")}">
""",
        unsafe_allow_html=True,
    )
# st.image([LOGO_IMAGE,"aida_logo.png"], width=100)

# CSS class for the large bold welcome heading.
_TITLE_CSS = """
<style>
.big-font {
font-size:3em;
font-weight: bold;
}
</style>
"""
# Welcome heading with "Leet", "Speak" and "-NER" in three different colours.
_TITLE_HTML = '<p class="big-font">Welcome to <font color="#4B8BBE">Leet</font><font color=" #FFD43B">Speak</font><font color="#ff73a2">-NER</font></p>'

st.markdown(_TITLE_CSS, unsafe_allow_html=True)
st.markdown(_TITLE_HTML, unsafe_allow_html=True)
# Markdown shown inside the collapsed "Project Description" panel.
_PROJECT_DESCRIPTION = """
Developed by the Applied Intelligence and Data Analysis (AI+DA) group at the Polytechnic University of Madrid (UPM), this tool employs a Spacy-Transformer Named Entity Recognition (NER) model to detect camouflaged words.
Word camouflage is a technique used to evade content moderation on social media platforms, contributing to the spread of misinformation. This tool aims to counter such emerging threats by focusing on detecting camouflaged words in news articles, reports, and talks, as the models have been fine-tuned using TED talks, OPUS News Commentaries, and Wikipedia data.
The multilingual model currently supports over 20 languages, including Arabic, Azerbaijani, Danish, German, Greek, English, Spanish, Finnish, French, Hungarian, Indonesian, Italian, Kazakh, Norwegian Bokmål, Nepali, Dutch, Portuguese, Romanian, Russian, Slovenian, Swedish, Tajik, and Turkish.
The NER model has been tested in English, Spanish, French, Italian, and
German.
Additionally, users can choose to have detected entities broken down into three types of camouflaged words: Canonical Leetspeak, Punctuation Camouflaged, and Inversion Camouflaged.
Please cite us:
```
@misc{huertasgarcía2022countering,
title={Countering Malicious Content Moderation Evasion in Online Social Networks: Simulation and Detection of Word Camouflage},
author={Álvaro Huertas-García and Alejandro Martín and Javier Huertas Tato and David Camacho},
year={2022},
eprint={2212.14727},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
"""

with st.expander("Project Description", expanded=False):
    st.write(_PROJECT_DESCRIPTION)
# Sample camouflaged sentences, grouped by language, that users can paste
# into the input box.
_EXAMPLE_SENTENCES = """
ENGLISH:
- Desperately dominated by fam1ly sitüatløns, he leaves her.
- You might as well come out to investigate a strang3 n'o?i+se or something.
- But one other thing that we have to re;think is the way that we dy£ our #c!l.o|th?£+s.
- And he wanted Baltimore city to get that same kind of att£ntløn from the outside, but )i)n)t)r)o)s)p)e)c)t)i)o)n from the inside about what was going on with us.
- Why do all these _r_e_p_o_r_t_e_r_s, who get praise and money for doing what Assange has done, maintain a cow;ardly silence (at best) while a fellow publisher faces threats of extradition, banning, and espionage charges (which can incur the death penalty), not to mention calls for his as'sa'ss1nat'i'on?
SPANISH
- _d+i%o"s mío!
- La C0v!d es un 3ng@ño de los G0b!3rno$
- pl@πd€m1∆ instead of “pandemia” (pandemic)
- se asocian con el m13;d0 y el d'o'lor. g£rønlmo solía decir
- Con las nuevas tecnologías digitales, los agrlcultør£s pueden manejar mejor el uso de sus tierras, su energía y su agua, y prepararse para el mal clima.
- En el tiempo transcurrido entre mi período de escuela %s%3%c%_%n%d%a%r%1%a y el mo'm3n'to de empezar a enseñar vimos surgir el fenómeno de in't£r'net
- Las pre0c_pac1on3s van desde inquietudes por las ramificaciones desestabilizadoras de una estrategia de salida de la FC, hasta aprehensión por pérdidas de capital en la rápidamente creciente cartera de valores de la Fed (actualmente de 3 billones y en camino a los 4 billones para finales de este año).
FRENCH
- Des poem£s. Je suis obligé d'écrire des poem3s.
- Placez-le ensuite dans un endroit f'r'ais jusqu'au moment de ser;vir.
- Cependant, parfois ils rencontrent des sc£narløs _i_n_c_o_n_n_u_s comme indiqué ci-dessus circonstance et font face à une grande quantité de perte de données.
- Il semble préférable qu'une telle permanence soit organisée de manière uniforme par l'Ordre des barr£a*x francophones et germanophone et l'Ordre des barr£a*x flamands.
- Elle doivent être mentvrai .fl%a~g'r]a+n/t=e:s avant que nous y prêtions attention.
ITALIAN
- Tornato a londra Händel produsse Ezio, un di'sa'stro sostoco.
- Vi devo dire cosè successo una 5eI2a quan;do tutti i 5£¬sø₹l hanno smesso di funzionare.
- 1877 New _h_a_m_p_s_h_1_r_3 diventa l'ultimo stato a concedere i pari diritti politici agli ebr31
- Per nostra esperienza non possiamo sbarazzarci della sen2a7_i[]ne che la non chiarezza delle cøm'p£'t£n'ze in questi casi gli fa più che comodo.
- In alternativa, invia a 1llary (proprietario) una richiesta di informazioni tramite il modulo Contatta il proprietario/gestore se prima desideri ricevere un preventivo o maggiori informazioni.
GERMAN
- Die +m+e+i+s+t+e z;£lt fan;gen die le_t3 mit dem #s"t.r}3$1+t an.
- Würde ich Gleichgesinnte erst nach Einbruch der d_nk3l;heit treffen können?
- Es war nur eine mi'nimal'e Menge an externen komponent£n notwendig, um ein komplettes sys;tem zu bauen.
- Die kir;che wurde von dem Schutzpatron der p;£st im Zeichen des Bundes und der dankbark31t £rrichtet.
"""

with st.expander("Try any of these examples", expanded=False):
    st.write(_EXAMPLE_SENTENCES)
# - Why do all these _r_e_p_o_r_t_e_r_s, who get praise and money for doing what Assange has done, maintain a cow;ardly silence (at best) while a fellow publisher faces threats of extradition, banning, and espionage charges (which can incur the death penalty), not to mention calls for his as'sa'ss1nat'i'on?
# - Cada uno de estos es un crimen de guerra, un crimen contra la humanidad y, en el caso de los asesinatos masivos de la campaña de Anfal, y tal vez también en el caso de los árabes de los pantanos, el crimen más serio de todos, ge'no'ci'dio.
# - No quiere decir que debamos iniciar una campaña por los derechos de los lns£ctøs
st.subheader("Input Text")
with st.form("my_form"):
    text_input = st.text_area(
        "Insert a text to detect leetspeak entities",
        # placeholder="@#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)",
        # value="@#plan#demia, pl@πd€m1∆ instead of “pandemia” (pandemic)"
    )
    uploaded_file = st.file_uploader(
        "or Upload a file", type=["doc", "docx", "pdf", "txt"]
    )
    if uploaded_file is not None:
        # An uploaded file overrides the text area. The uploader also accepts
        # binary formats (.doc/.docx/.pdf) whose bytes are generally not valid
        # UTF-8, so report the failure and keep the text-area content instead
        # of letting the decode error crash the app.
        try:
            text_input = uploaded_file.getvalue().decode("utf-8")
        except UnicodeDecodeError:
            st.error(
                "The uploaded file could not be read as UTF-8 text, so the "
                "content of the text box is used instead. Please upload a "
                "plain-text file."
            )
    # Every form must have a submit button.
    submitted = st.form_submit_button("Submit")
st.subheader("Output")
# The input is lower-cased before it is passed to the pipeline, so the
# rendered tokens are lower-case too.
# NOTE(review): the original nesting under the spinner could not be recovered
# from the source; all three steps are kept inside it - confirm.
with st.spinner("Wait for it..."):
    doc = selected_model(text_input.lower())
    tokens = process_text(doc, selected_multi_ner)
    annotated_text(*tokens)
|