File size: 12,178 Bytes
8fdcd57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
import gradio as gr
import torch

# import homoglyphs
import confusable_homoglyphs.confusables
from unidecode import unidecode


from transformers import AutoModelForCausalLM, AutoTokenizer


# NOTE(review): this model/tokenizer pair is only referenced from the
# commented-out code inside process_homoglyphed_text below; the live scoring
# path passes model_id="gpt2" to perplexity_metric.compute instead. Confirm
# whether these two heavy loads are still needed.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")


from evaluate import load

# Hugging Face `evaluate` perplexity metric, used as the live scoring backend.
perplexity_metric = load("perplexity", module_type="metric")


# homoglyphs_processor = homoglyphs.Homoglyphs(
#     ascii_strategy=homoglyphs.STRATEGY_LOAD, strategy=homoglyphs.STRATEGY_LOAD
# )

def calculate_perplexity(text_logits: torch.Tensor, target_ids=None) -> float:
    """Compute a perplexity score from raw language-model logits.

    Args:
        text_logits: Unnormalized logits of shape (..., seq_len, vocab_size).
        target_ids: Optional token ids of shape (..., seq_len). When given,
            true perplexity is computed by scoring each token under the
            distribution predicted from the preceding position (the standard
            shifted next-token formulation). When omitted, falls back to the
            original behavior of averaging the negative log-probabilities
            over the *entire* vocabulary at every position — note that this
            is not true perplexity, only a rough proxy; the parameter was
            added to fix that while staying backward compatible.

    Returns:
        The perplexity as a plain Python float.
    """
    # The logits are not normalized, so turn them into log-probabilities.
    log_probs = torch.nn.functional.log_softmax(text_logits, dim=-1)
    if target_ids is None:
        # Legacy path: mean NLL over every vocabulary entry at every step.
        average_negative_log_likelihood = (-log_probs).mean()
    else:
        # Logits at position t predict the token at position t + 1, so shift
        # the two sequences against each other before gathering.
        shifted_log_probs = log_probs[..., :-1, :]
        shifted_targets = target_ids[..., 1:]
        token_nll = -shifted_log_probs.gather(
            -1, shifted_targets.unsqueeze(-1)
        ).squeeze(-1)
        average_negative_log_likelihood = token_nll.mean()
    return torch.exp(average_negative_log_likelihood).item()


def process_homoglyphed_text(homoglyphed_text, unhomoglyphed_text):
    """Score both texts with GPT-2 perplexity and decide whether to alarm.

    Args:
        homoglyphed_text: The user's original text (possibly attacked).
        unhomoglyphed_text: The same text with homoglyphs mapped back to ASCII.

    Returns:
        (alarm_triggered, difference_ratio) where difference_ratio is
        perplexity(homoglyphed) / perplexity(unhomoglyphed).
    """
    # `compute` returns perplexities in the same order as `predictions`
    # ([homoglyphed, unhomoglyphed]), so unpack them to match. The original
    # code unpacked them swapped, which made the printed labels (and the
    # variable names) refer to the wrong values.
    homoglyphed_text_perplexity, unhomoglyphed_text_perplexity = (
        perplexity_metric.compute(
            predictions=[homoglyphed_text, unhomoglyphed_text], model_id="gpt2"
        )["perplexities"]
    )

    print(
        f"Unhomoglyphed text perplexity: {unhomoglyphed_text_perplexity}, homoglyphed text perplexity: {homoglyphed_text_perplexity}"
    )

    # Homoglyphs fragment GPT-2's tokenization, so an attacked text scores a
    # much higher perplexity than its cleaned-up version. Trigger the alarm
    # when the attacked text is more than 1.5x as perplexing as the clean one.
    difference_ratio = homoglyphed_text_perplexity / unhomoglyphed_text_perplexity
    print(f"Difference ratio: {difference_ratio}")
    alarm_triggered = difference_ratio > 1.5

    return alarm_triggered, difference_ratio


def unhomoglyphize_text(homoglyphed_text):
    """Map confusable (homoglyph) characters back to Latin look-alikes.

    Every non-ASCII character flagged as confusable is substituted with its
    first known look-alike, then any remaining diacritics are stripped with
    unidecode.

    `is_confusable` returns entries shaped like::

        [{'character': 'ρ',
          'alias': 'GREEK',
          'homoglyphs': [{'c': 'p', 'n': 'LATIN SMALL LETTER P'}]},
         {'character': 'τ',
          'alias': 'GREEK',
          'homoglyphs': [{'c': 'ᴛ', 'n': 'LATIN LETTER SMALL CAPITAL T'}]}]
    """
    detected = confusable_homoglyphs.confusables.is_confusable(
        homoglyphed_text, greedy=True, preferred_aliases=["latin"]
    )
    print(f"Confusables: {detected}")
    cleaned = homoglyphed_text
    for entry in detected:
        suspect = entry["character"]
        # Plain ASCII characters need no replacement even if flagged.
        if ord(suspect) >= 128:
            replacement = entry["homoglyphs"][0]["c"]
            cleaned = cleaned.replace(suspect, replacement)
    # unidecode strips diacritics, which the homoglyphs library leaves alone.
    return unidecode(cleaned)


def process_user_text(user_text):
    """Run the homoglyph-attack check on user input.

    Returns a (is_dangerous, difference_ratio, markdown_verdict) triple
    matching the three Gradio output components.
    """
    has_confusables = bool(
        confusable_homoglyphs.confusables.is_confusable(
            user_text, preferred_aliases=["latin"]
        )
    )
    # No homoglyphs at all means there is nothing to compare: all clear.
    if not has_confusables:
        return False, 0.0, "# ✅ All good"

    unhomoglyphed_text = unhomoglyphize_text(user_text)
    print(f"Unhomoglyphed text: {unhomoglyphed_text}")

    alarm_triggered, difference_ratio = process_homoglyphed_text(
        homoglyphed_text=user_text, unhomoglyphed_text=unhomoglyphed_text
    )
    verdict = "# 🚨 Alarm triggered" if alarm_triggered else "# ✅ All good"
    return True, difference_ratio, verdict


# UI theme: soft look with fuchsia/cyan accents, square corners, IBM Plex.
_body_fonts = [
    gr.themes.GoogleFont("IBM Plex Sans"),
    "ui-sans-serif",
    "system-ui",
    "sans-serif",
]
_mono_fonts = [
    gr.themes.GoogleFont("IBM Plex Mono"),
    "ui-monospace",
    "Consolas",
    "monospace",
]
theme = gr.themes.Soft(
    primary_hue="fuchsia",
    secondary_hue="cyan",
    neutral_hue="gray",
    radius_size="none",
    font=_body_fonts,
    font_mono=_mono_fonts,
)


# Create a Gradio interface
demo = gr.Interface(
    theme=theme,
    fn=process_user_text,
    inputs=[
        gr.Textbox(lines=5, placeholder="Enter your text here...", label="Text"),
    ],
    outputs=[
        # A checkbox: is dangerous or not
        gr.Checkbox(label="Is dangerous"),
        # The number of the difference ratio
        gr.Number(label="Difference ratio"),
        # Just an emoji: alarm triggered or not
        gr.Markdown(label="Alarm triggered", show_label=False),
    ],
    title="Homoglyphs Alarm 🚨",
    description="""Calculates the probablility that a given text has been the target of a homoglyph-based attack.

It calculates the perplexity of the text according to GPT-2 and compares it to the perplexity of the text with homoglyphs replaced by their ASCII equivalents.

Example texts adapted from:
- https://arxiv.org/abs/2401.12070 (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://huggingface.co/google/gemma-2-2b (also in a version adapted using https://huggingface.co/spaces/acmc/SilverSpeak with 5% replacement)
- https://www.persee.fr/doc/rbph_0035-0818_2012_num_90_3_8269
- https://arxiv.org/abs/2411.14257
- https://www.busuu.com/en/spanish/conditional

Written by: [Aldan Creo](https://acmc-website.web.app/intro)
""",
    allow_flagging="never",
    examples=[
        [
            "Dr. Capy Cosmos, a capybara unlike any other, astounded the scientific community with his groundbreaking research in astrophysics. With his keen sense of observation and unparalleled ability to interpret cosmic data, he uncovered new insights into the mysteries of black holes and the origins of the universe. As he peered through telescopes with his large, round eyes, fellow researchers often remarked that it seemed as if the stars themselves whispered their secrets directly to him. Dr. Cosmos not only became a beacon of inspiration to aspiring scientists but also proved that intellect and innovation can be found in the most unexpected of creatures."
        ],
        [
            "Dr. Capу Cosmos, a caрybаra unlіkе any other, astounded the scientific community with hіs groundbreakіng reѕearcһ in astrophysics. With hiѕ keen sense of observation and unparаlleled ability to interpret cosmic dаta, he uncovеred new іnsightѕ into tһe myѕteries of black holes аnd the origins of the universe. Aѕ he peered through telescopes with his large, round eyes, fellow reѕearchers often remarked that it seemed as if the stars themѕelves whiѕpered theіr secrets directlу to him. Dr. Cosmos not only became a beacon of inspіration to aspiring scientіsts but also proved thаt intellect and іnnovation can bе found in the most unexpecteԁ οf сreatures."
        ],
        [
            "Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone."
        ],
        [
            "Gemma iѕ a family of lightweіght, ѕtatе-οf-the-art open models from Google, built from the same research аnd technolοgy uѕed to сreate tһe Gemini models. Theу are text-to-text, decoder-only lаrge lаnguage models, available in English, with οpen weightѕ for both рre-trainеd vаrіants аnd instruction-tuned variantѕ. Gemma models are well-suited for a vаrietу of text generation tasks, including question answering, summarization, and rеaѕoning. Their relatively small size makes it possible to dеploy them іn environments witһ limited resourceѕ such as a laptop, desktop or your own cloud infraѕtructure, democratizing acceѕs to state of the art AΙ models and һelping foster іnnovation for everyone."
        ],
        [
            "We run the model on the set of prompts containing known and unknown entities. Inspired by Meng et al. (2022a); Geva et al. (2023); Nanda et al. (2023) we use the residual stream of the final token of the entity, 𝒙 known and 𝒙 unknown. In each layer (l), we compute the activations of each latent in the SAE, i.e. al,j⁢(𝒙lknown) and al,j⁢(𝒙lunknown). For each latent, we obtain the fraction of the time that it is active (i.e. has a value greater than zero) on known and unknown entities respectively: fl,jknown=∑iNknown𝟙⁢[al,j⁢(𝒙l,iknown)>0]Nknown,fl,junknown=∑iNunknown𝟙⁢[al,j⁢(𝒙l,iunknown)>0]Nunknown,(6) where Nknown and Nunknown are the total number of prompts in each subset. Then, we take the difference, obtaining the latent separation scores sl,jknown=fl,jknown−fl,junknown and sl,junknown=fl,junknown−fl,jknown, for detecting known and unknown entities respectively."
        ],
        [
            "The national/ official name of the country, the people and the language are respectively Eλλάδα, Έλληνας, ελληνικά ([ eláδa, élinas, eliniká]), derived from Ancient Greek Ἑλλάς, Ἕλλην, ἑλληνικός ([ hellás, héllen, hellenikós]) ‘Greece, Greek (noun), Greek (adj.)’, which are also to be found in most European languages as Hellas, hellenic, hellénique etc.; Hellenic Republic is the official name of the country in the European Union. The etymology of these words is uncertain. They first occur in the Iliad of Homer (2.683-4) as a designation of a small area in Thessaly, the homeland of Achilles, and its people. (3) Also in Homer, it is possible to find the compound πανέλληνες ([ panhellenes]) denoting all Greeks (from adjective pan ‘all’ + noun hellen), and it is again uncertain under what historical circumstances this local appellation spread to the totality of the Greek nation, although various theories have been proposed (see Babiniotis 2002)."
        ],
        [
            "To form the conditional tense in Spanish, you need to use the infinitive form of the verb and add the corresponding endings for each subject pronoun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, the terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaría (I would speak) Comerías (You would eat) Escribiría (He/She/You would write) Haríamos (We would do/make) Beberían (You all would drink) Leerían (They/You all would read)."
        ],
        [
            "To form the cοnԁіtіonal tense in Spanish, yοu need tο uѕе the infinitive fοrm of the verb and add the corresponding endіngs for eaсһ subject рronοun. Regardless of the verb type (-ar, -er, or -ir), the endings remain the same. In singular-plural order, 1st-3rd, thе terminations are: -ía, -ías, -ía, -íamos, -ían, -ían. For example: Hablaríа (I woulԁ speak) Comerías (You would еаt) Eѕcribiría (He/Shе/You would write) Haríаmos (We would do/make) Bеberían (Yοu all would ԁrink) Leerían (They/You all would read)."
        ],
    ],
)

# Launch the Gradio app
demo.launch()