ruanchaves committed on
Commit
2fe8373
1 Parent(s): b8d8720

feat: offensive language detection app

Browse files
Files changed (2) hide show
  1. app.py +105 -0
  2. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
3
+ import torch
4
+ from collections import Counter
5
+
6
+ article_string = "Author: <a href=\"https://huggingface.co/ruanchaves\">Ruan Chaves Rodrigues</a>. Read more about our <a href=\"https://github.com/ruanchaves/evaluation-portuguese-language-models\">research on the evaluation of Portuguese language models</a>."
7
+
8
+ app_title = "Offensive Language Detection (Detecção de Linguagem Ofensiva)"
9
+
10
+ app_description = """
11
+ This app detects offensive language on Portuguese text using multiple models. You can either introduce your own sentences by filling in "Text" or click on one of the examples provided below.
12
+
13
+ (Este aplicativo detecta linguagem ofensiva em texto em português usando vários modelos. Introduza suas próprias frases preenchendo o campo "Text", ou clique em um dos exemplos fornecidos abaixo.)
14
+ """
15
+
16
+ app_examples = [
17
+ ["Aquele cara é um babaca."],
18
+ ["Quem não deve não teme!!"],
19
+ ["Que nojo!🤮🤮🤮🤮🤮"]
20
+ ]
21
+
22
+ output_textbox_component_description = """
23
+ This box will display offensive language detection results based on the average score of multiple models.
24
+
25
+ (Esta caixa exibirá resultados da detecção de linguagem ofensiva com base na pontuação média de vários modelos.)
26
+ """
27
+
28
+ output_json_component_description = { "breakdown": """
29
+ This box presents a detailed breakdown of the evaluation for each model.
30
+ """,
31
+ "detalhamento": """
32
+ (Esta caixa apresenta um detalhamento da avaliação para cada modelo.)
33
+ """ }
34
+
35
+ score_descriptions = {
36
+ 0: "This text is not offensive.",
37
+ 1: "This text is offensive.",
38
+ }
39
+
40
# Portuguese counterparts of ``score_descriptions``. ``predict`` looks both
# tables up with the same class id, so the wording must agree per key:
# 0 -> not offensive, 1 -> offensive.
score_descriptions_pt = {
    0: "(Este texto não é ofensivo.)",
    1: "(Este texto é ofensivo.)",
}
44
+
45
+ model_list = [
46
+ "ruanchaves/mdeberta-v3-base-hatebr",
47
+ "ruanchaves/bert-base-portuguese-cased-hatebr",
48
+ "ruanchaves/bert-large-portuguese-cased-hatebr",
49
+ ]
50
+
51
+ user_friendly_name = {
52
+ "ruanchaves/mdeberta-v3-base-hatebr": "mDeBERTa-v3 (HateBR)",
53
+ "ruanchaves/bert-base-portuguese-cased-hatebr": "BERTimbau base (HateBR)",
54
+ "ruanchaves/bert-large-portuguese-cased-hatebr": "BERTimbau large (HateBR)",
55
+ }
56
+
57
# Load every checkpoint named in ``model_list`` once, when the module runs.
# Each entry keeps the checkpoint name next to its tokenizer and its
# sequence-classification model so ``predict`` can iterate over them.
model_array = []

for model_name in model_list:
    row = {
        "name": model_name,
        "tokenizer": AutoTokenizer.from_pretrained(model_name),
        "model": AutoModelForSequenceClassification.from_pretrained(model_name),
    }
    model_array.append(row)
65
+
66
def most_frequent(array):
    """Return the element that appears most often in ``array``.

    Args:
        array: An iterable of hashable items.

    Returns:
        The item with the highest occurrence count.

    Raises:
        IndexError: If ``array`` is empty.
    """
    top_entries = Counter(array).most_common(1)
    winner, _count = top_entries[0]
    return winner
69
+
70
def predict(s1):
    """Classify ``s1`` with every loaded model and combine their votes.

    Each entry of ``model_array`` tokenizes the text, runs a forward pass with
    gradient tracking disabled, and votes for the index of the largest value
    in the first row of the model's first output. The overall verdict is the
    most frequent vote across models.

    Args:
        s1: The text to evaluate.

    Returns:
        A tuple ``(final_description, scores)``. ``final_description`` is the
        English description of the overall verdict followed by the Portuguese
        one; ``scores`` maps each model's display name to the English
        description of that model's own vote.
    """
    votes = {}
    for entry in model_array:
        display_name = user_friendly_name[entry["name"]]
        encoded = entry["tokenizer"]([s1], padding=True, return_tensors="pt")
        with torch.no_grad():
            first_output = entry["model"](**encoded)[0]
        votes[display_name] = first_output[0].argmax().item()

    majority = most_frequent(list(votes.values()))
    final_description = "\n \n".join(
        (score_descriptions[majority], score_descriptions_pt[majority])
    )

    # Translate each numeric vote into its human-readable English label.
    per_model = {name: score_descriptions[vote] for name, vote in votes.items()}
    return final_description, per_model
90
+
91
+
92
# Input widgets. ``gr.Textbox`` is used (as it already is for the outputs)
# because the legacy ``gr.inputs`` namespace is deprecated and no longer
# exists in current Gradio releases, which the unpinned ``gradio``
# requirement would install.
inputs = [
    gr.Textbox(label="Text"),
]

# Output widgets, pre-filled with explanatory text as their initial value.
outputs = [
    gr.Textbox(label="Evaluation", value=output_textbox_component_description),
    gr.JSON(label="Results by model", value=output_json_component_description),
]


# Build the demo around ``predict`` and start serving it.
gr.Interface(
    fn=predict,
    inputs=inputs,
    outputs=outputs,
    title=app_title,
    description=app_description,
    examples=app_examples,
    article=article_string,
).launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ torch
2
+ gradio
3
+ transformers