nurasaki committed on
Commit
7360456
·
1 Parent(s): 704dc9c

gradio_nlp_berta_masked_example: first commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. README.md +20 -11
  3. app.py +95 -37
  4. flagged/log.csv +6 -0
.gitignore CHANGED
@@ -1,2 +1,3 @@
1
  __pycache__/
2
  .DS_Store
 
 
1
  __pycache__/
2
  .DS_Store
3
+ private.md
README.md CHANGED
@@ -10,20 +10,29 @@ pinned: false
10
  ---
11
 
12
 
 
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
 
16
- ```sh git commands
17
- git clone https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example
18
- echo __pycache__/ > .gitignore\n
19
- git status
20
- git add .
21
- git commit -am "gradio_nlp_berta_masked_example: first commit"
22
- git push
23
 
24
- git remote add gh_repo [email protected]:nurasaki/gradio_nlp_berta_masked_example.git
25
- git push gh_repo main
26
- ```
27
 
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
 
10
  ---
11
 
12
 
13
+ # Masked Language Modeling Example
14
 
15
+ by [nurasaki](https://huggingface.co/spaces/nurasaki)
16
 
17
+ * Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
18
+ * Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
19
+ * Hugging Face link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
 
 
 
 
20
 
21
+ <br>
 
 
22
 
23
+ ## Model description
24
 
25
+ The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
26
+
27
+ It is based on the [RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-size corpus collected from publicly available corpora and crawlers.
28
+
29
+ <br>
30
+
31
+ ## Usage
32
+
33
+ The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
34
+
35
+ Choose one of the provided examples or enter your own masked text.
36
+
37
+ <br>
38
 
app.py CHANGED
@@ -1,12 +1,23 @@
1
  import gradio as gr
2
  import os
3
 
 
 
 
 
 
 
 
 
 
 
4
  # save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
5
  auth_token = os.getenv("auth_token")
6
 
7
 
8
 
9
 
 
10
  print("========================================================================")
11
  print("Starting ... gradio_demo_nlp_autocomplete/app.py")
12
  print("AUTH TOKEN:", auth_token)
@@ -14,58 +25,105 @@ print("AUTH TOKEN:", auth_token)
14
 
15
  # load a model from https://hf.co/models as an interface, then use it as an api
16
  # you can remove the api_key parameter if you don't care about rate limiting.
17
- api = gr.Interface.load("huggingface/projecte-aina/roberta-base-ca-v2", api_key=auth_token,)
18
 
19
 
 
 
 
20
 
 
21
 
 
22
 
 
 
 
 
 
23
 
24
- def complete_with_gpt(text):
 
 
 
 
 
 
 
 
 
 
25
 
26
- print("------------------------------------------------------------------------")
27
- print("type(api):", type(api) )
28
- print("Api:", api, "\n" )
29
 
30
 
 
31
 
32
- print("------------------------------------------------------------------------")
33
- print("text:")
34
- print(text)
35
- print("------------------------------------------------------------------------")
36
- print("text[:-50]:")
37
- print(text[:-50])
38
- print("------------------------------------------------------------------------")
39
- print("api(text):")
40
- print(api(text))
41
- print("------------------------------------------------------------------------")
42
- print("text[-50:]:")
43
- print(text[-50:])
44
- print("------------------------------------------------------------------------")
45
- print("api(text[-50:]")
46
- print(api(text[-50:]))
47
- print("------------------------------------------------------------------------")
48
-
49
 
50
- return text[:-50] + api(text[-50:])
51
 
 
 
 
52
 
53
- with gr.Blocks() as demo:
54
-
55
- print("------------------------------------------------------------------------")
56
- print("with gr.Blocks")
57
 
58
- textbox = gr.Textbox(placeholder="Type here...", lines=4)
59
- btn = gr.Button("Autocomplete")
60
-
61
- print("textbox:", textbox)
62
 
63
- # define what will run when the button is clicked, here the textbox is used as both an input and an output
64
- btn.click(fn=complete_with_gpt, inputs=textbox, outputs=textbox, queue=False)
65
 
66
- demo.launch(favicon_path="favicon.png")
 
 
 
 
 
 
 
 
 
 
67
 
 
68
 
69
- # /Users/nurasaki/miniforge3/envs/conda_tfg_clone/lib/python3.8/site-packages/gradio/interface.py:93:
70
- # UserWarning: gr.Intrerface.load() will be deprecated. Use gr.load() instead.
71
- # warnings.warn("gr.Intrerface.load() will be deprecated. Use gr.load() instead.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import os
3
 
4
+
5
+
6
+ import torch
7
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
8
+ import logging
9
+ from torch.nn.functional import softmax
10
+ import pandas as pd
11
+
12
+
13
+
14
  # save your HF API token from https://hf.co/settings/tokens as an env variable to avoid rate limiting
15
  auth_token = os.getenv("auth_token")
16
 
17
 
18
 
19
 
20
# Startup banner. The token value itself is never printed: anything written
# to stdout ends up in the application logs, so only report whether a token
# was configured.
print("========================================================================")
print("Starting ... gradio_demo_nlp_autocomplete/app.py")
print("AUTH TOKEN:", "<set>" if auth_token else "<not set>")
 
25
 
26
  # load a model from https://hf.co/models as an interface, then use it as an api
27
  # you can remove the api_key parameter if you don't care about rate limiting.
28
+ # api = gr.Interface.load(, api_key=auth_token,)
29
 
30
 
31
+ model_ref = "projecte-aina/roberta-base-ca-v2"
32
+ tokenizer = AutoTokenizer.from_pretrained(model_ref)
33
+ model = AutoModelForMaskedLM.from_pretrained(model_ref)
34
 
35
def get_topk(text, tokenizer, model, k):
    """Return the ``k`` most probable vocabulary entries at each mask position.

    Args:
        text: Input passed straight to ``tokenizer``; expected to contain the
            tokenizer's mask token.
        tokenizer: Callable tokenizer exposing ``mask_token_id``.
        model: Masked-LM model whose output exposes ``logits``.
        k: Number of candidates requested. Capped to the vocabulary size so
            an oversized request returns every entry instead of raising.

    Returns:
        A pair ``(topk, mask_idx)``: ``topk`` is the ``Tensor.topk`` result
        (``values``, ``indices``) with one row per mask token found, and
        ``mask_idx`` holds the token position of each mask. Both are empty
        along the first dimension when ``text`` contains no mask token.
    """
    print("Get top K,", text)

    # Tokenize
    tokenizer_kwargs = dict(padding='longest', return_token_type_ids=False, return_tensors="pt")
    inputs = tokenizer(text, **tokenizer_kwargs).to("cpu")
    input_ids = inputs.input_ids

    # Get model outputs and probabilities.
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        logits = model.to("cpu")(**inputs).logits
        probs = softmax(logits, dim=2)

    # Locate the mask token(s). NOTE: callers are only designed around a
    # single mask per text; extra masks simply add rows to the result.
    row_idx, mask_idx = torch.where(input_ids.to("cpu") == tokenizer.mask_token_id)

    # topk raises if k is larger than the dimension it selects from.
    k = min(int(k), probs.shape[-1])
    return probs[row_idx, mask_idx].topk(k), mask_idx
 
 
58
 
59
 
60
def generate_output(text, k):
    """Build the label/probability mapping shown in the results widget.

    Args:
        text: Sentence typed by the user; must contain the tokenizer's mask
            token.
        k: Number of candidates requested (converted with ``int``).

    Returns:
        dict mapping each decoded candidate token to its probability.

    Raises:
        gr.Error: if ``k`` is missing or below 1, or if ``text`` contains no
            mask token.
    """
    if k is None or int(k) < 1:
        raise gr.Error("Num. results must be a positive integer.")

    (values, indices), mask_positions = get_topk(text, tokenizer, model, int(k))

    # Without this guard, a text with no (or a misspelled) mask token left
    # the result unassigned and the handler crashed with UnboundLocalError.
    if mask_positions.numel() == 0:
        raise gr.Error(f"The text must contain the mask token {tokenizer.mask_token}.")

    # get_topk yields one row per mask token; only a single mask is
    # supported, so when several are present the last row is reported.
    mask_vals, mask_indices = values[-1], indices[-1]
    return {tokenizer.decode(ind): val.item()
            for val, ind in zip(mask_vals, mask_indices)}
 
 
 
71
 
 
 
 
 
72
 
73
+ md_text ="""
74
+ # Masked Language Modeling Example
75
 
76
+ by [nurasaki](https://huggingface.co/spaces/nurasaki)
77
+
78
+ * Space : [https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example](https://huggingface.co/spaces/nurasaki/gradio_nlp_berta_masked_example)
79
+ * Model used: Catalan BERTa-v2 (roberta-base-ca-v2) base model
80
+ * Hugginface link: [https://huggingface.co/projecte-aina/roberta-base-ca-v2](https://huggingface.co/projecte-aina/roberta-base-ca-v2)
81
+
82
+ <br>
83
+
84
+ ## Model description
85
+
86
+ The **roberta-base-ca-v2** is a transformer-based masked language model for the Catalan language.
87
 
88
+ It is based on the [RoBERTA](https://github.com/pytorch/fairseq/tree/master/examples/roberta) base model and has been trained on a medium-size corpus collected from publicly available corpora and crawlers.
89
 
90
+ <br>
91
+
92
+ ## Usage
93
+
94
+ The model accepts an input text with a *mask* (for example, "La meva mare es diu \<mask\>.") and generates the *k* most probable words that could fill the *mask* position in the sentence.
95
+
96
+ Choose one of the provided examples or enter your own masked text.
97
+
98
+ <br>
99
+
100
+
101
+
102
+ """
103
+
104
+ examples = [
105
+ "La meva mare es diu <mask>.",
106
+ "La meva mare treballa de <mask>.",
107
+ "El meu fill es diu <mask>.",
108
+ "El teu pare treballa de <mask>.",
109
+ ]
110
+
111
+
112
+
113
+ with gr.Blocks() as demo:
114
+ gr.Markdown(md_text)
115
+ with gr.Row():
116
+ with gr.Column():
117
+ text = gr.Textbox("La meva mare es diu <mask>.", label="Masked text")
118
+ k = gr.Number(value=10, label="Num. results")
119
+ btn = gr.Button("Generate")
120
+
121
+ with gr.Column():
122
+ out_label = gr.Label(label="Results")
123
+
124
+
125
+ btn.click(generate_output, inputs=[text, k], outputs=[out_label])
126
+ gr.Examples(examples, inputs=[text])
127
+
128
+ # if __name__ == "__main__":
129
+ demo.launch(favicon_path="favicon.png")
flagged/log.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Input Text,output,flag,username,timestamp
2
+ "The tower is 324 metres (1,063 ft) tall,",,,,2023-04-03 16:23:19.212953
3
+ ,"<p>Start typing below and then click <strong>Run</strong> to see the output.</p>
4
+ ",,,2023-04-03 16:28:32.735416
5
+ El teu pare treballa de <maks>.,"<p>Masked Text: xxx</p>
6
+ ",,,,2023-04-03 17:34:10.400919