camparchimedes committed on
Commit
eb1522b
·
verified ·
1 Parent(s): 0a1c65b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -26
app.py CHANGED
@@ -1,32 +1,34 @@
1
- def test_eos_pad():
2
- from datasets import load_dataset
3
- import torch
4
- from transformers import GPT2Tokenizer, GPT2LMHeadModel
5
 
6
- raw_text_batch = 'a'
 
 
7
 
8
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
9
- # print(f'{tokenizer.eos_token=}')
10
- # print(f'{tokenizer.eos_token_id=}')
11
- # print(f'{tokenizer.pad_token=}')
12
- # print(f'{tokenizer.pad_token_id=}')
13
 
14
- # print(f'{raw_text_batch=}')
15
- # tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
16
- # print(f'{tokenize_batch=}')
17
 
18
- if tokenizer.pad_token_id is None:
19
- tokenizer.pad_token = tokenizer.eos_token
20
- probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
21
- device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
22
- probe_network = probe_network.to(device)
 
23
 
24
- print(f'{tokenizer.eos_token=}')
25
- print(f'{tokenizer.eos_token_id=}')
26
- print(f'{tokenizer.pad_token=}')
27
- print(f'{tokenizer.pad_token_id=}')
 
 
 
 
28
 
29
- print(f'{raw_text_batch=}')
30
- tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
31
- print(f'{tokenize_batch=}')
32
- print('Done')
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
4
 
5
# Checkpoint that supplies both the processor (tokenizer side) and the weights.
_MODEL_ID = "NbAiLab/nb-whisper-large-verbatim"

# Prefer the GPU when one is visible to torch; otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the processor first, then the model, placing the model on `device`.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID).to(device)

# Fall back to the EOS token for padding when the tokenizer defines no pad token.
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token
16
 
17
def transcribe_text(raw_text):
    """Tokenize *raw_text* and return the encoding in a JSON-friendly form.

    The text is padded/truncated to exactly 5 tokens so the PAD and EOS ids
    are easy to spot in the output.

    Args:
        raw_text: The text entered by the user.

    Returns:
        A plain ``dict`` mapping each field of the tokenizer output to nested
        Python lists.
    """
    encoded = processor.tokenizer(
        raw_text,
        padding="max_length",
        max_length=5,
        truncation=True,
        return_tensors="pt",
    )

    # The tokenizer returns torch tensors (return_tensors="pt"), which the
    # JSON output component cannot serialize; convert them to nested lists.
    return {field: tensor.tolist() for field, tensor in encoded.items()}
23
 
24
# Gradio interface: a two-line text box in, the tokenizer output as JSON out.
# `gr.Textbox` is used instead of `gr.inputs.Textbox`: the `gr.inputs`
# namespace was deprecated and is no longer available in current Gradio.
iface = gr.Interface(
    fn=transcribe_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="json",
    title="Whisper Model Tokenization",
    description="Test the EOS and PAD tokens for NbAiLab/nb-whisper-large-verbatim model.",
)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()