camparchimedes committed on
Commit
9f56bc0
·
verified ·
1 Parent(s): eb1522b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -21
app.py CHANGED
@@ -1,33 +1,49 @@
1
  import torch
2
  import gradio as gr
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 
4
 
5
# Load the processor and model once at module import time.
_MODEL_ID = "NbAiLab/nb-whisper-large-verbatim"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)

# The tokenizer may ship without a PAD token; reuse EOS so padded batches work.
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Run on the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
16
 
17
def transcribe_text(raw_text):
    """Tokenize *raw_text* with the module-level Whisper tokenizer.

    Returns the tokenized batch (padded/truncated to length 5, as
    PyTorch tensors) so the token ids can be inspected directly.
    """
    encoded = processor.tokenizer(
        raw_text,
        padding="max_length",
        max_length=5,
        truncation=True,
        return_tensors="pt",
    )
    return encoded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
# Gradio interface.
# FIX: the `gr.inputs` namespace was removed in Gradio 3.x; use the
# top-level `gr.Textbox` component instead (matches the usage elsewhere
# in this file).
iface = gr.Interface(
    fn=transcribe_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="json",
    title="Whisper Model Tokenization",
    description="Test the EOS and PAD tokens for NbAiLab/nb-whisper-large-verbatim model."
)
32
 
33
  if __name__ == "__main__":
 
1
  import torch
2
  import gradio as gr
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
4
+ import io
5
+ import sys
6
 
7
def test_eos_pad():
    """Load the NbAiLab Whisper model, print its EOS/PAD token setup and a
    sample tokenization, and return the captured console output as a string.

    Returns:
        str: everything printed while the diagnostic ran.
    """
    raw_text_batch = 'a'

    # Redirect stdout into a buffer so the prints below can be shown in
    # the Gradio textbox instead of being lost in the server log.
    old_stdout = sys.stdout
    new_stdout = io.StringIO()
    sys.stdout = new_stdout
    try:
        # Load the processor and model for the NbAiLab Whisper model.
        # NOTE(review): this downloads/loads the checkpoint on every call —
        # consider hoisting to module level if the UI is used repeatedly.
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # Check if the pad token is set; if not, set it to the eos token.
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input batch.
        tokenize_batch = processor.tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
        print(f'{tokenize_batch=}')
        print('Done')
    finally:
        # FIX: always restore stdout, even when model loading or
        # tokenization raises — otherwise every later print/log in the
        # process would keep going into the dead StringIO buffer.
        sys.stdout = old_stdout

    return new_stdout.getvalue()
40
 
 
41
# Gradio UI: no inputs — a single click runs the diagnostic and the
# captured console output is shown in the Results textbox.
iface = gr.Interface(
    title="Test EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function.",
    fn=test_eos_pad,
    inputs=[],
    outputs=gr.Textbox(label="Results"),
)
48
 
49
  if __name__ == "__main__":