camparchimedes committed on
Commit
9f56bc0
·
verified ·
1 Parent(s): eb1522b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -21
app.py CHANGED
@@ -1,33 +1,49 @@
1
  import torch
2
  import gradio as gr
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 
 
4
 
5
# Load the processor and model once at module import time.
_MODEL_ID = "NbAiLab/nb-whisper-large-verbatim"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(_MODEL_ID)

# The tokenizer may ship without a PAD token; reuse EOS so padded batches work.
if processor.tokenizer.pad_token_id is None:
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

# Run on the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)
16
 
17
def transcribe_text(raw_text):
    """Tokenize *raw_text* with the module-level Whisper tokenizer.

    Returns the tokenized batch (padded/truncated to length 5, as
    PyTorch tensors) so the token ids can be inspected directly.
    """
    encoded = processor.tokenizer(
        raw_text,
        padding="max_length",
        max_length=5,
        truncation=True,
        return_tensors="pt",
    )
    return encoded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
# Gradio interface.
# FIX: the `gr.inputs` namespace was removed in Gradio 3.x; use the
# top-level `gr.Textbox` component instead (matches the usage elsewhere
# in this file).
iface = gr.Interface(
    fn=transcribe_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs="json",
    title="Whisper Model Tokenization",
    description="Test the EOS and PAD tokens for NbAiLab/nb-whisper-large-verbatim model."
)
32
 
33
  if __name__ == "__main__":
 
1
  import torch
2
  import gradio as gr
3
  from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
4
+ import io
5
+ import sys
6
 
7
def test_eos_pad():
    """Load the NbAiLab Whisper model, print its EOS/PAD token setup and a
    sample tokenization, and return the captured console output as a string.

    Returns:
        str: everything printed while the diagnostic ran.
    """
    raw_text_batch = 'a'

    # Redirect stdout into a buffer so the prints below can be shown in
    # the Gradio textbox instead of being lost in the server log.
    old_stdout = sys.stdout
    new_stdout = io.StringIO()
    sys.stdout = new_stdout
    try:
        # Load the processor and model for the NbAiLab Whisper model.
        # NOTE(review): this downloads/loads the checkpoint on every call —
        # consider hoisting to module level if the UI is used repeatedly.
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # Check if the pad token is set; if not, set it to the eos token.
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input batch.
        tokenize_batch = processor.tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
        print(f'{tokenize_batch=}')
        print('Done')
    finally:
        # FIX: always restore stdout, even when model loading or
        # tokenization raises — otherwise every later print/log in the
        # process would keep going into the dead StringIO buffer.
        sys.stdout = old_stdout

    return new_stdout.getvalue()
40
 
 
41
# Gradio UI: no inputs — a single click runs the diagnostic and the
# captured console output is shown in the Results textbox.
iface = gr.Interface(
    title="Test EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function.",
    fn=test_eos_pad,
    inputs=[],
    outputs=gr.Textbox(label="Results"),
)
48
 
49
  if __name__ == "__main__":