Spaces:
Running
Running
File size: 1,642 Bytes
eb1522b 9f56bc0 0a1c65b 9f56bc0 0a1c65b 9f56bc0 0a1c65b 9f56bc0 0a1c65b dc1f5e4 9f56bc0 0a1c65b eb1522b 9f56bc0 dc1f5e4 9f56bc0 eb1522b 0a1c65b eb1522b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
import contextlib
import io
import sys

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
def test_eos_pad() -> str:
    """Report the EOS and PAD token settings of the NbAiLab Whisper tokenizer.

    Loads the processor and model for ``NbAiLab/nb-whisper-large-verbatim``,
    sets the tokenizer's PAD token to its EOS token when no PAD token id is
    defined, moves the model to CUDA when available (CPU otherwise), and
    tokenizes a one-character sample with ``padding="max_length"``,
    ``max_length=5`` and truncation enabled.

    Returns:
        Everything printed while the check ran: the EOS/PAD tokens and their
        ids, the tokenized batch, and a final ``Done`` line.
    """
    raw_text_batch = 'a'

    captured = io.StringIO()
    # redirect_stdout puts sys.stdout back even if loading or tokenizing
    # raises; a manual swap would leave stdout pointing at this buffer for
    # the rest of the process after any failure.
    with contextlib.redirect_stdout(captured):
        # Load the processor and model for the NbAiLab Whisper model
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # Check if pad token is set, if not, set it to eos token
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input batch
        tokenize_batch = processor.tokenizer(
            raw_text_batch,
            padding="max_length",
            max_length=5,
            truncation=True,
            return_tensors="pt",
        )
        print(f'{tokenize_batch=}')
        print('Done')

    # Return the captured output so the caller can display it
    return captured.getvalue()
# Output widget: a single text box that shows the report returned by
# test_eos_pad.
_results_box = gr.Textbox(label="Results")

# The check takes no arguments, so the interface has no input components.
iface = gr.Interface(
    title="Check EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function.",
    fn=test_eos_pad,
    inputs=[],
    outputs=_results_box,
)

if __name__ == "__main__":
    # Start the web UI only when run as a script, not when imported.
    iface.launch()
|