import io
import sys

import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
def test_eos_pad():
    raw_text_batch = 'a'

    # Redirect stdout so the print output can be returned to the Gradio UI
    old_stdout = sys.stdout
    new_stdout = io.StringIO()
    sys.stdout = new_stdout

    try:
        # Load the processor and model for the NbAiLab Whisper model
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # If no pad token is set, fall back to the EOS token
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input, padding/truncating to a fixed length of 5 tokens;
        # the result holds input_ids and attention_mask tensors
        tokenize_batch = processor.tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
        print(f'{tokenize_batch=}')
        print('Done')
    finally:
        # Restore the original stdout even if loading or tokenization fails
        sys.stdout = old_stdout

    return new_stdout.getvalue()
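
# Side note: the same capture can be expressed with the standard library's
# redirect_stdout context manager, which restores sys.stdout automatically,
# even on error. A minimal illustrative sketch; the helper name capture_demo
# is ours and is not used by the interface below, which calls test_eos_pad.
from contextlib import redirect_stdout

def capture_demo():
    buf = io.StringIO()
    with redirect_stdout(buf):
        print("captured")
    return buf.getvalue()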
iface = gr.Interface(
    fn=test_eos_pad,
    inputs=[],
    outputs=gr.Textbox(label="Results"),
    title="Check EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function."
)
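# With inputs=[], test_eos_pad takes no arguments; the rendered UI is a single
# submit button plus the "Results" textbox.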
if __name__ == "__main__":
    iface.launch()
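# On Hugging Face Spaces, launch() needs no arguments; when running locally,
# options such as iface.launch(share=True) can expose a temporary public URL.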