# Source: Hugging Face Space "camparchimedes/eos_pad_cal" (status: Running).
# File size: 1,642 bytes.
# Commit hashes shown in the page's blame column for this file:
# eb1522b, 9f56bc0, 0a1c65b, dc1f5e4.

import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import io
import sys

def test_eos_pad():
    """Report the EOS/PAD token setup of the NB-Whisper tokenizer.

    Loads the processor and model for ``NbAiLab/nb-whisper-large-verbatim``,
    assigns the EOS token as PAD token when the tokenizer has no PAD token id,
    tokenizes the one-character sample ``'a'`` padded/truncated to length 5,
    and prints the token settings and the tokenized batch.

    Returns:
        str: Everything written to stdout while the function ran (the
        printed token settings, the tokenized batch, and ``Done``).

    Raises:
        Any exception raised while loading the processor/model or while
        tokenizing propagates to the caller; stdout is restored first.
    """
    # Local import keeps this block self-contained with respect to the
    # file-level import list.
    from contextlib import redirect_stdout

    raw_text_batch = 'a'

    # Capture print statements. The context manager puts the previous
    # sys.stdout back even when loading or tokenizing raises, so a failed
    # call cannot leave the whole process printing into a dead buffer.
    captured = io.StringIO()
    with redirect_stdout(captured):
        # Load the processor and model for the NbAiLab Whisper model
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # Check if pad token is set, if not, set it to eos token
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        # The model is not used by the tokenizer checks below; it is still
        # loaded and moved to the device exactly as before.
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input batch
        tokenize_batch = processor.tokenizer(
            raw_text_batch,
            padding="max_length",
            max_length=5,
            truncation=True,
            return_tensors="pt",
        )
        print(f'{tokenize_batch=}')
        print('Done')

    # Return the captured output
    return captured.getvalue()

# Gradio front end: an interface with no input components that calls
# test_eos_pad and shows its return value in a textbox labelled "Results".
iface = gr.Interface(
    title="Check EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function.",
    fn=test_eos_pad,
    inputs=[],
    outputs=gr.Textbox(label="Results"),
)

# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()