import io
import sys

import torch
import gradio as gr
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
def test_eos_pad():
    raw_text_batch = 'a'

    # Redirect stdout so the print output can be returned to the Gradio UI
    old_stdout = sys.stdout
    new_stdout = io.StringIO()
    sys.stdout = new_stdout

    try:
        # Load the processor and model for the NbAiLab Whisper model
        processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
        model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")

        # If no pad token is set, fall back to the EOS token
        if processor.tokenizer.pad_token_id is None:
            processor.tokenizer.pad_token = processor.tokenizer.eos_token

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)

        print(f'{processor.tokenizer.eos_token=}')
        print(f'{processor.tokenizer.eos_token_id=}')
        print(f'{processor.tokenizer.pad_token=}')
        print(f'{processor.tokenizer.pad_token_id=}')

        # Tokenize the input, padding/truncating to a fixed length of 5 tokens;
        # the result holds input_ids and attention_mask tensors
        tokenize_batch = processor.tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
        print(f'{tokenize_batch=}')
        print('Done')
    finally:
        # Restore the original stdout even if loading or tokenization fails
        sys.stdout = old_stdout

    return new_stdout.getvalue()
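
# Side note: the same capture can be expressed with the standard library's
# redirect_stdout context manager, which restores sys.stdout automatically,
# even on error. A minimal illustrative sketch; the helper name capture_demo
# is ours and is not used by the interface below, which calls test_eos_pad.
from contextlib import redirect_stdout

def capture_demo():
    buf = io.StringIO()
    with redirect_stdout(buf):
        print("captured")
    return buf.getvalue()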
iface = gr.Interface(
    fn=test_eos_pad,
    inputs=[],
    outputs=gr.Textbox(label="Results"),
    title="Check EOS and PAD Tokens",
    description="This Gradio interface displays the output of the test_eos_pad function."
)
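# With inputs=[], test_eos_pad takes no arguments; the rendered UI is a single
# submit button plus the "Results" textbox.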
if __name__ == "__main__":
    iface.launch()
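# On Hugging Face Spaces, launch() needs no arguments; when running locally,
# options such as iface.launch(share=True) can expose a temporary public URL.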