Spaces:

camparchimedes
/

eos_pad_cal

Running

camparchimedes committed on Aug 22, 2024

Commit

eb1522b

verified ·

1 Parent(s): 0a1c65b

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,32 +1,34 @@
-def test_eos_pad():
-    from datasets import load_dataset
-    import torch
-    from transformers import GPT2Tokenizer, GPT2LMHeadModel
-    raw_text_batch = 'a'
-    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-    # print(f'{tokenizer.eos_token=}')
-    # print(f'{tokenizer.eos_token_id=}')
-    # print(f'{tokenizer.pad_token=}')
-    # print(f'{tokenizer.pad_token_id=}')
-    # print(f'{raw_text_batch=}')
-    # tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
-    # print(f'{tokenize_batch=}')
-    if tokenizer.pad_token_id is None:
-        tokenizer.pad_token = tokenizer.eos_token
-    probe_network = GPT2LMHeadModel.from_pretrained("gpt2")
-    device = torch.device(f"cuda:{0}" if torch.cuda.is_available() else "cpu")
-    probe_network = probe_network.to(device)
-    print(f'{tokenizer.eos_token=}')
-    print(f'{tokenizer.eos_token_id=}')
-    print(f'{tokenizer.pad_token=}')
-    print(f'{tokenizer.pad_token_id=}')
-    print(f'{raw_text_batch=}')
-    tokenize_batch = tokenizer(raw_text_batch, padding="max_length", max_length=5, truncation=True, return_tensors="pt")
-    print(f'{tokenize_batch=}')
-    print('Done')

+import torch
+import gradio as gr
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+# Load the processor and model
+processor = AutoProcessor.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
+model = AutoModelForSpeechSeq2Seq.from_pretrained("NbAiLab/nb-whisper-large-verbatim")
+# Ensure the pad token is set
+if processor.tokenizer.pad_token_id is None:
+    processor.tokenizer.pad_token = processor.tokenizer.eos_token
+# Move the model to the appropriate device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+def transcribe_text(raw_text):
+    """Tokenize ``raw_text`` and return the encoding as JSON-friendly data.
+
+    The text is padded/truncated to a fixed length of 5 tokens so the
+    EOS/PAD ids produced by the tokenizer are easy to inspect.
+
+    Args:
+        raw_text: Text entered in the Gradio textbox.
+
+    Returns:
+        A plain ``dict`` mapping each field of the tokenizer output to
+        nested Python lists.
+    """
+    tokenize_batch = processor.tokenizer(
+        raw_text,
+        padding="max_length",
+        max_length=5,
+        truncation=True,
+        return_tensors="pt",
+    )
+    # The "json" output component cannot serialize a BatchEncoding that
+    # holds torch tensors, so convert every field to plain lists first.
+    return {name: tensor.tolist() for name, tensor in tokenize_batch.items()}
+# Gradio interface
+iface = gr.Interface(
+    fn=transcribe_text,
+    # The legacy gr.inputs namespace is deprecated in Gradio 3.x and removed
+    # in 4.x; the top-level gr.Textbox component accepts the same arguments.
+    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
+    outputs="json",
+    title="Whisper Model Tokenization",
+    description="Test the EOS and PAD tokens for NbAiLab/nb-whisper-large-verbatim model."
+)
+if __name__ == "__main__":
+    iface.launch()