test / app.py
mskov's picture
Update app.py
68ed0e8
raw
history blame
3.99 kB
import os
import sys
# Dependencies are installed at import time through shell commands. Each
# os.system call runs before the import statement that follows it, so the
# order of these lines matters.
os.system("pip install transformers==4.27.0")
os.system("pip install numpy==1.23")
from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor, AutoFeatureExtractor, AutoProcessor, WhisperConfig, WhisperProcessor, WhisperForConditionalGeneration
os.system("pip install jiwer")
# NOTE(review): this `wer` function is rebound later in the file by
# `wer = load("wer")`, so the jiwer version is not the one used for scoring.
from jiwer import wer
os.system("pip install datasets[audio]")
# NOTE(review): no pip install for `evaluate` is visible in this file;
# presumably it is provided by the environment -- confirm.
from evaluate import evaluator, load
import evaluate
from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
import gradio as gr
import torch
import re
# Switch off the datasets cache through both entry points.
set_caching_enabled(False)
disable_caching()

# Read the access token from the environment; a missing variable raises
# KeyError right here.
huggingface_token = os.environ["huggingface_token"]

# One checkpoint id is shared by the pipeline, the processor and the model.
MODEL_ID = "mskov/whisper-small-esc50"

pipe = pipeline(model=MODEL_ID)
print(pipe)
processor = WhisperProcessor.from_pretrained(MODEL_ID)

# Load the training split, then cast its audio column to a 16 kHz Audio feature.
dataset = load_dataset("ashraq/esc50", split="train")
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
# Evaluate the model
# model.eval()
#print("model.eval ", model.eval())
# Remove brackets and extra spaces
def map_to_pred(batch):
    """Transcribe one dataset example and attach normalized WER inputs.

    Args:
        batch: A single example. Reads ``batch["audio"]`` (its ``"array"``
            and ``"sampling_rate"`` entries) and ``batch["category"]``.

    Returns:
        The same mapping with two keys added: ``"reference"`` (the
        normalized category text) and ``"prediction"`` (the normalized
        model transcription).
    """
    audio = batch["audio"]
    input_features = processor(
        audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_tensors="pt",
    ).input_features
    batch["reference"] = processor.tokenizer._normalize(batch["category"])

    with torch.no_grad():
        # Send the features to whichever device the model lives on instead
        # of a hard-coded "cuda": the model is never moved to the GPU in
        # this file, so a fixed device mismatches it (and fails outright on
        # CPU-only hosts).
        predicted_ids = model.generate(input_features.to(model.device))[0]

    transcription = processor.decode(predicted_ids)
    batch["prediction"] = processor.tokenizer._normalize(transcription)
    return batch
# Run inference over every example, then score the normalized text pairs
# with the WER metric obtained from `load`.
result = dataset.map(map_to_pred)
wer = load("wer")
wer_fraction = wer.compute(
    references=result["reference"],
    predictions=result["prediction"],
)
# Scaled by 100 before printing.
print(100 * wer_fraction)
# Disabled earlier draft of map_to_pred and the WER run, kept as a bare
# string literal so none of it executes. It calls preprocess_transcription,
# which is not defined in the visible part of this file, and it passes the
# input features to model.generate without moving them to a device.
'''
def map_to_pred(batch):
cleaned_transcription = re.sub(r'\[[^\]]+\]', '', batch['category']).strip()
print("cleaned transcript", cleaned_transcription)
cleaned_transcription = preprocess_transcription(batch['category'])
normalized_transcription = processor.tokenizer._normalize(cleaned_transcription)
audio = batch["audio"]
input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
batch["reference"] = processor.tokenizer._normalize(batch['category'])
with torch.no_grad():
predicted_ids = model.generate(input_features)[0]
transcription = processor.decode(predicted_ids)
batch["prediction"] = processor.tokenizer._normalize(transcription)
return batch
result = dataset.map(map_to_pred)
wer = load("wer")
print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
'''
# Disabled draft of a manual WER computation, kept as a bare string literal
# so it never runs. It refers to input_ids, attention_mask and tokenizer,
# none of which are defined in the visible part of this file.
'''
with torch.no_grad():
outputs = model(input_ids=input_ids, attention_mask=attention_mask)
print("outputs ", outputs)
# Convert predicted token IDs back to text
predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
# Get ground truth labels from the dataset
labels = dataset["audio"] # Replace "labels" with the appropriate key in your dataset
print("labels are ", labels)
# Compute WER
wer = load("wer")
wer_score = wer(labels, predicted_text)
# Print or return WER score
print(f"Word Error Rate (WER): {wer_score}")
'''
def transcribe(audio):
    """Transcribe an audio recording with the module-level ASR pipeline.

    Args:
        audio: Input handed straight to ``pipe``. The Gradio interface in
            this file is configured with ``type="filepath"``, so it supplies
            a path to the recorded file.

    Returns:
        The recognized text as a single string.
    """
    # Return only the text. The previous second return value was the
    # undefined name ``test`` (a NameError on every call), and the interface
    # declares exactly one "text" output.
    return pipe(audio)["text"]
# Build the microphone input first; recordings reach `transcribe` as file paths.
mic_input = gr.Audio(source="microphone", type="filepath")

# Single-input, single-text-output demo around `transcribe`.
iface = gr.Interface(
    fn=transcribe,
    inputs=mic_input,
    outputs="text",
    title="Whisper Small Miso Test",
)
iface.launch()
# Disabled debugging snippet, kept as a bare string literal so it never
# executes. It printed the shape of the model's last hidden state; `inputs`
# is not defined in the visible part of this file.
'''
print("check check")
print(inputs)
input_features = inputs.input_features
decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
list(last_hidden_state.shape)
print(list(last_hidden_state.shape))
'''