Spaces:

mskov
/

test

Runtime error

App Files Files Community

test / app.py

mskov

Update app.py

68ed0e8 over 1 year ago

raw

history blame

3.99 kB

	import os
	import sys
	os.system("pip install transformers==4.27.0")
	os.system("pip install numpy==1.23")
	from transformers import pipeline, WhisperModel, WhisperTokenizer, WhisperFeatureExtractor, AutoFeatureExtractor, AutoProcessor, WhisperConfig, WhisperProcessor, WhisperForConditionalGeneration
	os.system("pip install jiwer")
	from jiwer import wer
	os.system("pip install datasets[audio]")
	from evaluate import evaluator, load
	import evaluate
	from datasets import load_dataset, Audio, disable_caching, set_caching_enabled
	import gradio as gr
	import torch
	import re

	# Turn off the `datasets` cache so mapped results are recomputed on each run.
	# `disable_caching()` is the current API; the deprecated
	# `set_caching_enabled(False)` call that preceded it was redundant.
	disable_caching()

	# Optional secret. It is not referenced again in the visible part of this
	# file, so a missing variable should not abort start-up with a KeyError.
	huggingface_token = os.environ.get("huggingface_token")

	# Single place for the checkpoint name used by the pipeline, processor and model.
	MODEL_ID = "mskov/whisper-small-esc50"

	# Pipeline used by the interactive demo.
	pipe = pipeline(model=MODEL_ID)
	print(pipe)

	# Processor (feature extractor + tokenizer) and evaluation data, resampled
	# to 16 kHz by the `Audio` cast.
	processor = WhisperProcessor.from_pretrained(MODEL_ID)
	dataset = load_dataset("ashraq/esc50", split="train").cast_column("audio", Audio(sampling_rate=16000))

	# print(dataset, "and at 0[audio][array] ", dataset[0]["audio"]["array"], type(dataset[0]["audio"]["array"]), "and at audio : ", dataset[0]["audio"])

	model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
	# Put the model on the GPU when one is available, so generation does not run
	# with the weights left on the CPU while the inputs are sent to CUDA.
	if torch.cuda.is_available():
	    model = model.to("cuda")

	# Evaluate the model
	# model.eval()
	#print("model.eval ", model.eval())


	def map_to_pred(batch):
	    """Transcribe one dataset example and attach normalized texts to it.

	    Args:
	        batch: A single example holding an "audio" entry (with "array" and
	            "sampling_rate") and a "category" label.

	    Returns:
	        The same mapping with two added keys: "reference" (the normalized
	        category label) and "prediction" (the normalized model transcription).
	    """
	    audio = batch["audio"]
	    input_features = processor(
	        audio["array"],
	        sampling_rate=audio["sampling_rate"],
	        return_tensors="pt",
	    ).input_features
	    batch["reference"] = processor.tokenizer._normalize(batch['category'])

	    with torch.no_grad():
	        # Send the features to whatever device the model lives on. A hard-coded
	        # "cuda" fails when the model is on the CPU or no GPU is present.
	        predicted_ids = model.generate(input_features.to(model.device))[0]
	    transcription = processor.decode(predicted_ids)
	    batch["prediction"] = processor.tokenizer._normalize(transcription)
	    return batch

	# Run map_to_pred over every example so each one gains "reference" and
	# "prediction" columns.
	result = dataset.map(map_to_pred)

	# NOTE: this rebinds the module-level name `wer` (imported from jiwer above)
	# to the metric object returned by `load`.
	wer = load("wer")
	wer_fraction = wer.compute(
	    references=result["reference"],
	    predictions=result["prediction"],
	)
	# Report the score as a percentage.
	print(100 * wer_fraction)

	# Disabled code: everything between the triple quotes below is a bare string
	# literal, so none of it executes. It holds an alternative map_to_pred that
	# also strips [bracketed] text from the category and calls generate() without
	# moving the inputs to another device. It calls preprocess_transcription,
	# which is not defined in the visible part of this file.
	'''
	def map_to_pred(batch):
	cleaned_transcription = re.sub(r'\[[^\]]+\]', '', batch['category']).strip()
	print("cleaned transcript", cleaned_transcription)
	cleaned_transcription = preprocess_transcription(batch['category'])
	normalized_transcription = processor.tokenizer._normalize(cleaned_transcription)

	audio = batch["audio"]
	input_features = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt").input_features
	batch["reference"] = processor.tokenizer._normalize(batch['category'])


	with torch.no_grad():
	predicted_ids = model.generate(input_features)[0]

	transcription = processor.decode(predicted_ids)
	batch["prediction"] = processor.tokenizer._normalize(transcription)
	return batch

	result = dataset.map(map_to_pred)

	wer = load("wer")
	print(100 * wer.compute(references=result["reference"], predictions=result["prediction"]))
	'''
	# Disabled code: a second bare string literal that is never executed. The
	# snippet decodes the argmax of the model logits and scores it with WER. It
	# relies on input_ids, attention_mask and tokenizer, none of which are
	# defined in the visible part of this file.
	'''
	with torch.no_grad():
	outputs = model(input_ids=input_ids, attention_mask=attention_mask)
	print("outputs ", outputs)

	# Convert predicted token IDs back to text
	predicted_text = tokenizer.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)

	# Get ground truth labels from the dataset
	labels = dataset["audio"] # Replace "labels" with the appropriate key in your dataset
	print("labels are ", labels)

	# Compute WER
	wer = load("wer")
	wer_score = wer(labels, predicted_text)

	# Print or return WER score
	print(f"Word Error Rate (WER): {wer_score}")
	'''

	def transcribe(audio):
	    """Return the text that the module-level `pipe` pipeline produces for `audio`.

	    Args:
	        audio: The value supplied by the Gradio audio input, which is
	            configured with type="filepath".

	    Returns:
	        The "text" field of the pipeline result, matching the single "text"
	        output declared on the interface.
	    """
	    # The original returned `text, test`; `test` is not defined anywhere, so
	    # every call raised NameError.
	    return pipe(audio)["text"]

	# Microphone recording, passed to `transcribe` as a file path.
	microphone_input = gr.Audio(source="microphone", type="filepath")

	# Single-input, single-text-output demo around `transcribe`.
	iface = gr.Interface(fn=transcribe, inputs=microphone_input, outputs="text", title="Whisper Small Miso Test")

	# Start serving the UI.
	iface.launch()


	# Disabled debugging snippet kept as a bare string literal, so it is never
	# executed. It prints the shape of the model's last hidden state and reads
	# `inputs`, which is not defined in the visible part of this file.
	'''
	print("check check")
	print(inputs)
	input_features = inputs.input_features
	decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
	last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
	list(last_hidden_state.shape)
	print(list(last_hidden_state.shape))
	'''