Spaces:
Running
Running
File size: 4,759 Bytes
ab2b897 49c643d bffc737 137a8df bffc737 49c643d 137a8df db11291 125f0d6 2ce74f8 125f0d6 981560f 2ce74f8 f091ddf bffc737 5655bbd ab2b897 981560f 5655bbd 981560f 5655bbd 981560f f091ddf bffc737 ab2b897 f091ddf bffc737 f091ddf bffc737 ab2b897 981560f f091ddf bffc737 ab2b897 bffc737 f091ddf 981560f 5655bbd 981560f f091ddf 10f6ed1 981560f 137a8df bffc737 ab2b897 e81b0e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
#%%
from huggingface_hub import login
from transformers import pipeline
from datasets import load_dataset
import gradio as gr
import os
# Authenticate against the Hugging Face Hub with the token stored in the
# `hf_token` environment variable (raises KeyError if it is not set).
login(token=os.environ['hf_token'])

# Validation splits used only as a source of example audio clips for the UI.
atco2 = load_dataset('jlvdoorn/atco2', split='validation')
atcosim = load_dataset('jlvdoorn/atcosim', split='validation')

# First three clips of each dataset, interleaved: atco2[0], atcosim[0], atco2[1], ...
examples = [
    dataset[index]['audio']
    for index in range(3)
    for dataset in (atco2, atcosim)
]

# One label per example. `range(len(...))` is required here: iterating over the
# bare integer returned by `len()` raises TypeError.
examples_labels = ['Example ' + str(i) for i in range(len(examples))]
## Try to load a local model if available
# try:
# whisper = pipeline(model='/mnt/projects/whisper/WhisperANSP/Models/whisper-large-v2-atco2-asr-atcosim-ANSP-3h1m', task='automatic-speech-recognition')
# ttl = 'Whisper Large v2 - ATCO2-ATCOSIM-ANSP'
# dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2, ATCOSIM and ANSP datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'
# except:
# whisper = pipeline(model='jlvdoorn/whisper-large-v2-atco2-asr-atcosim')
# ttl = 'Whisper Large v2 - ATCO2-ATCOSIM'
# dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2 and ATCOSIM datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'
# Named-entity-recognition pipeline used to tag callsigns, commands and values
# in a transcription.
bert_atco_ner = pipeline(model='Jzuluaga/bert-base-ner-atc-en-atco2-1h')

# Both Whisper checkpoints are loaded up front, in v2-then-v3 order, so the
# model can be chosen per request without reloading.
whisper_v2, whisper_v3 = (
    pipeline(model=model_id)
    for model_id in (
        'jlvdoorn/whisper-large-v2-atco2-asr-atcosim',
        'jlvdoorn/whisper-large-v3-atco2-asr-atcosim',
    )
)
#%%
def transcribe(audio_file, audio_mic, model_version):
    """Transcribe an audio clip with the selected Whisper pipeline.

    Args:
        audio_file: Uploaded audio input, or None when nothing was uploaded.
        audio_mic: Microphone audio input, or None when nothing was recorded.
            Takes precedence over ``audio_file`` when both are given.
        model_version: Either ``'large-v2'`` or ``'large-v3'``.

    Returns:
        The ``'text'`` field of the pipeline result, or a placeholder message
        when neither audio input is provided.

    Raises:
        ValueError: If audio is provided but ``model_version`` is not one of
            the supported versions.
    """
    # The microphone recording wins over the uploaded file.
    audio = audio_mic if audio_mic is not None else audio_file
    if audio is None:
        return 'There was no audio to transcribe...'

    # Resolve the pipeline only once it is actually needed, and fail loudly on
    # an unknown version instead of hitting an unbound local variable.
    if model_version == 'large-v2':
        whisper = whisper_v2
    elif model_version == 'large-v3':
        whisper = whisper_v3
    else:
        raise ValueError(
            f"Unsupported model version: {model_version!r} "
            "(expected 'large-v2' or 'large-v3')"
        )

    return whisper(audio)['text']
#%%
def extractCallSignCommand(transcription):
    """Extract callsigns, commands and values from a transcription.

    Runs the ``bert_atco_ner`` pipeline on the text and groups each returned
    item's ``'word'`` by whether its ``'entity'`` label contains ``callsign``,
    ``command`` or ``value``.

    Args:
        transcription: The transcribed text. Anything that is not a string is
            treated as "no transcription".

    Returns:
        A three-line summary (``Callsigns: ...``, ``Commands: ...``,
        ``Values: ...``) with comma-separated words, or a placeholder message
        when ``transcription`` is not a string.
    """
    if not isinstance(transcription, str):
        return 'There was no transcription to extract a callsign or command from...'

    # Keyed by the substring looked for in each item's entity label.
    grouped = {'callsign': [], 'command': [], 'value': []}
    for item in bert_atco_ner(transcription):
        # Checks are independent, so a label matching several keys is
        # recorded under each of them.
        for label, words in grouped.items():
            if label in item['entity']:
                words.append(item['word'])

    return (
        'Callsigns: ' + ', '.join(grouped['callsign'])
        + '\nCommands: ' + ', '.join(grouped['command'])
        + '\nValues: ' + ', '.join(grouped['value'])
    )
#%%
def transcribeAndExtract(audio_file, audio_mic, transcribe_only, model_version):
    """Transcribe audio and, unless disabled, extract entities from the text.

    Args:
        audio_file: Uploaded audio input, or None.
        audio_mic: Microphone audio input, or None.
        transcribe_only: When truthy, skip entity extraction.
        model_version: Whisper model version forwarded to ``transcribe``.

    Returns:
        A ``(transcription, extraction)`` tuple of strings. ``extraction`` is
        an empty string when ``transcribe_only`` is truthy.
    """
    transcription = transcribe(audio_file, audio_mic, model_version)
    if transcribe_only:
        return transcription, ''

    # Without any audio, `transcription` is only a placeholder message. Passing
    # it on would run entity recognition on that message, so hand over None
    # and let the extractor report that there was nothing to analyse.
    has_audio = audio_file is not None or audio_mic is not None
    callSignCommandValues = extractCallSignCommand(transcription if has_audio else None)
    return transcription, callSignCommandValues
#%%
# Gradio UI: two audio inputs (upload and microphone), a switch to skip entity
# extraction, and a model selector; outputs are the transcription and the
# extracted callsigns/commands/values.
iface = gr.Interface(
    fn=transcribeAndExtract,
    inputs=[
        gr.Audio(source='upload', type='filepath', interactive=True),
        gr.Audio(source='microphone', type='filepath'),
        # `value` sets the initial state, consistent with the Dropdown below.
        gr.Checkbox(label='Transcribe only', value=False),
        gr.Dropdown(choices=['large-v2', 'large-v3'], value='large-v3', label='Whisper model version'),
    ],
    outputs=[gr.Text(label='Transcription'), gr.Text(label='Callsigns, commands and values')],
    title='Whisper ATC - Large v3',
    description='Transcribe and extract',
    # NOTE(review): with four inputs, Gradio may expect each example to be a
    # list with one entry per input - confirm the example format.
    examples=examples,
)
#%%
# Alternative for a fixed host/port deployment:
#iface.launch(server_name='0.0.0.0', server_port=9000)
# Start the Gradio server with default settings.
iface.launch()