#%%
from huggingface_hub import login
from transformers import pipeline
from datasets import load_dataset
import gradio as gr
import os

login(token=os.environ['hf_token'])

atco2 = load_dataset('jlvdoorn/atco2-asr', split='validation')
atcosim = load_dataset('jlvdoorn/atcosim', split='validation')

examples = [
    atco2[0]['audio'], atcosim[0]['audio'],
    atco2[1]['audio'], atcosim[1]['audio'],
    atco2[2]['audio'], atcosim[2]['audio'],
]
examples_labels = ['Example ' + str(i) for i in range(len(examples))]
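# Note: each entry in `examples` is the decoded 'audio' dict from the datasets Audio feature
# (keys 'path', 'array' and 'sampling_rate'); depending on the Gradio version, file-path
# examples (e.g. the dict's 'path' field) may be required instead.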

## Try to load a local model if available
# try:
#     whisper = pipeline(model='/mnt/projects/whisper/WhisperANSP/Models/whisper-large-v2-atco2-asr-atcosim-ANSP-3h1m', task='automatic-speech-recognition')
#     ttl = 'Whisper Large v2 - ATCO2-ATCOSIM-ANSP'
#     dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2, ATCOSIM and ANSP datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'

# except:
#     whisper = pipeline(model='jlvdoorn/whisper-large-v2-atco2-asr-atcosim')
#     ttl = 'Whisper Large v2 - ATCO2-ATCOSIM'
#     dis = 'This demo will transcribe ATC audio files by using the Whisper Large v2 model fine-tuned on the ATCO2 and ATCOSIM datasets. \n \n Further it uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'

bert_atco_ner = pipeline(model='Jzuluaga/bert-base-ner-atc-en-atco2-1h')
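# The token-classification pipeline returns one dict per token, roughly of the form
# {'entity': 'B-callsign', 'word': 'klm', 'score': 0.99, 'index': 1, 'start': 0, 'end': 3}
# (illustrative values; the exact label set depends on the model). extractCallSignCommand()
# below groups the words by whether their entity tag mentions 'callsign', 'command' or 'value'.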

whisper_v2 = pipeline(model='jlvdoorn/whisper-large-v2-atco2-asr-atcosim')
whisper_v3 = pipeline(model='jlvdoorn/whisper-large-v3-atco2-asr-atcosim')
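# By default these pipelines run on CPU; if a GPU is available, passing device=0 to
# pipeline() (optional) speeds up Whisper Large inference considerably.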
#%%
def transcribe(audio_file, audio_mic, model_version):
    if model_version == 'large-v2':
        whisper = whisper_v2
        ttl = 'Whisper Large v2 - ATCO2-ATCOSIM'
        dis = 'This demo transcribes ATC audio files using the Whisper Large v2 model fine-tuned on the ATCO2 and ATCOSIM datasets. \n \n It also uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This NER model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'
    elif model_version == 'large-v3':
        whisper = whisper_v3
        ttl = 'Whisper Large v3 - ATCO2-ATCOSIM'
        dis = 'This demo transcribes ATC audio files using the Whisper Large v3 model fine-tuned on the ATCO2 and ATCOSIM datasets. \n \n It also uses a Named Entity Recognition model to extract callsigns, commands and values from the transcription. \n This NER model is based on Google\'s BERT model and fine-tuned on the ATCO2 dataset.'
    if audio_mic is not None:
        return whisper(audio_mic)['text']
    elif audio_file is not None:
        return whisper(audio_file)['text']
    else:
        return 'There was no audio to transcribe...'
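
# Illustrative usage (hypothetical file path):
#   transcribe('my_atc_recording.wav', None, 'large-v3')
# returns the transcription of the uploaded file as plain text.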

#%%
def extractCallSignCommand(transcription):
    if isinstance(transcription, str):
        result = bert_atco_ner(transcription)
        callsigns = []
        commands = []
        values = []
        for item in result:
            if 'callsign' in item['entity']:
                callsigns.append(item['word'])
            if 'command' in item['entity']:
                commands.append(item['word'])
            if 'value' in item['entity']:
                values.append(item['word'])
                
        return 'Callsigns: ' + ', '.join(callsigns) + '\nCommands: ' + ', '.join(commands) + '\nValues: ' + ', '.join(values)
    else:
        return 'There was no transcription to extract a callsign or command from...'
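
# Illustrative usage (output depends on the NER model's actual predictions):
#   extractCallSignCommand('klm one two three descend flight level eight zero')
# returns a single string grouping the words tagged as callsigns, commands and values.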

#%%
def transcribeAndExtract(audio_file, audio_mic, transcribe_only, model_version):
    transcription = transcribe(audio_file, audio_mic, model_version)
    if not transcribe_only:
        callSignCommandValues = extractCallSignCommand(transcription)
    else:
        callSignCommandValues = ''
    return transcription, callSignCommandValues

#%%
iface = gr.Interface(
        fn=transcribeAndExtract,
        inputs=[
            gr.Audio(source='upload', type='filepath', interactive=True),
            gr.Audio(source='microphone', type='filepath'),
            gr.Checkbox(label='Transcribe only', value=False),
            gr.Dropdown(choices=['large-v2', 'large-v3'], value='large-v3', label='Whisper model version'),
            ],
        outputs=[gr.Text(label='Transcription'), gr.Text(label='Callsigns, commands and values')],
        title='Whisper ATC - Large v3',
        description='Transcribe and extract',
        # Each example must provide a value for every input (file, mic, transcribe-only, model version)
        examples=[[ex, None, False, 'large-v3'] for ex in examples],
)

#%%
#iface.launch(server_name='0.0.0.0', server_port=9000)
iface.launch()