File size: 1,619 Bytes
73f862c
 
 
eb7a9c3
73f862c
 
62a6cf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eb7a9c3
 
 
 
 
 
 
 
 
62a6cf8
eb7a9c3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live
import torch
import gradio as gr

asr_model = "openai/whisper-tiny.en"
nlp_model = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Use the GPU when one is available; otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Fix: the original line referenced undefined names `model_id` and `device`
# (NameError on first run); the ASR checkpoint is `asr_model` defined above.
pipe = pipeline("automatic-speech-recognition", model=asr_model, device=device)
sampling_rate = pipe.feature_extractor.sampling_rate

chunk_length_s = 10  # seconds of audio per transcription chunk returned
stream_chunk_s = 1  # seconds between checks of the microphone for new audio
mic = ffmpeg_microphone_live(
    sampling_rate=sampling_rate,
    chunk_length_s=chunk_length_s,
    stream_chunk_s=stream_chunk_s,
)

def listen_print_loop(responses):
    """Consume streaming ASR responses until non-empty text arrives.

    Echoes the first non-empty transcription to stdout using a carriage
    return (so it overwrites the current console line) and returns that
    text. A response with empty text that is marked final (``"partial"``
    is falsy) emits a bare newline. Returns ``None`` when the stream is
    exhausted without producing any text.
    """
    for chunk in responses:
        text = chunk["text"]
        if text:
            print(text, end="\r")
            return text
        if not chunk["partial"]:
            print("")


# Zero-shot intent classifier (NLI-based) using the model named at the top
# of the script; scores a transcript against the fixed set of light-control
# intents below without any task-specific training.
classifier = pipeline("zero-shot-classification", model=nlp_model)
candidate_labels = ["dim the light", "turn on light fully", "turn off light fully", "raise the light", "nothing about light"]


# Main voice-command loop: transcribe the next microphone chunk, then
# classify the transcript against the candidate light-control intents
# and report the top-scoring one.
# NOTE(review): this loop never terminates, so the Gradio interface set up
# below is unreachable — confirm whether the console loop or the web UI is
# the intended entry point.
while True:
    # pipe(mic) streams partial/final transcriptions; listen_print_loop
    # returns the first non-empty transcript (or None if the stream ends).
    context = listen_print_loop(pipe(mic))
    print(context)
    # multi_label=False: scores across labels are normalized to sum to 1.
    output = classifier(context, candidate_labels, multi_label=False)
    top_label = output['labels'][0]
    top_score = output['scores'][0]
    print(f"Top Prediction: {top_label} with a score of {top_score:.2f}")
    

def transcribe(audio_path):
    """Transcribe one recorded audio clip and return its text.

    Parameters
    ----------
    audio_path : str or None
        Filesystem path to the recording produced by the Gradio
        microphone widget (``type="filepath"``); None/empty when the
        user submits without recording.

    Returns
    -------
    str
        The transcription, or "" when no audio was provided.
    """
    # Guard: submitting with no recording hands the fn an empty path.
    if not audio_path:
        return ""
    return pipe(audio_path)["text"]


# Fix: `transcribe` was referenced but never defined (NameError), and
# `gr.inputs.Audio(source=...)` belongs to the removed Gradio 2.x API —
# current Gradio exposes components at top level with `sources=[...]`.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Real-Time ASR Transcription",
    description="Speak into the microphone and get the real-time transcription.",
)

iface.launch()