File size: 3,525 Bytes
98b12fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fabf0de
 
 
98b12fe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from faster_whisper import WhisperModel
import torch
import numpy as np

import os
import wave

def model_init():
    """Load the faster-whisper large-v3 model on the best available device.

    Returns:
        WhisperModel: FP16 on GPU when CUDA is available, otherwise INT8
        on CPU to keep the memory footprint manageable.
    """
    model_size = "large-v3"
    if torch.cuda.is_available():
        # FP16 on GPU. (compute_type="int8_float16" is a lower-memory
        # alternative if VRAM is tight.)
        loaded = WhisperModel(model_size, device="cuda", compute_type="float16")
        print("--------------")
        print("Model runs on GPU")
        print("--------------")
    else:
        # CPU fallback with INT8 quantization.
        loaded = WhisperModel(model_size, device="cpu", compute_type="int8")
    return loaded

# Load the model once at module import so both Gradio tabs share it.
model = model_init()

import time

def transcribe_moon(stream, new_chunk):
    """Streaming transcription callback for the live-microphone tab.

    Args:
        stream: Previously accumulated audio samples (np.ndarray), or None
            on the first chunk (Gradio "state" input).
        new_chunk: ``(sample_rate, samples)`` tuple delivered by
            ``gr.Audio(streaming=True)``.

    Returns:
        Tuple of (updated stream state, transcript with per-segment
        timestamps, detected-language string, execution-time string).
    """
    start_time = time.time()  # Start timing
    sr, y = new_chunk
    y = y.astype(np.float32)

    # Peak-normalize the chunk. Guard against an all-silence chunk:
    # the unguarded `y /= np.max(np.abs(y))` divides by zero and turns
    # the whole accumulated stream into NaNs.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Accumulate samples across calls so the model re-transcribes the
    # whole utterance, not just the latest chunk.
    stream = y if stream is None else np.concatenate([stream, y])

    # Perform the transcription using the shared model.
    segments, info = model.transcribe(stream)

    # Compile the transcript with timestamps, one segment per line.
    transcript = "\n".join(
        f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments
    )
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    end_time = time.time()  # End timing
    execution_time = f"Execution time: {end_time - start_time:.2f} seconds"

    return stream, transcript, language_info, execution_time

def transcribe(audio_file):
    """Transcribe an uploaded audio file and report the detected language.

    Args:
        audio_file: Path to the audio file (Gradio ``type="filepath"``).

    Returns:
        Tuple of (transcript with per-segment timestamps,
        detected-language string, execution-time string).
    """
    started = time.time()

    # Run the shared model with beam search and VAD-based silence filtering.
    segments, info = model.transcribe(
        audio_file,
        beam_size=5,
        vad_filter=True,
        vad_parameters={'min_silence_duration_ms': 500},
    )

    # One "[start -> end] text" line per recognized segment.
    lines = [f"[{seg.start:.2f}s -> {seg.end:.2f}s] {seg.text}" for seg in segments]
    transcript = "\n".join(lines)
    language_info = f"Detected language: {info.language} with probability {info.language_probability:.2f}"

    execution_time = f"Execution time: {time.time() - started:.2f} seconds"

    return transcript, language_info, execution_time


# --- Gradio UI wiring ------------------------------------------------------

# Tab 1: upload (or record) a complete clip and transcribe it in one shot.
input_audio = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload or Record Audio")

file_upload_interface = gr.Interface(
    fn=transcribe,
    inputs=input_audio,
    outputs=["text", "text", "text"],
    live=True,
    title="Speak to Transcript",
    description="Upload an audio file to transcribe and detect the spoken language."
)

# Tab 2: live microphone streaming. "state" carries the accumulated sample
# buffer between chunks. Reuse input_audio_mic here instead of re-declaring
# an equivalent gr.Audio inline (it was previously defined but never used).
input_audio_mic = gr.Audio(sources=["microphone"], label="Record Audio", streaming=True)
streaming_interface = gr.Interface(
    fn=transcribe_moon,
    inputs=["state", input_audio_mic],
    outputs=["state", "text", "text", "text"],
    live=True,
)

# Combine both interfaces in a single Gradio app using Tabs.
tabbed_interface = gr.TabbedInterface(
    interface_list=[file_upload_interface, streaming_interface],
    tab_names=["Upload File", "Live Stream"]
)

tabbed_interface.launch(debug=True)