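# Gradio demo for Konkani (gom) speech recognition using a Whisper-small model
# fine-tuned as "thak123/gom-stt-v3". Short clips (up to ~30 s) are transcribed
# directly with the model; longer clips fall back to the chunking ASR pipeline.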
import os

import gradio as gr
import librosa
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)

# Tokenizer from the base checkpoint; the fine-tuned model reuses its vocabulary.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", task="transcribe")

# ASR pipeline built on the fine-tuned Konkani checkpoint.
# (Earlier experiments: "thak123/whisper-small-LDC-V1", "thak123/whisper-small-gom".)
pipe = pipeline(
    model="thak123/gom-stt-v3",
    task="automatic-speech-recognition",
    tokenizer=tokenizer,
)

# pipe.model.config.forced_decoder_ids = (
#         pipe.tokenizer.get_decoder_prompt_ids(
#             language="marathi", task="transcribe"
#         )
#     )

# def transcribe_speech(filepath):
#     # waveform, sample_rate = torchaudio.load(filepath)

#     # Resample the audio signal to 16k sampling rate
#     # resampler = torchaudio.transforms.Resample(sample_rate, 16000)
#     # waveform_16k = resampler(waveform)

#     # Save the resampled audio signal to a new file
#     # torchaudio.save(filepath, waveform_16k, 16000)    
#     output = pipe(
#         filepath,
#         max_new_tokens=3,
#         generate_kwargs={
#             "task": "transcribe",
#             # "language": "konkani",
#         },  # update with the language you've fine-tuned on
#         chunk_length_s=30,
#         batch_size=8,
#          # sampling_rate=16000,
#         # padding=True
#     )
#     print(output)
#     return output["text"]


def transcribe_speech(filepath):
    """Transcribe a single audio file and return the recognised text."""
    # Load the fine-tuned model and the base processor (re-loaded on each call, as
    # in the original script; they could also be created once at module level).
    model = WhisperForConditionalGeneration.from_pretrained("thak123/gom-stt-v3")
    processor = WhisperProcessor.from_pretrained("openai/whisper-small")

    # Load the audio and resample it to the 16 kHz rate Whisper expects.
    audio, _ = librosa.load(filepath, sr=16000)
    input_features = processor(
        audio,
        sampling_rate=16000,
        return_tensors="pt",
        truncation=False,
        padding="max_length",
    ).input_features

    # Whisper's encoder takes at most 3000 mel frames (about 30 s of audio).
    if input_features.shape[-1] > 3000:
        # Longer recordings: fall back to the chunking pipeline, which splits the
        # audio into 30 s windows and stitches the per-chunk transcriptions together.
        print("Splitting audio required")
        result = pipe(filepath, chunk_length_s=30, batch_size=8)
        output = result["text"]
    else:
        # Short recordings: run the model directly on the padded features.
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        output = transcription[0]
        print(transcription)

    return output
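
# Quick local sanity check (hypothetical invocation; assumes one of the bundled
# example clips from the audio/ folder below sits next to this script):
# print(transcribe_speech("audio/ekdonteen.flac"))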
    
demo = gr.Blocks()

# Tab 1: record speech from the microphone and transcribe it.
mic_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.components.Textbox(),
)

# Tab 2: upload an audio file; a few bundled Konkani clips are provided as examples.
file_transcribe = gr.Interface(
    fn=transcribe_speech,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.components.Textbox(),
    examples=[
        [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
        [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
        [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
        [os.path.join(os.path.dirname("."), "audio/panaji1920-9.mp3")],
    ],
)

with demo:
    gr.TabbedInterface(
        [mic_transcribe, file_transcribe],
        ["Transcribe Microphone", "Transcribe Audio File"],
    )

demo.launch(debug=True)
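
# When run locally (e.g. `python app.py`), Gradio prints a local URL to open in the
# browser; debug=True keeps the process in the foreground and surfaces tracebacks
# from transcribe_speech in the console. If deployed as a Hugging Face Space, the
# same script is picked up and served automatically (assumed deployment target).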

# # def transcribe(audio):
# #     # text = pipe(audio)["text"]
# #     # pipe(audio)
# #     text = pipe(audio)
# #     print("op",text)
# #     return text#pipe(audio) #text

# # iface = gr.Interface(
# #     fn=transcribe, 
# #     inputs=[gr.Audio(sources=["microphone", "upload"])], 
# #     outputs="text",
# #     examples=[
# #         [os.path.join(os.path.dirname("."),"audio/chalyaami.mp3")],
# #         [os.path.join(os.path.dirname("."),"audio/ekdonteen.flac")],
# #         [os.path.join(os.path.dirname("."),"audio/heyatachadjaale.mp3")],
# #     ],
# #     title="Whisper Konkani",
# #     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# # )


# # iface.launch()


# from transformers import WhisperTokenizer, pipeline
# import gradio as gr
# import os

# tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="marathi", task="transcribe")

# pipe = pipeline(model="thak123/gom-stt-v3", task="automatic-speech-recognition", tokenizer=tokenizer)

# def transcribe(audio):
#     result = pipe(audio)
#     text = result[0]['text']
#     print("op", text)
#     return text

# iface = gr.Interface(
#     fn=transcribe,
#     inputs=[gr.Audio(sources=["microphone", "upload"])],
#     outputs="text",
#     examples=[
#         [os.path.join(os.path.dirname("."), "audio/chalyaami.mp3")],
#         [os.path.join(os.path.dirname("."), "audio/ekdonteen.flac")],
#         [os.path.join(os.path.dirname("."), "audio/heyatachadjaale.mp3")],
#     ],
#     title="Whisper Konkani",
#     description="Realtime demo for Konkani speech recognition using a fine-tuned Whisper small model.",
# )

# iface.launch()