mutisya committed on
Commit
edc9375
·
verified ·
1 Parent(s): 3833c15

Delete app_previous.py

Browse files
Files changed (1) hide show
  1. app_previous.py +0 -70
app_previous.py DELETED
@@ -1,70 +0,0 @@
1
- import os
2
- import gradio as gr
3
- from pydub import AudioSegment
4
- import pyaudioconvert as pac
5
- import torch
6
- import torchaudio
7
- import sox
8
- from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor,Wav2Vec2ProcessorWithLM
9
-
10
-
11
def convert(audio):
    """Normalize a recording to 16-bit mono WAV, rewriting it in place.

    Returns True when the path ends in mp3/wav/ogg and was converted;
    False for any other extension (file is left untouched).
    """
    path = audio
    is_supported = path.endswith("mp3") or path.endswith("wav") or path.endswith("ogg")
    if not is_supported:
        return False
    if path.endswith("mp3"):
        AudioSegment.from_mp3(path).export(path, format="wav")
    elif path.endswith("ogg"):
        AudioSegment.from_ogg(path).export(path, format="wav")
    # wav input needs no container rewrite; only the 16-bit mono pass below.
    pac.convert_wav_to_16bit_mono(path, path)
    return True
24
-
25
-
26
def parse_transcription_with_lm(logits):
    """Beam-search decode CTC logits with the LM-backed processor.

    Strips the '<s>' token from the best hypothesis before returning it.
    """
    decoded = processor_with_LM.batch_decode(logits.cpu().numpy())
    best_hypothesis = decoded.text[0]
    return best_hypothesis.replace('<s>', '')
31
-
32
def parse_transcription(logits):
    """Greedy-decode CTC logits to text with the plain processor."""
    best_ids = logits.argmax(dim=-1)
    return processor.decode(best_ids[0], skip_special_tokens=True)
36
-
37
def transcribe(audio_path, applyLM):
    """Transcribe an audio file to text with the wav2vec2 model.

    Loads the file, resamples it to the 16 kHz rate the model expects,
    runs a forward pass, then decodes either with the LM (beam search)
    or greedily.

    Args:
        audio_path: path to an audio file torchaudio can read.
        applyLM: if True, decode with the LM-backed processor.

    Returns:
        The recognized text as a string.
    """
    speech_array, sampling_rate = torchaudio.load(audio_path)
    speech = torchaudio.functional.resample(
        speech_array, orig_freq=sampling_rate, new_freq=16000
    ).squeeze().numpy()
    # NOTE: a dead commented-out convert()/reload path was removed here; it
    # was a bare triple-quoted string built and discarded on every call.

    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():  # inference only; skip gradient bookkeeping
        logits = model(inputs.input_values).logits

    return parse_transcription_with_lm(logits) if applyLM else parse_transcription(logits)
54
-
55
# Hugging Face auth: read the token from the "key" env var; falling back to
# True tells transformers to use the locally cached login token, if any.
auth_token = os.environ.get("key") or True
# Fine-tuned wav2vec2-300m checkpoint with a bundled LM ("kik" presumably
# denotes the Kikuyu language — confirm with the model card).
model_id = "mutisya/wav2vec2-300m-kik-t22-1k-ft-withLM"
# Plain CTC processor (tokenizer + feature extractor) for greedy decoding.
processor = Wav2Vec2Processor.from_pretrained(model_id, use_auth_token=auth_token)
# LM-backed processor for beam-search decoding with the attached language model.
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id, use_auth_token=auth_token)
# Acoustic model producing per-frame CTC logits.
model = Wav2Vec2ForCTC.from_pretrained(model_id, use_auth_token=auth_token)
60
-
61
-
62
# Web UI: a microphone recording (passed as a file path) plus an "Apply LM"
# checkbox feed transcribe(audio_path, applyLM); output is a single textbox.
# NOTE(review): `source=`/`optional=` on gr.Audio and `gr.outputs.Textbox`
# are pre-4.x Gradio APIs — confirm the pinned gradio version.
gradio_ui = gr.Interface(
    fn=transcribe,
    title="Speech Recognition",
    description="",
    inputs=[gr.Audio(source="microphone", type="filepath", optional=True, label="Record from microphone"),
            gr.Checkbox(label="Apply LM", value=False)],
    outputs=[gr.outputs.Textbox(label="Recognized speech")]
    )
# Queueing serializes requests so concurrent users don't contend for the model.
gradio_ui.launch(enable_queue=True)