ales committed
Commit feb2a2b
1 parent: 0c27fd9
Files changed (3)
  1. app.py +44 -0
  2. pipeline.py +66 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Tuple
+
+ import numpy as np
+
+ import torch
+ from torchaudio.transforms import Resample
+
+ from huggingface_hub import hf_hub_download
+
+ import gradio as gr
+
+ from pipeline import PreTrainedPipeline
+
+
+ HF_HUB_URL = 'ales/wav2vec2-cv-be'
+ LM_HUB_FP = 'language_model/cv8be_5gram.bin'
+
+
+ def main(rate_audio_tuple: Tuple[int, np.ndarray]):
+     sampling_rate, audio = rate_audio_tuple
+
+     # resample audio to 16kHz
+     resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
+     audio_resampled = resampler(torch.tensor(audio)).numpy().flatten()
+
+     # download Language Model from HF Hub
+     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
+
+     # init pipeline
+     pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
+
+     # recognize speech
+     text_recognized = pipeline(inputs=audio_resampled)['text'][0]
+
+     return text_recognized
+
+
+ iface = gr.Interface(
+     fn=main,
+     inputs='microphone',
+     outputs="text"
+ )
+
+ iface.launch()
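For context, a minimal sketch of the resampling step in isolation (not part of the commit): Gradio's microphone input hands main a (sampling_rate, waveform) tuple, and the waveform is resampled to the 16 kHz rate the model expects. The 48 kHz rate and the random waveform below are illustrative assumptions only.

    import numpy as np
    import torch
    from torchaudio.transforms import Resample

    sampling_rate = 48_000                                        # assumed microphone rate, for illustration
    audio = np.random.randn(sampling_rate).astype(np.float32)     # one second of noise as a stand-in recording
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_16k = resampler(torch.tensor(audio)).numpy().flatten()
    print(audio_16k.shape)                                        # (16000,)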
pipeline.py ADDED
@@ -0,0 +1,66 @@
+ import numpy as np
+
+ from typing import Dict
+
+ import torch
+ import pyctcdecode
+
+ from transformers import (
+     Wav2Vec2Processor,
+     Wav2Vec2ProcessorWithLM,
+     Wav2Vec2ForCTC,
+ )
+
+
+ class PreTrainedPipeline():
+
+     def __init__(self, model_path: str, language_model_fp: str):
+         self.language_model_fp = language_model_fp
+
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
+         self.model.to(self.device)
+
+         processor = Wav2Vec2Processor.from_pretrained(model_path)
+         self.sampling_rate = processor.feature_extractor.sampling_rate
+
+         vocab = processor.tokenizer.get_vocab()
+         sorted_vocab_dict = [(char, ix) for char, ix in sorted(vocab.items(), key=lambda item: item[1])]
+
+         self.decoder = pyctcdecode.build_ctcdecoder(
+             labels=[x[0] for x in sorted_vocab_dict],
+             kenlm_model_path=self.language_model_fp,
+         )
+
+         self.processor_with_lm = Wav2Vec2ProcessorWithLM(
+             feature_extractor=processor.feature_extractor,
+             tokenizer=processor.tokenizer,
+             decoder=self.decoder
+         )
+
+     def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
+         """
+         Args:
+             inputs (:obj:`np.ndarray`):
+                 The raw waveform of the received audio. Expected to be sampled at 16 kHz.
+         Return:
+             A :obj:`dict` like {"text": "XXX"} containing the text
+             detected from the input audio.
+         """
+
+         input_values = self.processor_with_lm(
+             inputs, return_tensors="pt",
+             sampling_rate=self.sampling_rate
+         )['input_values']
+
+         with torch.no_grad():
+             # input_values is a (1, sequence_length) torch tensor; move it to the model's device
+             input_values = input_values.to(self.device)
+             model_outs = self.model(input_values)
+             logits = model_outs.logits.cpu().detach().numpy()
+
+         text_predicted = self.processor_with_lm.batch_decode(logits)['text']
+
+         return {
+             "text": text_predicted
+         }
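A minimal usage sketch of the class above, assuming network access to the ales/wav2vec2-cv-be repo on the HF Hub; the one-second silent clip is only a placeholder input and is not part of the commit.

    import numpy as np
    from huggingface_hub import hf_hub_download
    from pipeline import PreTrainedPipeline

    lm_fp = hf_hub_download(repo_id='ales/wav2vec2-cv-be', filename='language_model/cv8be_5gram.bin')
    asr = PreTrainedPipeline(model_path='ales/wav2vec2-cv-be', language_model_fp=lm_fp)

    audio = np.zeros(16_000, dtype=np.float32)   # one second of silence at 16 kHz, placeholder input
    print(asr(inputs=audio)['text'])             # -> list with one decoded string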
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers==4.17.0
+ pyctcdecode==0.3.0
+ numpy
+ https://github.com/kpu/kenlm/archive/master.zip