ales committed
Commit feb2a2b
1 parent: 0c27fd9
Files changed (3)
  1. app.py +44 -0
  2. pipeline.py +66 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,44 @@
+ from typing import Tuple
+
+ import numpy as np
+
+ import torch
+ from torchaudio.transforms import Resample
+
+ from huggingface_hub import hf_hub_download
+
+ import gradio as gr
+
+ from pipeline import PreTrainedPipeline
+
+
+ HF_HUB_URL = 'ales/wav2vec2-cv-be'
+ LM_HUB_FP = 'language_model/cv8be_5gram.bin'
+
+
+ def main(rate_audio_tuple: Tuple[int, np.ndarray]):
+     sampling_rate, audio = rate_audio_tuple
+
+     # resample audio to 16kHz
+     resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
+     audio_resampled = resampler(torch.tensor(audio)).numpy().flatten()
+
+     # download Language Model from HF Hub
+     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
+
+     # init pipeline
+     pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
+
+     # recognize speech
+     text_recognized = pipeline(inputs=audio_resampled)['text'][0]
+
+     return text_recognized
+
+
+ iface = gr.Interface(
+     fn=main,
+     inputs='microphone',
+     outputs="text"
+ )
+
+ iface.launch()
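For context, a minimal sketch of the resampling step in isolation (not part of the commit): Gradio's microphone input hands main a (sampling_rate, waveform) tuple, and the waveform is resampled to the 16 kHz rate the model expects. The 48 kHz rate and the random waveform below are illustrative assumptions only.

    import numpy as np
    import torch
    from torchaudio.transforms import Resample

    sampling_rate = 48_000                                        # assumed microphone rate, for illustration
    audio = np.random.randn(sampling_rate).astype(np.float32)     # one second of noise as a stand-in recording
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_16k = resampler(torch.tensor(audio)).numpy().flatten()
    print(audio_16k.shape)                                        # (16000,)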
pipeline.py ADDED
@@ -0,0 +1,66 @@
+ import numpy as np
+
+ from typing import Dict
+
+ import torch
+ import pyctcdecode
+
+ from transformers import (
+     Wav2Vec2Processor,
+     Wav2Vec2ProcessorWithLM,
+     Wav2Vec2ForCTC,
+ )
+
+
+ class PreTrainedPipeline():
+
+     def __init__(self, model_path: str, language_model_fp: str):
+         self.language_model_fp = language_model_fp
+
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+         self.model = Wav2Vec2ForCTC.from_pretrained(model_path)
+         self.model.to(self.device)
+
+         processor = Wav2Vec2Processor.from_pretrained(model_path)
+         self.sampling_rate = processor.feature_extractor.sampling_rate
+
+         vocab = processor.tokenizer.get_vocab()
+         sorted_vocab_dict = [(char, ix) for char, ix in sorted(vocab.items(), key=lambda item: item[1])]
+
+         self.decoder = pyctcdecode.build_ctcdecoder(
+             labels=[x[0] for x in sorted_vocab_dict],
+             kenlm_model_path=self.language_model_fp,
+         )
+
+         self.processor_with_lm = Wav2Vec2ProcessorWithLM(
+             feature_extractor=processor.feature_extractor,
+             tokenizer=processor.tokenizer,
+             decoder=self.decoder
+         )
+
+     def __call__(self, inputs: np.ndarray) -> Dict[str, str]:
+         """
+         Args:
+             inputs (:obj:`np.ndarray`):
+                 The raw waveform of the received audio. Expected to be sampled at 16 kHz.
+         Return:
+             A :obj:`dict` like {"text": "XXX"} containing the text
+             detected from the input audio.
+         """
+
+         input_values = self.processor_with_lm(
+             inputs, return_tensors="pt",
+             sampling_rate=self.sampling_rate
+         )['input_values']
+
+         with torch.no_grad():
+             # input_values is a (1, sequence_length) torch tensor; move it to the model's device
+             input_values = input_values.to(self.device)
+             model_outs = self.model(input_values)
+             logits = model_outs.logits.cpu().detach().numpy()
+
+         text_predicted = self.processor_with_lm.batch_decode(logits)['text']
+
+         return {
+             "text": text_predicted
+         }
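A minimal usage sketch of the class above, assuming network access to the ales/wav2vec2-cv-be repo on the HF Hub; the one-second silent clip is only a placeholder input and is not part of the commit.

    import numpy as np
    from huggingface_hub import hf_hub_download
    from pipeline import PreTrainedPipeline

    lm_fp = hf_hub_download(repo_id='ales/wav2vec2-cv-be', filename='language_model/cv8be_5gram.bin')
    asr = PreTrainedPipeline(model_path='ales/wav2vec2-cv-be', language_model_fp=lm_fp)

    audio = np.zeros(16_000, dtype=np.float32)   # one second of silence at 16 kHz, placeholder input
    print(asr(inputs=audio)['text'])             # -> list with one decoded string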
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ transformers==4.17.0
+ pyctcdecode==0.3.0
+ numpy
+ https://github.com/kpu/kenlm/archive/master.zip