Update asr.py
asr.py
CHANGED
@@ -3,84 +3,115 @@ from transformers import Wav2Vec2ForCTC, AutoProcessor
import torch
import numpy as np
from pathlib import Path
-import concurrent.futures
from torch.cuda.amp import autocast
from huggingface_hub import hf_hub_download
from pyctcdecode import build_ctcdecoder
import json
-import resampy #

ASR_SAMPLING_RATE = 16_000
-CHUNK_LENGTH_S = 60  # Adjust
-
-BATCH_SIZE = 4  # Batch size for processing chunks within process_chunk
-
ASR_LANGUAGES = {}
-
    for line in f:
        iso, name = line.split(" ", 1)
        ASR_LANGUAGES[iso.strip()] = name.strip()

MODEL_ID = "facebook/mms-1b-all"
-
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

-#
-model.eval()

-
-

-# Dictionary to cache language model decoders for each language
-cached_decoders = {}

def load_audio(audio_data):
    if isinstance(audio_data, tuple):
        sr, audio_samples = audio_data
        audio_samples = (audio_samples / 32768.0).astype(np.float32)
        if sr != ASR_SAMPLING_RATE:
-            audio_samples = resampy.resample(audio_samples, sr, ASR_SAMPLING_RATE)
    elif isinstance(audio_data, np.ndarray):
        audio_samples = audio_data
    elif isinstance(audio_data, str):
        audio_samples, sr = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)
        if sr != ASR_SAMPLING_RATE:
-            audio_samples = resampy.resample(audio_samples, sr, ASR_SAMPLING_RATE)
    else:
-        raise ValueError(f"Invalid Audio Input: {type(audio_data)}")
    return audio_samples

def process_chunk(chunks, device, decoder=None):
-    batch_size = BATCH_SIZE  # Local batch size
    transcriptions = []

-    for i in range(0, len(chunks), batch_size):
-        batch = chunks[i : i + batch_size]
        inputs = processor(
            batch,
            sampling_rate=ASR_SAMPLING_RATE,
            return_tensors="pt",
            padding=True,
            truncation=True,
-        )
-
-
        with torch.no_grad():
            with autocast():
                outputs = model(**inputs).logits

        if decoder:
-
-            texts = decoder.decode_batch(outputs.cpu().numpy())  # Check for batch support
            transcriptions.extend(texts)
        else:
            ids = torch.argmax(outputs, dim=-1)
-            for id_tensor in ids:
-                transcriptions.append(processor.decode(id_tensor))

    return " ".join(transcriptions)

def transcribe(audio_data=None, lang="eng (English)", use_lm_decoder=False):
    if audio_data is None or (isinstance(audio_data, np.ndarray) and audio_data.size == 0):
        return "<<ERROR: Empty Audio Input>>"
@@ -92,97 +123,35 @@ def transcribe(audio_data=None, lang="eng (English)", use_lm_decoder=False):

    lang_code = lang.split()[0]

-    # Load adapter efficiently
    if lang_code not in loaded_adapters:
        processor.tokenizer.set_target_lang(lang_code)
        model.load_adapter(lang_code)
-        loaded_adapters[lang_code] = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

-    # Create chunks
    chunk_length = int(CHUNK_LENGTH_S * ASR_SAMPLING_RATE)
    chunks = [
-        audio_samples[i : i + chunk_length]
        for i in range(0, len(audio_samples), chunk_length)
    ]

-
-    if use_lm_decoder and lang_code in cached_decoders:
-
-
-
-
-
-        lm_decoding_configfile = hf_hub_download(
-            repo_id="facebook/mms-cclms",
-            filename="decoding_config.json",
-            subfolder="mms-1b-all",
-        )
-
-        with open(lm_decoding_configfile) as f:
-            lm_decoding_config = json.loads(f.read())
-
-        if lang_code in lm_decoding_config:
-            decoding_config = lm_decoding_config[lang_code]
-
-            lm_file = hf_hub_download(
-                repo_id="facebook/mms-cclms",
-                filename=decoding_config["lmfile"].rsplit("/", 1)[1],
-                subfolder=decoding_config["lmfile"].rsplit("/", 1)[0],
-            )
-            token_file = hf_hub_download(
-                repo_id="facebook/mms-cclms",
-                filename=decoding_config["tokensfile"].rsplit("/", 1)[1],
-                subfolder=decoding_config["tokensfile"].rsplit("/", 1)[0],
-            )
-            lexicon_file = None
-            if decoding_config["lexiconfile"] is not None:
-                lexicon_file = hf_hub_download(
-                    repo_id="facebook/mms-cclms",
-                    filename=decoding_config["lexiconfile"].rsplit("/", 1)[1],
-                    subfolder=decoding_config["lexiconfile"].rsplit("/", 1)[0],
-                )
-
-            vocab_dict = processor.tokenizer.get_vocab()
-            sort_vocab = sorted((value, key) for (key, value) in vocab_dict.items())
-            vocab = [x[1] for x in sort_vocab]
-            vocab_list = vocab
-            # Update special tokens
-            vocab_list[vocab_list.index("<s>")] = "<s>"
-            vocab_list[vocab_list.index("</s>")] = "</s>"
-            vocab_list[vocab_list.index("<pad>")] = "<pad>"
-
-            decoder = build_ctcdecoder(
-                vocab_list,
-                kenlm_model_path=lm_file,  # either .arpa or .bin file
-                alpha=float(decoding_config["alpha"]),
-                beta=float(decoding_config["beta"]),
-            )
-
-            # Cache the decoder for this language
-            cached_decoders[lang_code] = decoder
-
-    # Process chunks with the selected batch size
-    transcription = process_chunk(chunks, device, decoder)
-
-    return transcription
-
-# Example usage (Make sure the file paths are correct)
-ASR_EXAMPLES = [
-    ["upload/english.mp3", "eng (English)"],  # Update with your file paths
-    # ["upload/tamil.mp3", "tam (Tamil)"],
-    # ["upload/burmese.mp3", "mya (Burmese)"],
-]

-# Example to transcribe with LM decoding (for supported languages like English)
-# result_with_lm = transcribe("upload/english.mp3", "eng (English)", use_lm_decoder=True)
-# print(f"Transcription with LM decoding: {result_with_lm}")

-# Example
-
-

for audio_path, lang in ASR_EXAMPLES:
    try:

@@ -198,4 +167,4 @@ for audio_path, lang in ASR_EXAMPLES:
        else:
            print(f"Error: File not found: {audio_path}")
    except Exception as e:
-        print(f"An error occurred while processing {audio_path}: {e}")
import torch
import numpy as np
from pathlib import Path
from torch.cuda.amp import autocast
from huggingface_hub import hf_hub_download
from pyctcdecode import build_ctcdecoder
import json
+import resampy  # For efficient resampling

ASR_SAMPLING_RATE = 16_000
+CHUNK_LENGTH_S = 60  # Adjust chunk length in seconds
+BATCH_SIZE = 4  # Batch size for processing chunks
ASR_LANGUAGES = {}
+
+# Load available ASR languages
+with open("data/asr/all_langs.tsv", "r") as f:
    for line in f:
        iso, name = line.split(" ", 1)
        ASR_LANGUAGES[iso.strip()] = name.strip()

MODEL_ID = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

+# Ensure the model is in evaluation mode for inference
+model.eval()

+loaded_adapters = {}  # Store loaded adapters
+cached_decoders = {}  # Cache language model decoders for each language


def load_audio(audio_data):
    if isinstance(audio_data, tuple):
        sr, audio_samples = audio_data
        audio_samples = (audio_samples / 32768.0).astype(np.float32)
        if sr != ASR_SAMPLING_RATE:
+            audio_samples = resampy.resample(audio_samples, sr, ASR_SAMPLING_RATE)
    elif isinstance(audio_data, np.ndarray):
        audio_samples = audio_data
    elif isinstance(audio_data, str):
        audio_samples, sr = librosa.load(audio_data, sr=ASR_SAMPLING_RATE, mono=True)
        if sr != ASR_SAMPLING_RATE:
+            audio_samples = resampy.resample(audio_samples, sr, ASR_SAMPLING_RATE)
    else:
+        raise ValueError(f"Invalid Audio Input: {type(audio_data)}")
    return audio_samples

+
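Note: a quick, self-contained check of the resampling path in load_audio above. The 8 kHz source rate and the synthetic tone are illustrative assumptions, not part of the commit:

import numpy as np
import resampy

sr = 8000  # hypothetical microphone rate, as in a Gradio-style (sr, samples) tuple
samples = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
floats = (samples / 32768.0).astype(np.float32)   # same int16 -> float32 normalization as load_audio
resampled = resampy.resample(floats, sr, 16_000)  # resampy's (signal, sr_orig, sr_new) signature
print(resampled.shape)                            # (16000,): length scales by 16000/8000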
def process_chunk(chunks, device, decoder=None):
    transcriptions = []
+    max_length = CHUNK_LENGTH_S * ASR_SAMPLING_RATE  # Maximum input length for truncation
+
+    for i in range(0, len(chunks), BATCH_SIZE):
+        batch = chunks[i:i + BATCH_SIZE]
+        batch = [chunk[:max_length] for chunk in batch]  # Truncate each chunk to max_length

        inputs = processor(
            batch,
            sampling_rate=ASR_SAMPLING_RATE,
            return_tensors="pt",
            padding=True,
            truncation=True,
+            max_length=max_length,
+        ).to(device)
+
        with torch.no_grad():
            with autocast():
                outputs = model(**inputs).logits

        if decoder:
+            texts = decoder.decode_batch(outputs.cpu().numpy())
            transcriptions.extend(texts)
        else:
            ids = torch.argmax(outputs, dim=-1)
+            transcriptions.extend(processor.batch_decode(ids))

    return " ".join(transcriptions)

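Note: in the pyctcdecode releases I am aware of, BeamSearchDecoderCTC.decode_batch takes a multiprocessing pool as its first argument, so the decoder.decode_batch(outputs.cpu().numpy()) call above may raise a TypeError. A minimal per-chunk fallback, assuming decoder and outputs as in process_chunk:

logits = outputs.cpu().numpy()  # (batch, time, vocab) after padding
texts = [decoder.decode(frame_logits) for frame_logits in logits]  # decode() takes one (time, vocab) array
transcriptions.extend(texts)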
+
+def load_decoder_for_language(lang_code):
+    lm_decoding_configfile = hf_hub_download(
+        repo_id="facebook/mms-cclms",
+        filename="decoding_config.json",
+        subfolder="mms-1b-all",
+    )
+
+    with open(lm_decoding_configfile) as f:
+        lm_decoding_config = json.load(f)
+
+    if lang_code in lm_decoding_config:
+        decoding_config = lm_decoding_config[lang_code]
+
+        lm_file = hf_hub_download(
+            repo_id="facebook/mms-cclms",
+            filename=decoding_config["lmfile"].rsplit("/", 1)[1],
+            subfolder=decoding_config["lmfile"].rsplit("/", 1)[0],
+        )
+        vocab_dict = processor.tokenizer.get_vocab()
+        vocab_list = [key for key, _ in sorted(vocab_dict.items(), key=lambda item: item[1])]
+        vocab_list[vocab_list.index("<s>")] = "<s>"
+        vocab_list[vocab_list.index("</s>")] = "</s>"
+
+        return build_ctcdecoder(
+            vocab_list,
+            kenlm_model_path=lm_file,
+            alpha=float(decoding_config["alpha"]),
+            beta=float(decoding_config["beta"]),
+        )
+    else:
+        raise ValueError(f"No LM configuration found for language code: {lang_code}")
+
+
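Note: for intuition about the vocabulary plumbing in load_decoder_for_language, build_ctcdecoder can also be exercised without a KenLM file; the toy alphabet and logits below are purely illustrative:

import numpy as np
from pyctcdecode import build_ctcdecoder

labels = ["<pad>", "a", "b", " "]        # index order must match the acoustic model's vocab
toy_decoder = build_ctcdecoder(labels)   # omitting kenlm_model_path gives plain beam search
logits = np.full((5, len(labels)), np.log(0.01), dtype=np.float32)
logits[:, 1] = np.log(0.97)              # make "a" the best label at every frame
print(toy_decoder.decode(logits))        # CTC collapses the repeated frames to "a"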
def transcribe(audio_data=None, lang="eng (English)", use_lm_decoder=False):
    if audio_data is None or (isinstance(audio_data, np.ndarray) and audio_data.size == 0):
        return "<<ERROR: Empty Audio Input>>"
...

    lang_code = lang.split()[0]

    if lang_code not in loaded_adapters:
        processor.tokenizer.set_target_lang(lang_code)
        model.load_adapter(lang_code)
+        loaded_adapters[lang_code] = True

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    chunk_length = int(CHUNK_LENGTH_S * ASR_SAMPLING_RATE)
    chunks = [
+        audio_samples[i:i + chunk_length]
        for i in range(0, len(audio_samples), chunk_length)
    ]

+    decoder = cached_decoders.get(lang_code) if use_lm_decoder else None
+    if use_lm_decoder and lang_code not in cached_decoders:
+        try:
+            decoder = load_decoder_for_language(lang_code)
+            cached_decoders[lang_code] = decoder
+        except Exception as e:
+            print(f"<<WARNING: Could not load LM decoder for {lang_code}: {str(e)}>>")
+
+    return process_chunk(chunks, device, decoder)
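Note: a usage sketch mirroring the commented-out examples from the previous revision; the audio path is a placeholder taken from those examples, and when no LM decoder can be loaded the call falls back to greedy CTC decoding:

text = transcribe("upload/english.mp3", "eng (English)")
print(f"Greedy transcription: {text}")

text_lm = transcribe("upload/english.mp3", "eng (English)", use_lm_decoder=True)
print(f"Transcription with LM decoding: {text_lm}")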
+# Example usage
+ASR_EXAMPLES = [
+    ["upload/english.mp3", "eng (English)"],
+]

for audio_path, lang in ASR_EXAMPLES:
    try:
...
        else:
            print(f"Error: File not found: {audio_path}")
    except Exception as e:
+        print(f"An error occurred while processing {audio_path}: {e}")
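Note: a portability point on the mixed-precision context used in process_chunk: recent PyTorch releases deprecate the torch.cuda.amp.autocast import in favor of torch.amp.autocast, and the CUDA-flavored context only helps when a GPU is present. A guarded variant, written as an assumption about the runtime rather than as part of the commit:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with torch.no_grad():
    # torch.amp.autocast is the non-deprecated spelling; disable it on CPU-only hosts
    with torch.amp.autocast(device_type="cuda", enabled=(device.type == "cuda")):
        ...  # outputs = model(**inputs).logits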