### Align text and audio using Montreal Forced Aligner (MFA)

In [None]:
%%capture
# Refresh the apt package index and upgrade pip itself.
# %%capture (above) hides the output of both commands.
!apt update -y
!pip install -U pip

In [None]:
%%capture
%%bash
# Download the infore_16k_denoised archive and unpack it into $data_root.
# Fail fast: without this, a failed `cd` or `wget` would let the remaining
# commands run in the wrong directory or against a stale/partial archive.
set -euo pipefail

data_root="./infore_16k_denoised"
mkdir -p "$data_root"
cd "$data_root"
wget https://huggingface.co/datasets/ntt123/infore/resolve/main/infore_16k_denoised.zip -O infore.zip
# -o: overwrite existing files without prompting, so the cell can be re-run
# non-interactively after a previous extraction.
unzip -o infore.zip

In [None]:
from pathlib import Path

# Collect every whitespace-separated, lower-cased token of the transcripts
# into /content/words.txt, one token per line (duplicates are kept).
# Transcripts that have no .wav file with the same stem are skipped.
txt_files = sorted(Path("./infore_16k_denoised").glob("*.txt"))

# Context managers guarantee that the output file is flushed/closed and that
# each transcript handle is released, even if an error interrupts the loop.
with open("/content/words.txt", "w", encoding="utf-8") as f:
    for txt_file in txt_files:
        wav_file = txt_file.with_suffix(".wav")
        if not wav_file.exists():
            continue
        with open(txt_file, "r", encoding="utf-8") as transcript:
            line = transcript.read()
        for word in line.strip().lower().split():
            f.write(word)
            f.write("\n")

In [None]:
# Words that are skipped when the lexicon is built (membership is tested with
# `in`, so order and repeated entries do not matter).
# NOTE: every entry must be followed by a comma. Adjacent string literals are
# silently concatenated by Python ("hlv" "ts" used to become "hlvts", leaving
# both "hlv" and "ts" un-blacklisted).
black_list = (
    []
    + ["q", "adn", "h", "stress", "b", "k", "mark", "gas", "cs", "test", "l", "hiv"]
    + ["v", "d", "c", "p", "martin", "visa", "euro", "laser", "x", "real", "shop"]
    + ["studio", "kelvin", "đt", "pop", "rock", "gara", "karaoke", "đicr", "đigiúp"]
    + ["khmer", "ii", "s", "tr", "xhcn", "casino", "guitar", "sex", "oxi", "radio"]
    + ["qúy", "asean", "hlv", "ts", "video", "virus", "usd", "robot", "ph", "album"]
    + ["s", "kg", "km", "g", "tr", "đ", "ak", "d", "m", "n"]
)

In [None]:
# Build /content/lexicon.txt from /content/words.txt: one line per unique
# word, formatted as "<word>\t<characters of the word separated by spaces>".
# Words listed in `black_list` are left out.

# words.txt was written as UTF-8, so read it (and write the lexicon) as UTF-8
# explicitly instead of relying on the platform's default encoding.
with open("/content/words.txt", encoding="utf-8") as words_file:
    ws = words_file.readlines()

# Set lookup instead of scanning the list once per word.
excluded = set(black_list)

with open("/content/lexicon.txt", "w", encoding="utf-8") as f:
    for w in sorted(set(ws)):
        w = w.strip()
        # A blank line would otherwise produce a malformed "\t" entry.
        if not w or w in excluded:
            continue

        # this is a hack to match phoneme set in the vietTTS repo
        p = " ".join(w)
        f.write(f"{w}\t{p}\n")

In [None]:
%%writefile install_mfa.sh
#!/bin/bash

## a script to install Montreal Forced Aligner (MFA)
##
## Usage: bash install_mfa.sh [ROOT_DIR]     (ROOT_DIR defaults to /tmp/mfa)
## Installs Miniconda3 into ROOT_DIR/miniconda3 and creates a conda
## environment named "aligner" containing montreal-forced-aligner.

# Abort on the first failing command, on unset variables and on pipe failures,
# so a failed download/installation is not followed by further steps.
set -euo pipefail

root_dir="${1:-/tmp/mfa}"
mkdir -p "$root_dir"
cd "$root_dir"
# Resolve to an absolute path: after the cd above, a relative ROOT_DIR would
# otherwise be applied a second time in the paths used below.
root_dir="$(pwd)"

# download miniconda3
installer="Miniconda3-latest-Linux-x86_64.sh"
# -O: overwrite an earlier download; without it wget saves a re-download under
# a ".1" suffix and the stale installer would be the one executed.
wget -q --show-progress "https://repo.anaconda.com/miniconda/$installer" -O "$installer"
bash "$installer" -b -p "$root_dir/miniconda3" -f

#install MFA
"$root_dir/miniconda3/bin/conda" create -n aligner -c conda-forge montreal-forced-aligner=2.0.0rc7 -y

echo -e "\n======== DONE =========="
echo -e "\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner"

In [None]:
# download and install mfa
# INSTALL_DIR is passed to install_mfa.sh as its first argument; the training
# cell reuses it to locate the conda "aligner" environment.
INSTALL_DIR = "/tmp/mfa" # path to install directory
!bash ./install_mfa.sh {INSTALL_DIR}

In [None]:
!source {INSTALL_DIR}/miniconda3/bin/activate aligner; \
mfa train --clean -t ./temp -o ./infore_mfa.zip ./infore_16k_denoised lexicon.txt ./infore_textgrid

In [None]:
# copy to train directory
# Put the corpus .wav files and the .TextGrid files from ./infore_textgrid
# side by side in ./train_data (created if it does not exist).
!mkdir -p train_data
!cp ./infore_16k_denoised/*.wav ./train_data
!cp ./infore_textgrid/*.TextGrid ./train_data