vosk-models / _script /print-vocabulary.sh
Michael Hansen
Add vocab
87f72f6
raw
history blame
1.53 kB
#!/usr/bin/env bash
set -e

# Print out the vocabulary of every zipped model (*.zip) found under a
# directory. For each archive a <model name>.txt file is written to VOCAB_DIR;
# archives whose vocabulary file already exists and is non-empty are skipped.
# The vocabulary is taken from the model's Gr.fst or, when the archive has no
# Gr.fst, from its words.txt.
#
# Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>
#
# Assumes unzip and fstprint are in PATH and ngramfst.so is in LD_LIBRARY_PATH.

# Both positional arguments are required. The usage text is a diagnostic, so it
# goes to stderr rather than polluting stdout.
if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then
  echo 'Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>' >&2
  exit 1
fi
# Positional arguments: the directory searched for model archives and the
# directory that receives the vocabulary files (created if missing).
model_dir="$1"
vocab_dir="$2"
mkdir -p "${vocab_dir}"

# Scratch directory used while the script runs; removed again on exit.
temp_dir="$(mktemp -d)"

# Delete the scratch directory and everything inside it.
finish() {
  rm -rf "${temp_dir}"
}

# Run the cleanup on every exit path of the script.
trap finish EXIT
#######################################
# Write the sorted, de-duplicated vocabulary found in a directory to a file.
#
# words.txt is used when present (first space-separated field of each line);
# otherwise Gr.fst is dumped with fstprint and the third tab-separated field
# of each line is taken.
#
# The result is written to "<VOCAB_FILE>.partial" and renamed only when the
# whole pipeline succeeded, so a failed or interrupted run never leaves a
# truncated vocabulary file behind.
#
# Arguments: $1 - directory holding the extracted words.txt or Gr.fst
#            $2 - path of the vocabulary file to create
# Returns:   0 on success, non-zero if neither source file exists or the
#            extraction pipeline failed
#######################################
write_vocabulary() {
  local source_dir="$1"
  local vocab_file="$2"
  local partial_file="${vocab_file}.partial"
  local status=0

  # pipefail is enabled only inside the subshells so a failing first stage
  # (e.g. fstprint) is detected without changing options for the caller.
  if [ -f "${source_dir}/words.txt" ]; then
    (
      set -o pipefail
      cut -d' ' -f1 < "${source_dir}/words.txt" | sort -u
    ) > "${partial_file}" || status=$?
  elif [ -f "${source_dir}/Gr.fst" ]; then
    (
      set -o pipefail
      fstprint "${source_dir}/Gr.fst" | cut -f3 | sort -u
    ) > "${partial_file}" || status=$?
  else
    return 1
  fi

  if [ "${status}" -ne 0 ]; then
    rm -f "${partial_file}"
    return 1
  fi

  mv -f "${partial_file}" "${vocab_file}"
}

# Process every *.zip below the model directory. Paths are passed
# NUL-delimited so unusual file names survive intact.
find "${model_dir}" -name '*.zip' -type f -print0 |
  while IFS= read -r -d '' zip_file; do
    model_name="$(basename "${zip_file}" .zip)"
    vocab_file="${vocab_dir}/${model_name}.txt"

    # A non-empty vocabulary file means this model was already handled.
    if [ -s "${vocab_file}" ]; then
      echo "Skipping ${model_name} (${vocab_file})"
      continue
    fi

    # Per-model scratch directory; deliberately NOT named model_dir so the
    # script's input argument is never shadowed.
    extract_dir="${temp_dir}/${model_name}"
    mkdir -p "${extract_dir}"

    # Try the known locations inside the archive in order of preference and
    # stop at the first member that extracts. stdin is redirected so unzip can
    # never consume the file list this loop is reading.
    for member in 'graph/Gr.fst' 'Gr.fst' 'words.txt' 'graph/words.txt'; do
      if unzip -j "${zip_file}" "${model_name}/${member}" \
        -d "${extract_dir}" < /dev/null; then
        break
      fi
    done

    # Best effort: report the failure and carry on with the next model.
    if ! write_vocabulary "${extract_dir}" "${vocab_file}"; then
      echo "ERROR: can't get vocabulary for ${model_name}" >&2
    fi
  done