#!/usr/bin/env bash
#
# print-vocabulary: write one vocabulary file per zipped model.
#
# Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>
#
#   MODEL_DIR  directory searched recursively for *.zip model archives
#   VOCAB_DIR  directory that receives one <model_name>.txt per archive;
#              created if it does not exist
set -e

# Both positional arguments are required. An empty second argument is treated
# the same as a missing one.
if [ -z "${2:-}" ]; then
  echo 'Usage: print-vocabulary <MODEL_DIR> <VOCAB_DIR>' >&2
  exit 1
fi

model_dir="$1"
vocab_dir="$2"

mkdir -p -- "${vocab_dir}"

# Scratch space for files extracted from the archives.
temp_dir="$(mktemp -d)"

#######################################
# Remove the scratch directory used for archive extraction.
# Globals:   temp_dir (read)
# Returns:   0 when temp_dir is unset or empty, otherwise the status of rm
#######################################
finish() {
  # Skip the removal entirely when no scratch directory was recorded, so rm is
  # never handed an empty path.
  if [ -n "${temp_dir:-}" ]; then
    rm -rf -- "${temp_dir}"
  fi
}

# Run the cleanup whenever the script exits.
trap finish EXIT

#######################################
# Pull the vocabulary source out of a model archive.
# Tries, in order, <model>/graph/Gr.fst, <model>/Gr.fst, <model>/words.txt and
# <model>/graph/words.txt, stopping at the first member unzip extracts.
# Archive paths are dropped (unzip -j), so the file lands directly in the
# destination directory.
# Arguments: $1 - zip archive, $2 - model name, $3 - destination directory
# Returns:   always 0; the caller detects a missing file afterwards
#######################################
extract_model_files() {
  local zip_file="$1" model_name="$2" dest_dir="$3"
  local member
  for member in graph/Gr.fst Gr.fst words.txt graph/words.txt; do
    if unzip -j "${zip_file}" "${model_name}/${member}" -d "${dest_dir}"; then
      break
    fi
  done
  return 0
}

#######################################
# Build the sorted, de-duplicated vocabulary file from extracted model files.
# Uses the first space-separated field of words.txt when that file exists,
# otherwise the third tab-separated field of `fstprint` output for Gr.fst.
# Arguments: $1 - directory holding the extracted files
#            $2 - vocabulary file to write
#            $3 - model name (used in the error message only)
# Outputs:   an error message on stderr when neither source file exists
# Returns:   status of the last command run
#######################################
write_vocabulary() {
  local src_dir="$1" vocab_file="$2" model_name="$3"
  if [ -f "${src_dir}/words.txt" ]; then
    cut -d' ' -f1 < "${src_dir}/words.txt" | sort | uniq > "${vocab_file}"
  elif [ -f "${src_dir}/Gr.fst" ]; then
    fstprint "${src_dir}/Gr.fst" | cut -f3 | sort | uniq > "${vocab_file}"
  else
    echo "ERROR: can't get vocabulary for ${model_name}" >&2
  fi
}

# Walk every *.zip under the model directory. Paths are passed NUL-delimited
# and read with an empty IFS so unusual file names arrive intact.
find "${model_dir}" -name '*.zip' -type f -print0 |
  while IFS= read -r -d '' zip_file; do
    model_name="$(basename "${zip_file}" .zip)"
    vocab_file="${vocab_dir}/${model_name}.txt"

    # A non-empty vocabulary file means this model was already processed.
    if [ -s "${vocab_file}" ]; then
      echo "Skipping ${model_name} (${vocab_file})"
      continue
    fi

    # Per-model scratch directory; kept separate from the input model_dir.
    extract_dir="${temp_dir}/${model_name}"
    mkdir -p "${extract_dir}"

    extract_model_files "${zip_file}" "${model_name}" "${extract_dir}"
    write_vocabulary "${extract_dir}" "${vocab_file}" "${model_name}"
  done