rristo
/

icefall_conformer_ctc3_et

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "b6b6ded1-0a58-43cb-9065-4f4fae02a01b",
    "metadata": {},
    "outputs": [],
@@ -50,18 +50,51 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "3d69d771-b421-417f-a6ff-e1d1c64ba934",
    "metadata": {},
    "outputs": [],
    "source": [
     "class Args:\n",
-    "    model_filename='conformer_ctc3/exp/jit_trace.pt'\n",
-    "    bpe_model_filename=\"data/lang_bpe_500/bpe.model\"\n",
-    "    method=\"ctc-decoding\"\n",
     "    sample_rate=16000\n",
-    "    num_classes=500 #bpe model size\n",
-    "    frame_shift_ms=10\n",
     "    dither=0\n",
     "    snip_edges=False\n",
     "    num_bins=80\n",
@@ -88,7 +121,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "48306369-fb68-4abe-be62-0806d00059f8",
    "metadata": {},
    "outputs": [],
@@ -163,7 +196,9 @@
     "    \n",
     "    def decode_(self, wave, fbank, model, device, method, bpe_model_filename, num_classes, \n",
     "          min_active_states, max_active_states, subsampling_factor, use_double_scores, \n",
-    "          frame_shift_ms, search_beam, output_beam):\n",
     " \n",
     "        wave = [wave.to(device)]\n",
     "        logging.info(\"Decoding started\")\n",
@@ -223,15 +258,127 @@
     "            logging.info(timestamps)\n",
     "            token_ids = get_texts(best_path)\n",
     "            return self.format_trs(hyps[0], timestamps[0])\n",
     "    \n",
-    "    def transcribe_file(self, audio_filename):\n",
     "        wave=self.read_sound_file_(audio_filename, expected_sample_rate=self.args.sample_rate)\n",
     "        \n",
-    "        trs=self.decode_(wave, self.fbank, self.model, self.args.device, self.args.method, \n",
     "                         self.args.bpe_model_filename, self.args.num_classes,\n",
     "                         self.args.min_active_states, self.args.max_active_states, \n",
     "                         self.args.subsampling_factor, self.args.use_double_scores, \n",
-    "                         self.args.frame_shift_ms, self.args.search_beam, self.args.output_beam)\n",
     "        return trs"
    ]
   },
@@ -245,23 +392,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "50ab7c8e-39b6-4783-8342-e79e91d2417e",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "fatal: not a git repository (or any parent up to mount point /opt)\n",
-      "Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\n",
-      "fatal: not a git repository (or any parent up to mount point /opt)\n",
-      "Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\n",
-      "fatal: not a git repository (or any parent up to mount point /opt)\n",
-      "Stopping at filesystem boundary (GIT_DISCOVERY_ACROSS_FILESYSTEM not set).\n"
-     ]
-    }
-   ],
    "source": [
     "#create transcriber/decoder object\n",
     "#if you want to change parameters (for example model filename) you could create a dict (see class Args attribute names)\n",
@@ -272,7 +406,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "8020f371-7584-4f6c-990b-f2c023e24060",
    "metadata": {},
    "outputs": [
@@ -280,8 +414,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 4.86 s, sys: 435 ms, total: 5.29 s\n",
-      "Wall time: 4.45 s\n"
      ]
     },
     {
@@ -303,7 +437,7 @@
        "  {'word': 'panna', 'start': 10.16, 'end': 10.4}]}"
       ]
      },
-     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -315,7 +449,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "id": "4d2a480d-f0aa-4474-bfdb-ad298a629ce5",
    "metadata": {},
    "outputs": [
@@ -323,8 +457,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 16.2 s, sys: 1.8 s, total: 18 s\n",
-      "Wall time: 15.1 s\n"
      ]
     }
    ],
@@ -334,7 +468,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "id": "d3827548-bca0-4409-95bc-9aa8ba377135",
    "metadata": {},
    "outputs": [
@@ -458,7 +592,7 @@
        "  {'word': 'jah', 'start': 47.56, 'end': 47.68}]}"
       ]
      },
-     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -467,10 +601,109 @@
     "trs"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ea3b25b7-a1f9-4b21-911d-35159c5f3009",
    "metadata": {},
    "outputs": [],
    "source": []

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "b6b6ded1-0a58-43cb-9065-4f4fae02a01b",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "3d69d771-b421-417f-a6ff-e1d1c64ba934",
    "metadata": {},
    "outputs": [],
    "source": [
     "class Args:\n",
+    "    model_filename='conformer_ctc3/exp/jit_trace.pt' #Path to the torchscript model.\n",
+    "    bpe_model_filename='data/lang_bpe_500/bpe.model' #Path to bpe.model.\n",
+    "        #Used only when method is ctc-decoding.\n",
+    "    method=\"ctc-decoding\" #decoding method\n",
+    "        # ctc-decoding - Use CTC decoding. It uses a sentence\n",
+    "        #         piece model, i.e., lang_dir/bpe.model, to convert\n",
+    "        #         word pieces to words. It needs neither a lexicon\n",
+    "        #         nor an n-gram LM.\n",
+    "        #     (1) 1best - Use the best path as decoding output. Only\n",
+    "        #         the transformer encoder output is used for decoding.\n",
+    "        #         We call it HLG decoding.\n",
+    "        #     (2) nbest-rescoring. Extract n paths from the decoding lattice,\n",
+    "        #         rescore them with an LM, the path with\n",
+    "        #         the highest score is the decoding result.\n",
+    "        #         We call it HLG decoding + n-gram LM rescoring.\n",
+    "        #     (3) whole-lattice-rescoring - Use an LM to rescore the\n",
+    "        #         decoding lattice and then use 1best to decode the\n",
+    "        #         rescored lattice.\n",
+    "        #         We call it HLG decoding + n-gram LM rescoring.\n",
+    "    HLG='data/lang_bpe_500/HLG.pt' #Path to HLG.pt.\n",
+    "        #Used only when method is not ctc-decoding.\n",
+    "    G='data/lm/G_4_gram.pt' #Used only when method is\n",
+    "        #whole-lattice-rescoring or nbest-rescoring.\n",
+    "        #It's usually a 4-gram LM.\n",
+    "    words_file='data/lang_phone/words.txt' #Path to words.txt.\n",
+    "        #Used only when method is not ctc-decoding.\n",
+    "    num_paths=100 # Used only when method is nbest-rescoring.\n",
+    "        #It specifies the size of n-best list.\n",
+    "    ngram_lm_scale=0.1 #Used only when method is whole-lattice-rescoring or nbest-rescoring.\n",
+    "                        #It specifies the scale for n-gram LM scores.\n",
+    "                        #(Note: You need to tune it on a dataset.)\n",
+    "    nbest_scale=0.5 #Used only when method is nbest-rescoring.\n",
+    "        # It specifies the scale for lattice.scores when\n",
+    "        # extracting n-best lists. A smaller value results in\n",
+    "        # a larger number of unique paths, with the risk of missing\n",
+    "        # the best path.\n",
     "    sample_rate=16000\n",
+    "    num_classes=500 #Vocab size in the BPE model.\n",
+    "    frame_shift_ms=10 #Frame shift in milliseconds between two contiguous frames.\n",
     "    dither=0\n",
     "    snip_edges=False\n",
     "    num_bins=80\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 25,
    "id": "48306369-fb68-4abe-be62-0806d00059f8",
    "metadata": {},
    "outputs": [],
     "    \n",
     "    def decode_(self, wave, fbank, model, device, method, bpe_model_filename, num_classes, \n",
     "          min_active_states, max_active_states, subsampling_factor, use_double_scores, \n",
+    "          frame_shift_ms, search_beam, output_beam, HLG=None, G=None, words_file=None,\n",
+    "                num_paths=None, ngram_lm_scale=None, nbest_scale=None):\n",
+    "                    \n",
     " \n",
     "        wave = [wave.to(device)]\n",
     "        logging.info(\"Decoding started\")\n",
     "            logging.info(timestamps)\n",
     "            token_ids = get_texts(best_path)\n",
     "            return self.format_trs(hyps[0], timestamps[0])\n",
+    "        \n",
+    "        elif method in [\n",
+    "            \"1best\",\n",
+    "            \"nbest-rescoring\",\n",
+    "            \"whole-lattice-rescoring\",\n",
+    "        ]:\n",
+    "            logging.info(f\"Loading HLG from {HLG}\")\n",
+    "            HLG = k2.Fsa.from_dict(torch.load(HLG, map_location=\"cpu\"))\n",
+    "            HLG = HLG.to(device)\n",
+    "            if not hasattr(HLG, \"lm_scores\"):\n",
+    "                # For whole-lattice-rescoring and attention-decoder\n",
+    "                HLG.lm_scores = HLG.scores.clone()\n",
+    "\n",
+    "            if method in [\n",
+    "                \"nbest-rescoring\",\n",
+    "                \"whole-lattice-rescoring\",\n",
+    "            ]:\n",
+    "                logging.info(f\"Loading G from {G}\")\n",
+    "                G = k2.Fsa.from_dict(torch.load(G, map_location=\"cpu\"))\n",
+    "                G = G.to(device)\n",
+    "                if method == \"whole-lattice-rescoring\":\n",
+    "                    # Add epsilon self-loops to G as we will compose\n",
+    "                    # it with the whole lattice later\n",
+    "                    G = k2.add_epsilon_self_loops(G)\n",
+    "                    G = k2.arc_sort(G)\n",
+    "\n",
+    "                # G.lm_scores is used to replace HLG.lm_scores during\n",
+    "                # LM rescoring.\n",
+    "                G.lm_scores = G.scores.clone()\n",
+    "                if method == \"nbest-rescoring\" or method == \"whole-lattice-rescoring\":\n",
+    "                    #adjusts the symbol table; otherwise decoding returns empty text\n",
+    "                    #https://github.com/k2-fsa/k2/issues/874\n",
+    "                    def is_disambig_symbol(symbol: str, pattern: re.Pattern = re.compile(r'^#\\d+$')) -> bool:\n",
+    "                        return pattern.match(symbol) is not None\n",
+    "\n",
+    "                    def find_first_disambig_symbol(symbols: k2.SymbolTable) -> int:\n",
+    "                        return min(v for k, v in symbols._sym2id.items() if is_disambig_symbol(k))\n",
+    "                    symbol_table = k2.SymbolTable.from_file(words_file)\n",
+    "                    first_word_disambig_id = find_first_disambig_symbol(symbol_table)\n",
+    "                    print(\"disambig id:\", first_word_disambig_id)\n",
+    "                    G.labels[G.labels >= first_word_disambig_id] = 0\n",
+    "                    G.labels_sym = symbol_table\n",
+    "\n",
+    "                #added part: converts G from Fsa to FsaVec; otherwise an error is thrown\n",
+    "                G = k2.create_fsa_vec([G])\n",
+    "                #https://github.com/k2-fsa/k2/blob/master/k2/python/k2/utils.py\n",
+    "                delattr(G, \"aux_labels\")\n",
+    "                G = k2.arc_sort(G)\n",
+    "\n",
+    "\n",
+    "            lattice = get_lattice(\n",
+    "                nnet_output=nnet_output,\n",
+    "                decoding_graph=HLG,\n",
+    "                supervision_segments=supervision_segments,\n",
+    "                search_beam=search_beam,\n",
+    "                output_beam=output_beam,\n",
+    "                min_active_states=min_active_states,\n",
+    "                max_active_states=max_active_states,\n",
+    "                subsampling_factor=subsampling_factor,\n",
+    "            )\n",
+    "\n",
+    "            ############\n",
+    "            # scored_lattice = k2.top_sort(k2.connect(k2.intersect(lattice, G, treat_epsilons_specially=True)))\n",
+    "            # scored_lattice[0].draw(\"after_intersection.svg\", title=\"after_intersection\")\n",
+    "            # scores = scored_lattice.get_forward_scores(True, True)\n",
+    "            # print(scores)\n",
+    "            #########################\n",
+    "            if method == \"1best\":\n",
+    "                logging.info(\"Use HLG decoding\")\n",
+    "                best_path = one_best_decoding(\n",
+    "                    lattice=lattice, use_double_scores=use_double_scores\n",
+    "                )\n",
+    "\n",
+    "                timestamps, hyps = parse_fsa_timestamps_and_texts(\n",
+    "                    best_paths=best_path,\n",
+    "                    word_table=word_table,\n",
+    "                    subsampling_factor=subsampling_factor,\n",
+    "                    frame_shift_ms=frame_shift_ms,\n",
+    "                )\n",
+    "\n",
+    "            if method == \"nbest-rescoring\":\n",
+    "                logging.info(\"Use HLG decoding + LM rescoring\")\n",
+    "                best_path_dict = rescore_with_n_best_list(\n",
+    "                    lattice=lattice,\n",
+    "                    G=G,\n",
+    "                    num_paths=num_paths,\n",
+    "                    lm_scale_list=[ngram_lm_scale],\n",
+    "                    nbest_scale=nbest_scale,\n",
+    "                )\n",
+    "                best_path = next(iter(best_path_dict.values()))\n",
+    "                \n",
+    "            elif method == \"whole-lattice-rescoring\":\n",
+    "                logging.info(\"Use HLG decoding + LM rescoring\")\n",
+    "                best_path_dict = rescore_with_whole_lattice(\n",
+    "                    lattice=lattice,\n",
+    "                    G_with_epsilon_loops=G,\n",
+    "                    lm_scale_list=[ngram_lm_scale],\n",
+    "                )\n",
+    "                best_path = next(iter(best_path_dict.values()))\n",
+    "\n",
+    "            hyps = get_texts(best_path)\n",
+    "            word_sym_table = k2.SymbolTable.from_file(words_file)\n",
+    "            hyps = [[word_sym_table[i] for i in ids] for ids in hyps]\n",
+    "            return hyps\n",
+    "        else:\n",
+    "            raise ValueError(f\"Unsupported decoding method: {method}\")\n",
+    "\n",
     "    \n",
+    "    def transcribe_file(self, audio_filename, method=None):\n",
     "        wave=self.read_sound_file_(audio_filename, expected_sample_rate=self.args.sample_rate)\n",
     "        \n",
+    "        if method is None:\n",
+    "            method=self.args.method\n",
+    "        \n",
+    "        trs=self.decode_(wave, self.fbank, self.model, self.args.device, method, \n",
     "                         self.args.bpe_model_filename, self.args.num_classes,\n",
     "                         self.args.min_active_states, self.args.max_active_states, \n",
     "                         self.args.subsampling_factor, self.args.use_double_scores, \n",
+    "                         self.args.frame_shift_ms, self.args.search_beam, self.args.output_beam,\n",
+    "                        self.args.HLG, self.args.G, self.args.words_file, self.args.num_paths,\n",
+    "                         self.args.ngram_lm_scale, self.args.nbest_scale)\n",
     "        return trs"
    ]
   },
   },
   {
    "cell_type": "code",
+   "execution_count": 26,
    "id": "50ab7c8e-39b6-4783-8342-e79e91d2417e",
    "metadata": {},
+   "outputs": [],
    "source": [
     "#create transcriber/decoder object\n",
     "#if you want to change parameters (for example model filename) you could create a dict (see class Args attribute names)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "id": "8020f371-7584-4f6c-990b-f2c023e24060",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "CPU times: user 6.22 s, sys: 432 ms, total: 6.65 s\n",
+      "Wall time: 5.79 s\n"
      ]
     },
     {
        "  {'word': 'panna', 'start': 10.16, 'end': 10.4}]}"
       ]
      },
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "id": "4d2a480d-f0aa-4474-bfdb-ad298a629ce5",
    "metadata": {},
    "outputs": [
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "CPU times: user 28.4 s, sys: 2.93 s, total: 31.3 s\n",
+      "Wall time: 27.3 s\n"
      ]
     }
    ],
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "id": "d3827548-bca0-4409-95bc-9aa8ba377135",
    "metadata": {},
    "outputs": [
        "  {'word': 'jah', 'start': 47.56, 'end': 47.68}]}"
       ]
      },
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
     "trs"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "6740a04c-09e1-4497-84e2-5227acd9dda3",
+   "metadata": {},
+   "source": [
+    "## Other decoding methods"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b012c0d7-04ab-451e-8414-85b4b9ac9165",
+   "metadata": {},
+   "source": [
+    "1best decoding is currently not working."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "15fcf012-265a-4464-8da7-1c7e1a46556a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "disambig id: 157281\n",
+      "CPU times: user 3min 56s, sys: 7.52 s, total: 4min 3s\n",
+      "Wall time: 2min 22s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[['mina',\n",
+       "  'tahaksin',\n",
+       "  'homme',\n",
+       "  'täna',\n",
+       "  'ja',\n",
+       "  'homme',\n",
+       "  'kui',\n",
+       "  'saan',\n",
+       "  'kontsu',\n",
+       "  'madise',\n",
+       "  'vei',\n",
+       "  'panna']]"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time transcriber.transcribe_file('audio/emt16k.wav', method='nbest-rescoring')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "31591ee0-605c-4b20-b01f-cb8643fefdd1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "disambig id: 157281\n",
+      "CPU times: user 41.2 s, sys: 409 ms, total: 41.6 s\n",
+      "Wall time: 31.3 s\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[['mina',\n",
+       "  'tahaksin',\n",
+       "  'homme',\n",
+       "  'täna',\n",
+       "  'ja',\n",
+       "  'homme',\n",
+       "  'kui',\n",
+       "  'saan',\n",
+       "  'all',\n",
+       "  'kontsu',\n",
+       "  'madise',\n",
+       "  'vei',\n",
+       "  'panna']]"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "%time transcriber.transcribe_file('audio/emt16k.wav', method='whole-lattice-rescoring')"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "80dfe34d-a76b-4ddc-a47c-c481c5e1432f",
    "metadata": {},
    "outputs": [],
    "source": []