Spaces:
Sleeping
Sleeping
# This module is from [WeNet](https://github.com/wenet-e2e/wenet).

# ## Citations

# ```bibtex
# @inproceedings{yao2021wenet,
#   title={WeNet: Production oriented Streaming and Non-streaming End-to-End Speech Recognition Toolkit},
#   author={Yao, Zhuoyuan and Wu, Di and Wang, Xiong and Zhang, Binbin and Yu, Fan and Yang, Chao and Peng, Zhendong and Chen, Xiaoyu and Xie, Lei and Lei, Xin},
#   booktitle={Proc. Interspeech},
#   year={2021},
#   address={Brno, Czech Republic},
#   organization={IEEE}
# }
# @article{zhang2022wenet,
#   title={WeNet 2.0: More Productive End-to-End Speech Recognition Toolkit},
#   author={Zhang, Binbin and Wu, Di and Peng, Zhendong and Song, Xingchen and Yao, Zhuoyuan and Lv, Hang and Xie, Lei and Yang, Chao and Pan, Fuping and Niu, Jianwei},
#   journal={arXiv preprint arXiv:2203.15455},
#   year={2022}
# }
# ```
#
from typing import Dict, List, Optional, Tuple, Union

import torch

from modules.wenet_extractor.cif.predictor import MAELoss
from modules.wenet_extractor.paraformer.search.beam_search import Hypothesis
from modules.wenet_extractor.transformer.asr_model import ASRModel
from modules.wenet_extractor.transformer.ctc import CTC
from modules.wenet_extractor.transformer.decoder import TransformerDecoder
from modules.wenet_extractor.transformer.encoder import TransformerEncoder
from modules.wenet_extractor.utils.common import IGNORE_ID, add_sos_eos, th_accuracy
from modules.wenet_extractor.utils.mask import make_pad_mask
class Paraformer(ASRModel):
    """Paraformer: Fast and Accurate Parallel Transformer for
    Non-autoregressive End-to-End Speech Recognition
    see https://arxiv.org/pdf/2206.08317.pdf

    Extends ``ASRModel`` with a ``predictor`` module whose outputs (acoustic
    embeddings and a predicted token count) are fed to the decoder, so that
    all output tokens are scored in a single decoder pass.

    Attributes such as ``sos``, ``eos``, ``vocab_size``, ``ignore_id``,
    ``ctc_weight``, ``encoder``, ``decoder``, ``ctc``, ``criterion_att`` and
    ``_forward_encoder`` are read from ``self`` but not set here; they are
    presumably provided by ``ASRModel.__init__`` -- confirm against the base
    class.
    """

    def __init__(
        self,
        vocab_size: int,
        encoder: TransformerEncoder,
        decoder: TransformerDecoder,
        ctc: CTC,
        predictor,
        ctc_weight: float = 0.5,
        predictor_weight: float = 1.0,
        predictor_bias: int = 0,
        ignore_id: int = IGNORE_ID,
        reverse_weight: float = 0.0,
        lsm_weight: float = 0.0,
        length_normalized_loss: bool = False,
    ):
        """Build the model.

        Args:
            vocab_size: forwarded to ``ASRModel``.
            encoder: forwarded to ``ASRModel``.
            decoder: forwarded to ``ASRModel``.
            ctc: forwarded to ``ASRModel``.
            predictor: callable module invoked as
                ``predictor(encoder_out, ys_pad_or_None, mask, ignore_id=...)``
                and expected to return four values (see ``calc_predictor``).
            ctc_weight: weight of the CTC loss, must be in ``[0, 1]``.
            predictor_weight: weight of the predictor loss, must be in
                ``[0, 1]``.
            predictor_bias: when equal to 1, targets are extended through
                ``add_sos_eos`` and the target lengths are incremented by 1
                during training.
            ignore_id: padding label id, forwarded to ``ASRModel``.
            reverse_weight: forwarded to ``ASRModel``.
            lsm_weight: forwarded to ``ASRModel``.
            length_normalized_loss: forwarded to ``ASRModel`` and also used
                as ``normalize_length`` for the predictor loss.
        """
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
        assert 0.0 <= predictor_weight <= 1.0, predictor_weight

        super().__init__(
            vocab_size,
            encoder,
            decoder,
            ctc,
            ctc_weight,
            ignore_id,
            reverse_weight,
            lsm_weight,
            length_normalized_loss,
        )
        self.predictor = predictor
        self.predictor_weight = predictor_weight
        self.predictor_bias = predictor_bias
        # Loss between the target token count and the predictor's estimate.
        self.criterion_pre = MAELoss(normalize_length=length_normalized_loss)

    def forward(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
    ) -> Dict[str, Optional[torch.Tensor]]:
        """Frontend + Encoder + Decoder + Calc loss

        Args:
            speech: (Batch, Length, ...)
            speech_lengths: (Batch, )
            text: (Batch, Length)
            text_lengths: (Batch,)

        Returns:
            Dict with keys ``"loss"``, ``"loss_att"``, ``"loss_ctc"`` and
            ``"loss_pre"``. ``"loss_ctc"`` is ``None`` when
            ``ctc_weight == 0``; ``"loss_att"`` and ``"loss_pre"`` are the
            constant ``torch.tensor(0)`` when ``ctc_weight == 1``.
        """
        assert text_lengths.dim() == 1, text_lengths.shape
        # Check that batch_size is unified
        assert (
            speech.shape[0]
            == speech_lengths.shape[0]
            == text.shape[0]
            == text_lengths.shape[0]
        ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)

        # 1. Encoder
        encoder_out, encoder_mask = self.encoder(speech, speech_lengths)
        encoder_out_lens = encoder_mask.squeeze(1).sum(1)

        # Decide the active branches from the configured weight rather than
        # from the *value* of a loss tensor: comparing ``loss_att`` against a
        # zero tensor would also discard a genuine attention loss that
        # happens to be exactly 0 and forces a tensor->bool conversion.
        use_att = self.ctc_weight != 1.0
        use_ctc = self.ctc_weight != 0.0

        # 2a. Attention-decoder branch (also yields the predictor loss)
        if use_att:
            loss_att, _, loss_pre = self._calc_att_loss(
                encoder_out, encoder_mask, text, text_lengths
            )
        else:
            # Placeholders keep the returned dict tensor-valued.
            loss_att = torch.tensor(0)
            loss_pre = torch.tensor(0)

        # 2b. CTC branch
        if use_ctc:
            loss_ctc = self.ctc(encoder_out, encoder_out_lens, text, text_lengths)
        else:
            loss_ctc = None

        # 3. Combine
        if loss_ctc is None:
            loss = loss_att + self.predictor_weight * loss_pre
        elif not use_att:
            loss = loss_ctc
        else:
            loss = (
                self.ctc_weight * loss_ctc
                + (1 - self.ctc_weight) * loss_att
                + self.predictor_weight * loss_pre
            )
        return {
            "loss": loss,
            "loss_att": loss_att,
            "loss_ctc": loss_ctc,
            "loss_pre": loss_pre,
        }

    def _calc_att_loss(
        self,
        encoder_out: torch.Tensor,
        encoder_mask: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
    ) -> Tuple[torch.Tensor, float, torch.Tensor]:
        """Compute the decoder loss, decoder accuracy and predictor loss.

        Args:
            encoder_out: encoder output as returned by ``self.encoder``.
            encoder_mask: encoder mask as returned by ``self.encoder``.
            ys_pad: padded target token ids.
            ys_pad_lens: target lengths.

        Returns:
            ``(loss_att, acc_att, loss_pre)``.
        """
        if self.predictor_bias == 1:
            # Use the second output of add_sos_eos as the target and account
            # for the extra token in the lengths.
            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
            ys_pad_lens = ys_pad_lens + self.predictor_bias

        pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(
            encoder_out, ys_pad, encoder_mask, ignore_id=self.ignore_id
        )

        # 1. Forward decoder: predictor embeddings take the place of the
        # decoder's token input.
        decoder_out, _, _ = self.decoder(
            encoder_out, encoder_mask, pre_acoustic_embeds, ys_pad_lens
        )

        # 2. Compute attention loss
        loss_att = self.criterion_att(decoder_out, ys_pad)
        acc_att = th_accuracy(
            decoder_out.view(-1, self.vocab_size),
            ys_pad,
            ignore_label=self.ignore_id,
        )

        # 3. Predictor loss: target token count vs. predicted token count.
        loss_pre: torch.Tensor = self.criterion_pre(
            ys_pad_lens.type_as(pre_token_length), pre_token_length
        )
        return loss_att, acc_att, loss_pre

    def calc_predictor(self, encoder_out, encoder_mask):
        """Run the predictor without targets (inference).

        The second argument is passed to ``make_pad_mask`` and the inverted
        result, with a singleton axis inserted at position 1, is handed to
        the predictor as its mask.

        NOTE(review): ``make_pad_mask`` presumably expects per-utterance
        lengths, while the search methods of this class pass the mask
        returned by ``_forward_encoder`` -- confirm which one is intended.

        Returns:
            ``(pre_acoustic_embeds, pre_token_length, alphas,
            pre_peak_index)`` exactly as returned by the predictor.
        """
        encoder_mask = (
            ~make_pad_mask(encoder_mask, max_len=encoder_out.size(1))[:, None, :]
        ).to(encoder_out.device)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(
            encoder_out, None, encoder_mask, ignore_id=self.ignore_id
        )
        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

    def cal_decoder_with_predictor(
        self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
    ):
        """Run the decoder on predictor embeddings and return log-probs.

        NOTE(review): ``encoder_out_lens`` is passed in the decoder argument
        position where ``_calc_att_loss`` passes ``encoder_mask`` -- confirm
        which one the decoder expects.

        Returns:
            ``(decoder_out, ys_pad_lens)`` where ``decoder_out`` has been
            passed through ``log_softmax`` over the last dimension and
            ``ys_pad_lens`` is returned unchanged.
        """
        decoder_out, _, _ = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens
        )
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out, ys_pad_lens

    def recognize(self):
        """Not supported by this model; always raises NotImplementedError."""
        raise NotImplementedError

    def _greedy_hypothesis(self, am_scores: torch.Tensor) -> Hypothesis:
        """Build the arg-max hypothesis for one utterance's decoder scores."""
        best_tokens = am_scores.argmax(dim=-1)
        score = torch.sum(am_scores.max(dim=-1)[0], dim=-1)
        # Wrap with sos/eos so the result has the same layout as a
        # beam-search hypothesis and can share the post-processing below.
        yseq = torch.tensor(
            [self.sos] + best_tokens.tolist() + [self.eos],
            device=best_tokens.device,
        )
        return Hypothesis(yseq=yseq, score=score)

    @staticmethod
    def _hyp_to_tokens(hyp) -> List[int]:
        """Strip sos/eos from a hypothesis and drop token ids 0 and 1."""
        assert isinstance(hyp, Hypothesis), type(hyp)
        # remove sos/eos and get hyps
        if isinstance(hyp.yseq, list):
            token_int = hyp.yseq[1:-1]
        else:
            token_int = hyp.yseq[1:-1].tolist()
        # remove blank symbol id and unk id, which is assumed to be 0 and 1
        return [tok for tok in token_int if tok != 0 and tok != 1]

    def _paraformer_decode(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        beam_search: Optional[torch.nn.Module],
        decoding_chunk_size: int,
        num_decoding_left_chunks: int,
        simulate_streaming: bool,
    ) -> Union[List[List[int]], Tuple[torch.Tensor, torch.Tensor]]:
        """Shared encoder -> predictor -> decoder -> search pipeline.

        Used by both public search methods; ``beam_search=None`` selects the
        greedy (arg-max) path.
        """
        assert speech.shape[0] == speech_lengths.shape[0]
        assert decoding_chunk_size != 0

        # 1. Encoder
        encoder_out, encoder_mask = self._forward_encoder(
            speech,
            speech_lengths,
            decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming,
        )  # (B, maxlen, encoder_dim)
        encoder_out_lens = encoder_mask.squeeze(1).sum(1)

        # 2. Predictor
        pre_acoustic_embeds, pre_token_length, _, _ = self.calc_predictor(
            encoder_out, encoder_mask
        )
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            # No utterance in the batch is predicted to contain a token.
            # NOTE(review): this early exit returns a pair of empty tensors
            # rather than a list of hypotheses; kept for compatibility with
            # existing callers.
            return torch.tensor([]), torch.tensor([])

        # 3. Decoder forward
        decoder_out, _ = self.cal_decoder_with_predictor(
            encoder_out, encoder_out_lens, pre_acoustic_embeds, pre_token_length
        )

        # 4. Per-utterance search
        hyps: List[List[int]] = []
        for i in range(decoder_out.size(0)):
            # Only the first predicted-length positions are valid.
            am_scores = decoder_out[i, : pre_token_length[i], :]
            if beam_search is not None:
                x = encoder_out[i, : encoder_out_lens[i], :]
                # Keep only the first returned hypothesis.
                nbest_hyps = beam_search(x=x, am_scores=am_scores)[:1]
            else:
                nbest_hyps = [self._greedy_hypothesis(am_scores)]
            hyps.extend(self._hyp_to_tokens(hyp) for hyp in nbest_hyps)
        return hyps

    def paraformer_greedy_search(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        decoding_chunk_size: int = -1,
        num_decoding_left_chunks: int = -1,
        simulate_streaming: bool = False,
    ) -> Union[List[List[int]], Tuple[torch.Tensor, torch.Tensor]]:
        """Apply greedy (arg-max) search on the decoder output

        Args:
            speech (torch.Tensor): (batch, max_len, feat_dim)
            speech_lengths (torch.Tensor): (batch, )
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): forwarded unchanged to
                ``_forward_encoder``.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion

        Returns:
            One list of token ids per utterance, with sos/eos removed and
            ids 0 and 1 filtered out. If every predicted token count rounds
            below 1, a pair of empty tensors is returned instead.
        """
        return self._paraformer_decode(
            speech,
            speech_lengths,
            None,
            decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming,
        )

    def paraformer_beam_search(
        self,
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        beam_search: Optional[torch.nn.Module] = None,
        decoding_chunk_size: int = -1,
        num_decoding_left_chunks: int = -1,
        simulate_streaming: bool = False,
    ) -> Union[List[List[int]], Tuple[torch.Tensor, torch.Tensor]]:
        """Apply beam search on the decoder output

        Args:
            speech (torch.Tensor): (batch, max_len, feat_dim)
            speech_lengths (torch.Tensor): (batch, )
            beam_search (torch.nn.Module): beam search module, called as
                ``beam_search(x=..., am_scores=...)``; when ``None`` the
                method falls back to greedy (arg-max) search.
            decoding_chunk_size (int): decoding chunk for dynamic chunk
                trained model.
                <0: for decoding, use full chunk.
                >0: for decoding, use fixed chunk size as set.
                0: used for training, it's prohibited here
            num_decoding_left_chunks (int): forwarded unchanged to
                ``_forward_encoder``.
            simulate_streaming (bool): whether do encoder forward in a
                streaming fashion

        Returns:
            One list of token ids per utterance (first hypothesis only),
            with sos/eos removed and ids 0 and 1 filtered out. If every
            predicted token count rounds below 1, a pair of empty tensors
            is returned instead.
        """
        return self._paraformer_decode(
            speech,
            speech_lengths,
            beam_search,
            decoding_chunk_size,
            num_decoding_left_chunks,
            simulate_streaming,
        )