npark committed on
Commit
6145d03
·
1 Parent(s): 7b172a9

방언별 모델 추가

Browse files
cc/asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad12c7d1a8105c039078fd854cce0a12fb07efea8044d810c5cbcd859a1b65bc
3
+ size 183455481
cc/hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.0
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+ # transformer: !ref <Transformer>
101
+
102
+ asr_model: !new:torch.nn.ModuleList
103
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
104
+
105
+ log_softmax: !new:torch.nn.LogSoftmax
106
+ dim: -1
107
+
108
+
109
+ compute_features: !new:speechbrain.lobes.features.Fbank
110
+ sample_rate: !ref <sample_rate>
111
+ n_fft: !ref <n_fft>
112
+ n_mels: !ref <n_mels>
113
+
114
+ # modules:
115
+ # encoder: !ref <encoder>
116
+ # decoder: !ref <decoder>
117
+
118
+ modules:
119
+ compute_features: !ref <compute_features>
120
+ normalizer: !ref <normalizer>
121
+ pre_transformer: !ref <CNN>
122
+ transformer: !ref <Transformer>
123
+ asr_model: !ref <asr_model>
124
+ # lm_model: !ref <lm_model>
125
+ encoder: !ref <encoder>
126
+ decoder: !ref <decoder>
127
+
128
+ # pretrainer
129
+
130
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
131
+ loadables:
132
+ normalizer: !ref <normalizer>
133
+ asr: !ref <asr_model>
134
+ # lm: !ref <lm_model>
135
+ tokenizer: !ref <tokenizer>
cc/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83161ae74826513aec3e2a0866425398ad3740754d392e7514bf04b4153924aa
3
+ size 1779
cc/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b5d8028156e43584fa2d5772840d4b31a3433f38a5a876797420cf4ae463cc6
3
+ size 316607
gs/asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:52bc459fda286be791587fb6afad5a687b0ab1dc285d89d6d640cc22a235aba5
3
+ size 183455481
gs/hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.0
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+ # transformer: !ref <Transformer>
101
+
102
+ asr_model: !new:torch.nn.ModuleList
103
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
104
+
105
+ log_softmax: !new:torch.nn.LogSoftmax
106
+ dim: -1
107
+
108
+
109
+ compute_features: !new:speechbrain.lobes.features.Fbank
110
+ sample_rate: !ref <sample_rate>
111
+ n_fft: !ref <n_fft>
112
+ n_mels: !ref <n_mels>
113
+
114
+ # modules:
115
+ # encoder: !ref <encoder>
116
+ # decoder: !ref <decoder>
117
+
118
+ modules:
119
+ compute_features: !ref <compute_features>
120
+ normalizer: !ref <normalizer>
121
+ pre_transformer: !ref <CNN>
122
+ transformer: !ref <Transformer>
123
+ asr_model: !ref <asr_model>
124
+ # lm_model: !ref <lm_model>
125
+ encoder: !ref <encoder>
126
+ decoder: !ref <decoder>
127
+
128
+ # pretrainer
129
+
130
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
131
+ loadables:
132
+ normalizer: !ref <normalizer>
133
+ asr: !ref <asr_model>
134
+ # lm: !ref <lm_model>
135
+ tokenizer: !ref <tokenizer>
gs/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4e0c384525f909cf5d26d19a4dde5bf7739b6748039e6ce70b789cf5af506a4
3
+ size 1779
gs/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d99de04f89fc1b1f33747b12f5b4f308367af4e20be798d7b894022d25d82005
3
+ size 315086
gw/asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d3be7c0429c45a16fa1cc180d1d554fc64e9a13e34c7b3a7d7d0f49638212ac
3
+ size 183455481
gw/hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.0
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+ # transformer: !ref <Transformer>
101
+
102
+ asr_model: !new:torch.nn.ModuleList
103
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
104
+
105
+ log_softmax: !new:torch.nn.LogSoftmax
106
+ dim: -1
107
+
108
+
109
+ compute_features: !new:speechbrain.lobes.features.Fbank
110
+ sample_rate: !ref <sample_rate>
111
+ n_fft: !ref <n_fft>
112
+ n_mels: !ref <n_mels>
113
+
114
+ # modules:
115
+ # encoder: !ref <encoder>
116
+ # decoder: !ref <decoder>
117
+
118
+ modules:
119
+ compute_features: !ref <compute_features>
120
+ normalizer: !ref <normalizer>
121
+ pre_transformer: !ref <CNN>
122
+ transformer: !ref <Transformer>
123
+ asr_model: !ref <asr_model>
124
+ # lm_model: !ref <lm_model>
125
+ encoder: !ref <encoder>
126
+ decoder: !ref <decoder>
127
+
128
+ # pretrainer
129
+
130
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
131
+ loadables:
132
+ normalizer: !ref <normalizer>
133
+ asr: !ref <asr_model>
134
+ # lm: !ref <lm_model>
135
+ tokenizer: !ref <tokenizer>
gw/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1bdff08e4ef1aa7c0d6bf8e5a59a99f01237571181f38915be2b08825e25771
3
+ size 1779
gw/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ea5069b52d831b3a1da37cf56261f0b26c70da23d9d1496c93052c41547c8e6
3
+ size 317334
jj/asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f89d6f6802d0c3afb1decc4f8804a6a7f5e522e53bfa3566911bf0c7a46cf1a
3
+ size 183455481
jj/hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.0
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+ # transformer: !ref <Transformer>
101
+
102
+ asr_model: !new:torch.nn.ModuleList
103
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
104
+
105
+ log_softmax: !new:torch.nn.LogSoftmax
106
+ dim: -1
107
+
108
+
109
+ compute_features: !new:speechbrain.lobes.features.Fbank
110
+ sample_rate: !ref <sample_rate>
111
+ n_fft: !ref <n_fft>
112
+ n_mels: !ref <n_mels>
113
+
114
+ # modules:
115
+ # encoder: !ref <encoder>
116
+ # decoder: !ref <decoder>
117
+
118
+ modules:
119
+ compute_features: !ref <compute_features>
120
+ normalizer: !ref <normalizer>
121
+ pre_transformer: !ref <CNN>
122
+ transformer: !ref <Transformer>
123
+ asr_model: !ref <asr_model>
124
+ # lm_model: !ref <lm_model>
125
+ encoder: !ref <encoder>
126
+ decoder: !ref <decoder>
127
+
128
+ # pretrainer
129
+
130
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
131
+ loadables:
132
+ normalizer: !ref <normalizer>
133
+ asr: !ref <asr_model>
134
+ # lm: !ref <lm_model>
135
+ tokenizer: !ref <tokenizer>
jj/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1174d2c9b2914bb523e380af6f94f71e00c00c41999c0c47f43b7e13ad7ccdd2
3
+ size 1779
jj/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e28448ce28252d0872cd28d1dd595f42f83cbd15e9e135da72be535f9e75402b
3
+ size 318076
jl/asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2527f6585d45100a75c480329c53ff91b764e268f1c0874fa754bdc1a9e044a8
3
+ size 183455481
jl/hyperparams.yaml ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.0
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+ # transformer: !ref <Transformer>
101
+
102
+ asr_model: !new:torch.nn.ModuleList
103
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
104
+
105
+ log_softmax: !new:torch.nn.LogSoftmax
106
+ dim: -1
107
+
108
+
109
+ compute_features: !new:speechbrain.lobes.features.Fbank
110
+ sample_rate: !ref <sample_rate>
111
+ n_fft: !ref <n_fft>
112
+ n_mels: !ref <n_mels>
113
+
114
+ # modules:
115
+ # encoder: !ref <encoder>
116
+ # decoder: !ref <decoder>
117
+
118
+ modules:
119
+ compute_features: !ref <compute_features>
120
+ normalizer: !ref <normalizer>
121
+ pre_transformer: !ref <CNN>
122
+ transformer: !ref <Transformer>
123
+ asr_model: !ref <asr_model>
124
+ # lm_model: !ref <lm_model>
125
+ encoder: !ref <encoder>
126
+ decoder: !ref <decoder>
127
+
128
+ # pretrainer
129
+
130
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
131
+ loadables:
132
+ normalizer: !ref <normalizer>
133
+ asr: !ref <asr_model>
134
+ # lm: !ref <lm_model>
135
+ tokenizer: !ref <tokenizer>
jl/normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74fc2b66807e754a62359ffdc9b20bf501f7f30f2e74ce9e4690ac36c7b70bf0
3
+ size 1779
jl/tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c059eab392ecbe2cd0c688b14ab757e56ebfa5eb95b017f1e7f32353cdd7cfd
3
+ size 315063