npark committed on
Commit
86e0936
·
1 Parent(s): 4a604a2

speechbrain v0.5.13 trained model on KsponSpeech dataset

Browse files
Files changed (6) hide show
  1. KsponSpeech_E00099.wav +0 -0
  2. asr.ckpt +3 -0
  3. hyperparams.yaml +134 -0
  4. lm.ckpt +3 -0
  5. normalizer.ckpt +3 -0
  6. tokenizer.ckpt +3 -0
KsponSpeech_E00099.wav ADDED
Binary file (51 kB). View file
 
asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8cc15da795c52688d706de03a6056bb3750ee8923c01f81d585f64bd1769d85
3
+ size 183455481
hyperparams.yaml ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Feature parameters
2
+ sample_rate: 16000
3
+ n_fft: 400
4
+ n_mels: 80
5
+
6
+ ####################### Model parameters ###########################
7
+ # Transformer
8
+ d_model: 256
9
+ nhead: 4
10
+ num_encoder_layers: 12
11
+ num_decoder_layers: 6
12
+ d_ffn: 2048
13
+ transformer_dropout: 0.1
14
+ activation: !name:torch.nn.GELU
15
+ output_neurons: 5000
16
+ vocab_size: 5000
17
+
18
+ # Outputs
19
+ blank_index: 0
20
+ label_smoothing: 0.1
21
+ pad_index: 0
22
+ bos_index: 1
23
+ eos_index: 2
24
+ unk_index: 0
25
+
26
+ # Decoding parameters
27
+ min_decode_ratio: 0.0
28
+ max_decode_ratio: 1.0
29
+ valid_search_interval: 10 # 10
30
+ valid_beam_size: 10
31
+ test_beam_size: 60
32
+ lm_weight: 0.20
33
+ ctc_weight_decode: 0.40
34
+
35
+ ############################## asr models ################################
36
+ normalizer: !new:speechbrain.processing.features.InputNormalization
37
+ norm_type: global
38
+ #####
39
+
40
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
41
+ input_shape: (8, 10, 80)
42
+ num_blocks: 2
43
+ num_layers_per_block: 1
44
+ out_channels: (64, 32)
45
+ kernel_sizes: (3, 3)
46
+ strides: (2, 2)
47
+ residuals: (False, False)
48
+
49
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
50
+ input_size: 640
51
+ tgt_vocab: !ref <output_neurons>
52
+ d_model: !ref <d_model>
53
+ nhead: !ref <nhead>
54
+ num_encoder_layers: !ref <num_encoder_layers>
55
+ num_decoder_layers: !ref <num_decoder_layers>
56
+ d_ffn: !ref <d_ffn>
57
+ dropout: !ref <transformer_dropout>
58
+ activation: !ref <activation>
59
+ encoder_module: conformer
60
+ attention_type: RelPosMHAXL
61
+ normalize_before: True
62
+ causal: False
63
+
64
+ ### lm_model ###
65
+ ################
66
+
67
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
68
+
69
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
70
+ input_size: !ref <d_model>
71
+ n_neurons: !ref <output_neurons>
72
+
73
+ seq_lin: !new:speechbrain.nnet.linear.Linear
74
+ input_size: !ref <d_model>
75
+ n_neurons: !ref <output_neurons>
76
+
77
+ # decoder
78
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
79
+ modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
80
+ bos_index: !ref <bos_index>
81
+ eos_index: !ref <eos_index>
82
+ blank_index: !ref <blank_index>
83
+ min_decode_ratio: !ref <min_decode_ratio>
84
+ max_decode_ratio: !ref <max_decode_ratio>
85
+ beam_size: !ref <valid_beam_size>
86
+ ctc_weight: !ref <ctc_weight_decode>
87
+ using_eos_threshold: False
88
+ length_normalization: False
89
+
90
+ # encoder
91
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
92
+ transformer: !ref <Transformer>
93
+
94
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
95
+ input_shape: [null, null, !ref <n_mels>]
96
+ compute_features: !ref <compute_features>
97
+ normalize: !ref <normalizer>
98
+ cnn: !ref <CNN>
99
+ transformer_encoder: !ref <Tencoder>
100
+
101
+ asr_model: !new:torch.nn.ModuleList
102
+ - [!ref <normalizer>, !ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
103
+
104
+ log_softmax: !new:torch.nn.LogSoftmax
105
+ dim: -1
106
+
107
+
108
+ compute_features: !new:speechbrain.lobes.features.Fbank
109
+ sample_rate: !ref <sample_rate>
110
+ n_fft: !ref <n_fft>
111
+ n_mels: !ref <n_mels>
112
+
113
+ # modules:
114
+ # encoder: !ref <encoder>
115
+ # decoder: !ref <decoder>
116
+
117
+ modules:
118
+ compute_features: !ref <compute_features>
119
+ normalizer: !ref <normalizer>
120
+ pre_transformer: !ref <CNN>
121
+ transformer: !ref <Transformer>
122
+ asr_model: !ref <asr_model>
123
+ # lm_model: !ref <lm_model>
124
+ encoder: !ref <encoder>
125
+ decoder: !ref <decoder>
126
+
127
+ # pretrainer
128
+
129
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
130
+ loadables:
131
+ normalizer: !ref <normalizer>
132
+ asr: !ref <asr_model>
133
+ # lm: !ref <lm_model>
134
+ tokenizer: !ref <tokenizer>
lm.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b84f36502f001cecba2f0e0e9794dc07e99b1c35394ae7c87c4c6ab14f11df4
3
+ size 381062933
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c19d4d66cfb6e329678af7f8b0555a0233bb6645f0558596d8b966e18dd1a2b
3
+ size 1779
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d419e55734c26df6c5690671be2b887a7db389c1a7f63286111ce737508c6569
3
+ size 313900