|
{
|
|
"_name_or_path": "encodec_24khz",
|
|
"architectures": [
|
|
"MimiModel"
|
|
],
|
|
"attention_bias": false,
|
|
"attention_dropout": 0.0,
|
|
"audio_channels": 1,
|
|
"codebook_dim": 256,
|
|
"codebook_size": 2048,
|
|
"compress": 2,
|
|
"dilation_growth_rate": 2,
|
|
"frame_rate": 12.5,
|
|
"head_dim": 64,
|
|
"hidden_act": "gelu",
|
|
"hidden_size": 512,
|
|
"initializer_range": 0.02,
|
|
"intermediate_size": 2048,
|
|
"kernel_size": 7,
|
|
"last_kernel_size": 3,
|
|
"layer_scale_initial_scale": 0.01,
|
|
"max_position_embeddings": 8000,
|
|
"model_type": "mimi",
|
|
"norm_eps": 1e-05,
|
|
"normalize": false,
|
|
"num_attention_heads": 8,
|
|
"num_filters": 64,
|
|
"num_hidden_layers": 8,
|
|
"num_key_value_heads": 8,
|
|
"num_quantizers": 32,
|
|
"num_residual_layers": 1,
|
|
"num_semantic_quantizers": 1,
|
|
"pad_mode": "constant",
|
|
"residual_kernel_size": 3,
|
|
"rope_theta": 10000.0,
|
|
"sampling_rate": 24000,
|
|
"sliding_window": 250,
|
|
"torch_dtype": "float16",
|
|
"transformers_version": "4.48.1",
|
|
"trim_right_ratio": 1.0,
|
|
"upsample_groups": 512,
|
|
"upsampling_ratios": [
|
|
8,
|
|
6,
|
|
5,
|
|
4
|
|
],
|
|
"use_cache": false,
|
|
"use_causal_conv": true,
|
|
"use_conv_shortcut": false,
|
|
"vector_quantization_hidden_dimension": 256
|
|
}
|
|
|