Diffusers
Safetensors
AudioLDM2Pipeline
sanchit-gandhi HF staff committed on
Commit
efd334c
1 Parent(s): e238d94

Add model weights and config

Browse files
feature_extractor/preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length_s": 10,
3
+ "feature_extractor_type": "ClapFeatureExtractor",
4
+ "feature_size": 64,
5
+ "fft_window_size": 1024,
6
+ "frequency_max": 14000,
7
+ "frequency_min": 50,
8
+ "hop_length": 480,
9
+ "max_length_s": 10,
10
+ "n_fft": 1024,
11
+ "nb_frequency_bins": 513,
12
+ "nb_max_frames": 1000,
13
+ "nb_max_samples": 480000,
14
+ "padding": "repeatpad",
15
+ "padding_side": "right",
16
+ "padding_value": 0.0,
17
+ "processor_class": "ClapProcessor",
18
+ "return_attention_mask": false,
19
+ "sampling_rate": 48000,
20
+ "top_db": null,
21
+ "truncation": "rand_trunc"
22
+ }
language_model/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2Model"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 50256,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "max_new_tokens": 8,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 1024,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 1024,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "torch_dtype": "float32",
36
+ "transformers_version": "4.32.0.dev0",
37
+ "use_cache": true,
38
+ "vocab_size": 50257
39
+ }
language_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:925012ada53083b40604540406b53570066b6d218380af45dd426fa531b875fb
3
+ size 497772432
language_model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80896089a320949684e5150079143ee3061df687124216292da482e3b79ddc64
3
+ size 497803293
model_index.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2Pipeline",
3
+ "_diffusers_version": "0.20.0.dev0",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "ClapFeatureExtractor"
7
+ ],
8
+ "language_model": [
9
+ "transformers",
10
+ "GPT2Model"
11
+ ],
12
+ "projection_model": [
13
+ "audioldm2",
14
+ "AudioLDM2ProjectionModel"
15
+ ],
16
+ "scheduler": [
17
+ "diffusers",
18
+ "DDIMScheduler"
19
+ ],
20
+ "text_encoder": [
21
+ "transformers",
22
+ "ClapModel"
23
+ ],
24
+ "text_encoder_2": [
25
+ "transformers",
26
+ "T5EncoderModel"
27
+ ],
28
+ "tokenizer": [
29
+ "transformers",
30
+ "RobertaTokenizerFast"
31
+ ],
32
+ "tokenizer_2": [
33
+ "transformers",
34
+ "T5TokenizerFast"
35
+ ],
36
+ "unet": [
37
+ "audioldm2",
38
+ "AudioLDM2UNet2DConditionModel"
39
+ ],
40
+ "vae": [
41
+ "diffusers",
42
+ "AutoencoderKL"
43
+ ],
44
+ "vocoder": [
45
+ "transformers",
46
+ "SpeechT5HifiGan"
47
+ ]
48
+ }
projection_model/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2ProjectionModel",
3
+ "_diffusers_version": "0.20.0.dev0",
4
+ "langauge_model_dim": 768,
5
+ "text_encoder_1_dim": 1024,
6
+ "text_encoder_dim": 512
7
+ }
projection_model/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfb555ca6f1d76278436c48bafea78b5122b9496434694cb8866c096fb1c6ad0
3
+ size 4739951
projection_model/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d4d8b1233e8193c784ac7c99aed9f76b66312a9ddfe8b1bbad68fe03dd71bde
3
+ size 4737688
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.20.0.dev0",
4
+ "beta_end": 0.0195,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.0015,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "dynamic_thresholding_ratio": 0.995,
10
+ "num_train_timesteps": 1000,
11
+ "prediction_type": "epsilon",
12
+ "rescale_betas_zero_snr": false,
13
+ "sample_max_value": 1.0,
14
+ "set_alpha_to_one": false,
15
+ "steps_offset": 1,
16
+ "thresholding": false,
17
+ "timestep_spacing": "leading",
18
+ "trained_betas": null
19
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "ClapModel"
4
+ ],
5
+ "audio_config": {
6
+ "depths": [
7
+ 2,
8
+ 2,
9
+ 12,
10
+ 2
11
+ ],
12
+ "fusion_num_hidden_layers": 2,
13
+ "hidden_size": 1024,
14
+ "model_type": "clap_audio_model",
15
+ "patch_embeds_hidden_size": 128,
16
+ "projection_hidden_size": 768
17
+ },
18
+ "hidden_size": 768,
19
+ "initializer_factor": 1.0,
20
+ "logit_scale_init_value": 14.285714285714285,
21
+ "model_type": "clap",
22
+ "num_hidden_layers": 16,
23
+ "projection_dim": 512,
24
+ "projection_hidden_act": "relu",
25
+ "text_config": {
26
+ "classifier_dropout": null,
27
+ "fusion_hidden_size": 768,
28
+ "fusion_num_hidden_layers": 2,
29
+ "initializer_range": 0.02,
30
+ "model_type": "clap_text_model",
31
+ "projection_hidden_size": 768
32
+ },
33
+ "torch_dtype": "float64",
34
+ "transformers_version": "4.32.0.dev0"
35
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4a47b4a637dd58e9edb7b64a06acf37328b7cc3eafb0b8a85df895cc9e45d09
3
+ size 776327432
text_encoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:637b3ff0f7b212cedafb00739521dc49d8f7953f12bfc1f76ff692f108a41ed0
3
+ size 776444665
text_encoder_2/config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "T5EncoderModel"
4
+ ],
5
+ "classifier_dropout": 0.0,
6
+ "d_ff": 2816,
7
+ "d_kv": 64,
8
+ "d_model": 1024,
9
+ "decoder_start_token_id": 0,
10
+ "dense_act_fn": "gelu_new",
11
+ "dropout_rate": 0.1,
12
+ "eos_token_id": 1,
13
+ "feed_forward_proj": "gated-gelu",
14
+ "initializer_factor": 1.0,
15
+ "is_encoder_decoder": true,
16
+ "is_gated_act": true,
17
+ "layer_norm_epsilon": 1e-06,
18
+ "model_type": "t5",
19
+ "n_positions": 512,
20
+ "num_decoder_layers": 24,
21
+ "num_heads": 16,
22
+ "num_layers": 24,
23
+ "output_past": true,
24
+ "pad_token_id": 0,
25
+ "relative_attention_max_distance": 128,
26
+ "relative_attention_num_buckets": 32,
27
+ "tie_word_embeddings": false,
28
+ "torch_dtype": "float32",
29
+ "transformers_version": "4.32.0.dev0",
30
+ "use_cache": true,
31
+ "vocab_size": 32128
32
+ }
text_encoder_2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c1d0c8f1c739db9343c12ea4b0e3f2c97a833b3c072c251e91d97b7326fefb4e
3
+ size 1364951064
text_encoder_2/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c4be8e23954ef72bd0d623206a46b7e1ab7fa23f530b7b9f691d40785273b27
3
+ size 1364996921
tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
+ "clean_up_tokenization_spaces": true,
5
+ "cls_token": "<s>",
6
+ "eos_token": "</s>",
7
+ "errors": "replace",
8
+ "mask_token": "<mask>",
9
+ "max_length": null,
10
+ "model_max_length": 512,
11
+ "pad_to_multiple_of": null,
12
+ "pad_token": "<pad>",
13
+ "pad_token_type_id": 0,
14
+ "padding_side": "right",
15
+ "processor_class": "ClapProcessor",
16
+ "sep_token": "</s>",
17
+ "tokenizer_class": "RobertaTokenizer",
18
+ "trim_offsets": true,
19
+ "unk_token": "<unk>"
20
+ }
tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_2/special_tokens_map.json ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "eos_token": "</s>",
105
+ "pad_token": "<pad>",
106
+ "unk_token": "<unk>"
107
+ }
tokenizer_2/spiece.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d60acb128cf7b7f2536e8f38a5b18a05535c9e14c7a355904270e15b0945ea86
3
+ size 791656
tokenizer_2/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_2/tokenizer_config.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<extra_id_0>",
4
+ "<extra_id_1>",
5
+ "<extra_id_2>",
6
+ "<extra_id_3>",
7
+ "<extra_id_4>",
8
+ "<extra_id_5>",
9
+ "<extra_id_6>",
10
+ "<extra_id_7>",
11
+ "<extra_id_8>",
12
+ "<extra_id_9>",
13
+ "<extra_id_10>",
14
+ "<extra_id_11>",
15
+ "<extra_id_12>",
16
+ "<extra_id_13>",
17
+ "<extra_id_14>",
18
+ "<extra_id_15>",
19
+ "<extra_id_16>",
20
+ "<extra_id_17>",
21
+ "<extra_id_18>",
22
+ "<extra_id_19>",
23
+ "<extra_id_20>",
24
+ "<extra_id_21>",
25
+ "<extra_id_22>",
26
+ "<extra_id_23>",
27
+ "<extra_id_24>",
28
+ "<extra_id_25>",
29
+ "<extra_id_26>",
30
+ "<extra_id_27>",
31
+ "<extra_id_28>",
32
+ "<extra_id_29>",
33
+ "<extra_id_30>",
34
+ "<extra_id_31>",
35
+ "<extra_id_32>",
36
+ "<extra_id_33>",
37
+ "<extra_id_34>",
38
+ "<extra_id_35>",
39
+ "<extra_id_36>",
40
+ "<extra_id_37>",
41
+ "<extra_id_38>",
42
+ "<extra_id_39>",
43
+ "<extra_id_40>",
44
+ "<extra_id_41>",
45
+ "<extra_id_42>",
46
+ "<extra_id_43>",
47
+ "<extra_id_44>",
48
+ "<extra_id_45>",
49
+ "<extra_id_46>",
50
+ "<extra_id_47>",
51
+ "<extra_id_48>",
52
+ "<extra_id_49>",
53
+ "<extra_id_50>",
54
+ "<extra_id_51>",
55
+ "<extra_id_52>",
56
+ "<extra_id_53>",
57
+ "<extra_id_54>",
58
+ "<extra_id_55>",
59
+ "<extra_id_56>",
60
+ "<extra_id_57>",
61
+ "<extra_id_58>",
62
+ "<extra_id_59>",
63
+ "<extra_id_60>",
64
+ "<extra_id_61>",
65
+ "<extra_id_62>",
66
+ "<extra_id_63>",
67
+ "<extra_id_64>",
68
+ "<extra_id_65>",
69
+ "<extra_id_66>",
70
+ "<extra_id_67>",
71
+ "<extra_id_68>",
72
+ "<extra_id_69>",
73
+ "<extra_id_70>",
74
+ "<extra_id_71>",
75
+ "<extra_id_72>",
76
+ "<extra_id_73>",
77
+ "<extra_id_74>",
78
+ "<extra_id_75>",
79
+ "<extra_id_76>",
80
+ "<extra_id_77>",
81
+ "<extra_id_78>",
82
+ "<extra_id_79>",
83
+ "<extra_id_80>",
84
+ "<extra_id_81>",
85
+ "<extra_id_82>",
86
+ "<extra_id_83>",
87
+ "<extra_id_84>",
88
+ "<extra_id_85>",
89
+ "<extra_id_86>",
90
+ "<extra_id_87>",
91
+ "<extra_id_88>",
92
+ "<extra_id_89>",
93
+ "<extra_id_90>",
94
+ "<extra_id_91>",
95
+ "<extra_id_92>",
96
+ "<extra_id_93>",
97
+ "<extra_id_94>",
98
+ "<extra_id_95>",
99
+ "<extra_id_96>",
100
+ "<extra_id_97>",
101
+ "<extra_id_98>",
102
+ "<extra_id_99>"
103
+ ],
104
+ "clean_up_tokenization_spaces": true,
105
+ "eos_token": "</s>",
106
+ "extra_ids": 100,
107
+ "model_max_length": 128,
108
+ "pad_token": "<pad>",
109
+ "sp_model_kwargs": {},
110
+ "tokenizer_class": "T5Tokenizer",
111
+ "unk_token": "<unk>"
112
+ }
unet/config.json ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AudioLDM2UNet2DConditionModel",
3
+ "_diffusers_version": "0.20.0.dev0",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 128,
8
+ 256,
9
+ 384,
10
+ 640
11
+ ],
12
+ "class_embed_type": null,
13
+ "class_embeddings_concat": false,
14
+ "conv_in_kernel": 3,
15
+ "conv_out_kernel": 3,
16
+ "cross_attention_dim": [
17
+ [
18
+ null,
19
+ 768,
20
+ 1024
21
+ ],
22
+ [
23
+ null,
24
+ 768,
25
+ 1024
26
+ ],
27
+ [
28
+ null,
29
+ 768,
30
+ 1024
31
+ ],
32
+ [
33
+ null,
34
+ 768,
35
+ 1024
36
+ ]
37
+ ],
38
+ "down_block_types": [
39
+ "DownBlock2D",
40
+ "CrossAttnDownBlock2D",
41
+ "CrossAttnDownBlock2D",
42
+ "CrossAttnDownBlock2D"
43
+ ],
44
+ "downsample_padding": 1,
45
+ "flip_sin_to_cos": true,
46
+ "freq_shift": 0,
47
+ "in_channels": 8,
48
+ "layers_per_block": 2,
49
+ "mid_block_scale_factor": 1,
50
+ "mid_block_type": "UNetMidBlock2DCrossAttn",
51
+ "norm_eps": 1e-05,
52
+ "norm_num_groups": 32,
53
+ "num_attention_heads": null,
54
+ "num_class_embeds": null,
55
+ "only_cross_attention": false,
56
+ "out_channels": 8,
57
+ "projection_class_embeddings_input_dim": null,
58
+ "resnet_time_scale_shift": "default",
59
+ "sample_size": 256,
60
+ "time_cond_proj_dim": null,
61
+ "time_embedding_act_fn": null,
62
+ "time_embedding_dim": null,
63
+ "time_embedding_type": "positional",
64
+ "timestep_post_act": null,
65
+ "transformer_layers_per_block": 1,
66
+ "up_block_types": [
67
+ "CrossAttnUpBlock2D",
68
+ "CrossAttnUpBlock2D",
69
+ "CrossAttnUpBlock2D",
70
+ "UpBlock2D"
71
+ ],
72
+ "upcast_attention": false,
73
+ "use_linear_projection": false
74
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d8d6f8f65e32c7a72aa6c9b7e87debe93e71e5a94669522f3c5ced98b238df9
3
+ size 1388420361
unet/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:359a5ffb89a844beb2fcfac584aae2cd7cd6e87c3ab1ec4e892ef45d91db77c2
3
+ size 1387964784
vae/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.20.0.dev0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512
9
+ ],
10
+ "down_block_types": [
11
+ "DownEncoderBlock2D",
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D"
14
+ ],
15
+ "force_upcast": true,
16
+ "in_channels": 1,
17
+ "latent_channels": 8,
18
+ "layers_per_block": 2,
19
+ "norm_num_groups": 32,
20
+ "out_channels": 1,
21
+ "sample_size": 1024,
22
+ "scaling_factor": 0.4110932946205139,
23
+ "up_block_types": [
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D"
27
+ ]
28
+ }
vae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3494aadd9cf3e3f0cbb4e913f9b35a25da4a3cb709852e204b667ae5890f758
3
+ size 221586761
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8ddddc5c45eddaab38a67a434e8a64486964540ba3fc248a0da7cbd599d4ad
3
+ size 221530308
vocoder/config.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "SpeechT5HifiGan"
4
+ ],
5
+ "initializer_range": 0.01,
6
+ "leaky_relu_slope": 0.1,
7
+ "model_in_dim": 64,
8
+ "model_type": "hifigan",
9
+ "normalize_before": false,
10
+ "resblock_dilation_sizes": [
11
+ [
12
+ 1,
13
+ 3,
14
+ 5
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 5
20
+ ],
21
+ [
22
+ 1,
23
+ 3,
24
+ 5
25
+ ]
26
+ ],
27
+ "resblock_kernel_sizes": [
28
+ 3,
29
+ 7,
30
+ 11
31
+ ],
32
+ "sampling_rate": 16000,
33
+ "torch_dtype": "float32",
34
+ "transformers_version": "4.32.0.dev0",
35
+ "upsample_initial_channel": 1024,
36
+ "upsample_kernel_sizes": [
37
+ 16,
38
+ 16,
39
+ 8,
40
+ 4,
41
+ 4
42
+ ],
43
+ "upsample_rates": [
44
+ 5,
45
+ 4,
46
+ 2,
47
+ 2,
48
+ 2
49
+ ]
50
+ }
vocoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9dc6513c30a5b86c2497712690c04fe74b4aa79fdab6d490b34fcb4e24c590c
3
+ size 221079092
vocoder/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9fbefc2b31c85d1dabe98e53d09ac88039af411162a7e641040a9c2b5f62364
3
+ size 221120349