Pur1zumu committed on
Commit 4c0b0e0 · verified · 1 Parent(s): 028c696

Upload folder using huggingface_hub

content-vec-best/.gitattributes ADDED
@@ -0,0 +1,34 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
content-vec-best/.gitignore ADDED
@@ -0,0 +1 @@
+ content-vec-best-legacy-500.pt
content-vec-best/README.md ADDED
@@ -0,0 +1,33 @@
+ ---
+ license: mit
+ ---
+
+ # Content Vec Best
+ Official Repo: [ContentVec](https://github.com/auspicious3000/contentvec)
+ This repo brings the fairseq ContentVec model to HuggingFace Transformers.
+
+ ## How to use
+ To use this model, you need to define
+ ```python
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         # The final projection layer is only used for backward compatibility.
+         # Following https://github.com/auspicious3000/contentvec/issues/6,
+         # keeping this layer out of the forward pass is necessary to achieve the desired outcome.
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+ ```
+
+ and then load the model with
+ ```python
+ model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
+
+ x = model(audio)["last_hidden_state"]
+ ```
+
+ ## How to convert
+ You need to download the ContentVec_legacy model from the official repo, and then run
+ ```bash
+ python convert.py
+ ```
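For context, here is a fuller, self-contained sketch of the usage the README describes. The random stand-in tensor is illustrative (ContentVec, like HuBERT, expects 16 kHz mono audio); only the class definition and the `from_pretrained` call come from the README itself:

```python
import torch
from torch import nn
from transformers import HubertModel


class HubertModelWithFinalProj(HubertModel):
    def __init__(self, config):
        super().__init__(config)
        # Defined so the checkpoint's final_proj weights load;
        # the base forward() never applies it.
        self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)


model = HubertModelWithFinalProj.from_pretrained("lengyue233/content-vec-best")
model.eval()

# Stand-in for one second of real 16 kHz mono audio, shape (batch, samples).
audio = torch.randn(1, 16000)

with torch.no_grad():
    features = model(audio)["last_hidden_state"]  # (batch, frames, 768)
print(features.shape)
```

Because `final_proj` is declared but never called, `last_hidden_state` is the raw 768-dimensional encoder output, which is what downstream voice-conversion pipelines typically consume.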
content-vec-best/config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "activation_dropout": 0.1,
+   "apply_spec_augment": true,
+   "architectures": [
+     "HubertModelWithFinalProj"
+   ],
+   "attention_dropout": 0.1,
+   "bos_token_id": 1,
+   "classifier_proj_size": 256,
+   "conv_bias": false,
+   "conv_dim": [
+     512,
+     512,
+     512,
+     512,
+     512,
+     512,
+     512
+   ],
+   "conv_kernel": [
+     10,
+     3,
+     3,
+     3,
+     3,
+     2,
+     2
+   ],
+   "conv_stride": [
+     5,
+     2,
+     2,
+     2,
+     2,
+     2,
+     2
+   ],
+   "ctc_loss_reduction": "sum",
+   "ctc_zero_infinity": false,
+   "do_stable_layer_norm": false,
+   "eos_token_id": 2,
+   "feat_extract_activation": "gelu",
+   "feat_extract_norm": "group",
+   "feat_proj_dropout": 0.0,
+   "feat_proj_layer_norm": true,
+   "final_dropout": 0.1,
+   "hidden_act": "gelu",
+   "hidden_dropout": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "layerdrop": 0.1,
+   "mask_feature_length": 10,
+   "mask_feature_min_masks": 0,
+   "mask_feature_prob": 0.0,
+   "mask_time_length": 10,
+   "mask_time_min_masks": 2,
+   "mask_time_prob": 0.05,
+   "model_type": "hubert",
+   "num_attention_heads": 12,
+   "num_conv_pos_embedding_groups": 16,
+   "num_conv_pos_embeddings": 128,
+   "num_feat_extract_layers": 7,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "torch_dtype": "float32",
+   "transformers_version": "4.27.3",
+   "use_weighted_layer_sum": false,
+   "vocab_size": 32
+ }
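One derived quantity worth noting from the config above: the product of the `conv_stride` values is the feature extractor's hop size in samples, which sets the output frame rate. A quick sketch of the arithmetic (the 16 kHz sample rate is an assumption carried over from HuBERT-style models; it is not recorded in the config itself):

```python
from math import prod

conv_stride = [5, 2, 2, 2, 2, 2, 2]  # from config.json above

hop = prod(conv_stride)  # samples per output frame
print(hop)               # 320
print(16000 / hop)       # 50.0 -> roughly 50 feature frames per second at 16 kHz
```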
content-vec-best/convert.py ADDED
@@ -0,0 +1,151 @@
+ import torch
+ from torch import nn
+ from transformers import HubertConfig, HubertModel
+ import logging
+
+ # Silence fairseq's verbose loggers
+ logging.getLogger("fairseq").setLevel(logging.WARNING)
+ logging.getLogger("torch.distributed.nn.jit.instantiator").setLevel(logging.WARNING)
+
+ from fairseq import checkpoint_utils
+
+ models, _, _ = checkpoint_utils.load_model_ensemble_and_task(
+     ["content-vec-best-legacy-500.pt"], suffix=""
+ )
+ model = models[0]
+ model.eval()
+
+
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+ # Default Config
+ hubert = HubertModelWithFinalProj(HubertConfig())
+
+ # Mapping from huggingface keys to fairseq keys
+ mapping = {
+     "masked_spec_embed": "mask_emb",
+     "encoder.layer_norm.bias": "encoder.layer_norm.bias",
+     "encoder.layer_norm.weight": "encoder.layer_norm.weight",
+     "encoder.pos_conv_embed.conv.bias": "encoder.pos_conv.0.bias",
+     "encoder.pos_conv_embed.conv.weight_g": "encoder.pos_conv.0.weight_g",
+     "encoder.pos_conv_embed.conv.weight_v": "encoder.pos_conv.0.weight_v",
+     "feature_projection.layer_norm.bias": "layer_norm.bias",
+     "feature_projection.layer_norm.weight": "layer_norm.weight",
+     "feature_projection.projection.bias": "post_extract_proj.bias",
+     "feature_projection.projection.weight": "post_extract_proj.weight",
+     "final_proj.bias": "final_proj.bias",
+     "final_proj.weight": "final_proj.weight",
+ }
+
+ # Convert encoder layers
+ for layer in range(12):
+     for j in ["q", "k", "v"]:
+         mapping[
+             f"encoder.layers.{layer}.attention.{j}_proj.weight"
+         ] = f"encoder.layers.{layer}.self_attn.{j}_proj.weight"
+         mapping[
+             f"encoder.layers.{layer}.attention.{j}_proj.bias"
+         ] = f"encoder.layers.{layer}.self_attn.{j}_proj.bias"
+
+     mapping[
+         f"encoder.layers.{layer}.final_layer_norm.bias"
+     ] = f"encoder.layers.{layer}.final_layer_norm.bias"
+     mapping[
+         f"encoder.layers.{layer}.final_layer_norm.weight"
+     ] = f"encoder.layers.{layer}.final_layer_norm.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.layer_norm.bias"
+     ] = f"encoder.layers.{layer}.self_attn_layer_norm.bias"
+     mapping[
+         f"encoder.layers.{layer}.layer_norm.weight"
+     ] = f"encoder.layers.{layer}.self_attn_layer_norm.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.attention.out_proj.bias"
+     ] = f"encoder.layers.{layer}.self_attn.out_proj.bias"
+     mapping[
+         f"encoder.layers.{layer}.attention.out_proj.weight"
+     ] = f"encoder.layers.{layer}.self_attn.out_proj.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.intermediate_dense.bias"
+     ] = f"encoder.layers.{layer}.fc1.bias"
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.intermediate_dense.weight"
+     ] = f"encoder.layers.{layer}.fc1.weight"
+
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.output_dense.bias"
+     ] = f"encoder.layers.{layer}.fc2.bias"
+     mapping[
+         f"encoder.layers.{layer}.feed_forward.output_dense.weight"
+     ] = f"encoder.layers.{layer}.fc2.weight"
+
+ # Convert conv layers
+ for layer in range(7):
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.conv.weight"
+     ] = f"feature_extractor.conv_layers.{layer}.0.weight"
+
+     # Only the first conv layer carries a norm when feat_extract_norm="group"
+     if layer != 0:
+         continue
+
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.layer_norm.weight"
+     ] = f"feature_extractor.conv_layers.{layer}.2.weight"
+     mapping[
+         f"feature_extractor.conv_layers.{layer}.layer_norm.bias"
+     ] = f"feature_extractor.conv_layers.{layer}.2.bias"
+
+ # Report any keys left uncovered by the mapping
+ hf_keys = set(hubert.state_dict().keys())
+ fair_keys = set(model.state_dict().keys())
+
+ hf_keys -= set(mapping.keys())
+ fair_keys -= set(mapping.values())
+
+ for i, j in zip(sorted(hf_keys), sorted(fair_keys)):
+     print(i, j)
+
+ print(hf_keys, fair_keys)
+ print(len(hf_keys), len(fair_keys))
+
+ # Try loading the weights
+ new_state_dict = {}
+ for k, v in mapping.items():
+     new_state_dict[k] = model.state_dict()[v]
+
+ x = hubert.load_state_dict(new_state_dict, strict=False)
+ print(x)
+ hubert.eval()
+
+ # Sanity check: layer-9 features (through final_proj) should match fairseq
+ with torch.no_grad():
+     new_input = torch.randn(1, 16384)
+
+     result1 = hubert(new_input, output_hidden_states=True)["hidden_states"][9]
+     result1 = hubert.final_proj(result1)
+
+     result2 = model.extract_features(
+         **{
+             "source": new_input,
+             "padding_mask": torch.zeros(1, 16384, dtype=torch.bool),
+             "output_layer": 9,
+         }
+     )[0]
+     result2 = model.final_proj(result2)
+
+     assert torch.allclose(result1, result2, atol=1e-3)
+
+ print("Sanity check passed")
+
+ # Save huggingface model
+ hubert.save_pretrained(".")
+ print("Saved model")
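A follow-up check one might run after `convert.py` finishes: reload the directory written by `save_pretrained(".")` and confirm it reproduces the in-memory model. This is a sketch, not part of the original script, and assumes `HubertModelWithFinalProj` and the converted `hubert` from the script are still in scope:

```python
import torch

reloaded = HubertModelWithFinalProj.from_pretrained(".")
reloaded.eval()

with torch.no_grad():
    probe = torch.randn(1, 16384)
    a = hubert(probe)["last_hidden_state"]
    b = reloaded(probe)["last_hidden_state"]

# float32 weights round-trip through save_pretrained/from_pretrained,
# so the two outputs should agree to tight tolerance.
assert torch.allclose(a, b, atol=1e-6)
print("Reload check passed")
```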
content-vec-best/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d8dd400e054ddf4e6be75dab5a2549db748cc99e756a097c496c099f65a4854e
+ size 378342945
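The pointer above stores only the blob's SHA-256 and byte size; Git LFS fetches the actual weights separately. A minimal way to verify a downloaded copy against the recorded oid (the local path is an assumption):

```python
import hashlib

h = hashlib.sha256()
with open("content-vec-best/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
        h.update(chunk)

# Should print the oid recorded in the pointer: d8dd400e05...
print(h.hexdigest())
```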