MUSTAR committed on
Commit
7105a54
·
verified ·
1 Parent(s): 170a68a

Upload 2 files

Browse files
Scripts/prepare.sh ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Data preparation pipeline: build wav2vec-style manifests for ./dataset,
# drop mostly-silent files from them, then dump HuBERT features.
#
# Abort on the first failing step so later steps never run on stale or
# missing manifests. `-u` is deliberately left off: conda's activation
# scripts may reference unset variables (NOTE(review): confirm for the
# conda version in use before enabling it).
set -eo pipefail

# Allow the conda install location to be overridden from the environment;
# the default is the original hard-coded location.
CONDA_ROOT="${CONDA_ROOT:-/home/$(whoami)/miniconda3}"

source "${CONDA_ROOT}/etc/profile.d/conda.sh"
conda activate contentvec

mkdir -p feature/lab

# Generate manifest files
python3 fairseq/examples/wav2vec/wav2vec_manifest.py dataset --dest feature --valid-percent 0.1

# Filter out files with silence and update manifests
python remove_silence_files.py feature/train.tsv feature/valid.tsv feature/filtered

cp feature/filtered/train.tsv feature/lab/train.tsv
cp feature/filtered/valid.tsv feature/lab/valid.tsv

# Continue with feature extraction.
# Replace fairseq's stock feature dumper with the local copy.
cp -f dump_hubert_feature.py fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py

tsv_dir="feature/lab"
split="train"
ckpt_path="checkpoint_best_legacy_500.pt"
layer=12
nshard=1
rank=0
feat_dir="feature"
# The next three settings are not referenced by any command in this script;
# presumably they belong to later k-means/labelling steps — TODO confirm.
km_path="feature/${split}.km"
lab_dir="feature/lab"
n_clusters=100

python speaker.py

# Extract features
python fairseq/examples/hubert/simple_kmeans/dump_hubert_feature.py \
    "$tsv_dir" "$split" "$ckpt_path" "$layer" "$nshard" "$rank" "$feat_dir"
Scripts/remove_silence_files.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import soundfile as sf
4
+ from tqdm import tqdm
5
+
6
def is_significant_audio(file_path, silence_threshold=-40, silence_percent=90):
    """
    Check if an audio file contains significant non-silent parts.

    A frame counts as silent when its instantaneous power (averaged over
    channels for multi-channel audio) is below ``silence_threshold`` dB.
    The file is considered significant when fewer than ``silence_percent``
    percent of its frames are silent.

    Args:
        file_path: Path of the audio file to inspect (read via ``sf.read``).
        silence_threshold: Power threshold in decibels. Assumes samples are
            floats scaled to [-1.0, 1.0], so 0 dB is full scale
            (NOTE(review): this is soundfile's documented default dtype;
            confirm if the read call is ever changed).
        silence_percent: Percentage of silent frames at or above which the
            file is rejected.

    Returns:
        True if the file has enough non-silent frames; False if it is
        empty, mostly silent, or could not be read.
    """
    try:
        data, _samplerate = sf.read(file_path)
    except Exception as e:
        # Best effort: an unreadable file is reported and treated as
        # unusable instead of aborting the whole manifest pass.
        print(f"Error processing {file_path}: {e}")
        return False

    if len(data) == 0:
        return False  # Empty file

    # Per-frame power. The previous code reduced the whole file to a single
    # mean and compared that (non-negative) linear value with the negative
    # dB threshold, so nothing was ever classified as silent.
    power = data ** 2
    if power.ndim > 1:
        power = power.mean(axis=1)  # collapse channels -> one value per frame

    # Convert the dB threshold to linear power once instead of taking a
    # log of every sample (which would also hit log(0) on digital silence).
    power_floor = 10 ** (silence_threshold / 10)
    silence_ratio = (power < power_floor).sum() / len(data) * 100
    return bool(silence_ratio < silence_percent)
22
+
23
def filter_manifest(manifest_path, output_path, dataset_dir):
    """
    Read the manifest file, check for silence, and write filtered files.

    The first line of the manifest is copied through unchanged as the
    header. Every following line is kept only if the audio file named in
    its first tab-separated column (resolved against ``dataset_dir``)
    passes ``is_significant_audio``.

    Args:
        manifest_path: Path of the input manifest (.tsv).
        output_path: Path the filtered manifest is written to.
        dataset_dir: Directory the manifest's relative paths are joined to.
    """
    with open(manifest_path, 'r') as f:
        lines = f.readlines()

    if not lines:
        # No header at all: indexing lines[0] would raise IndexError.
        # Mirror the empty input so downstream copy steps still find a file.
        print(f"Manifest is empty, nothing to filter: {manifest_path}")
        with open(output_path, 'w'):
            pass
        return

    filtered_lines = [lines[0]]  # Keep the header
    for line in tqdm(lines[1:], desc=f"Processing {manifest_path}"):
        # Strip the line ending first so a row without a tab does not leak
        # its trailing newline into the file name.
        rel_path = line.rstrip("\r\n").split("\t", 1)[0]
        if not rel_path:
            continue  # blank row: not an audio entry, nothing to check

        file_path = os.path.join(dataset_dir, rel_path)
        if is_significant_audio(file_path):
            filtered_lines.append(line)
        else:
            print(f"Skipping file due to silence: {file_path}")

    with open(output_path, 'w') as f_out:
        f_out.writelines(filtered_lines)
40
+
41
if __name__ == "__main__":
    # Usage:
    #   python remove_silence_files.py TRAIN_TSV VALID_TSV OUTPUT_DIR [DATASET_DIR]
    # DATASET_DIR is optional and defaults to "dataset", the value that was
    # previously hard-coded here.
    if len(sys.argv) < 4:
        sys.exit(
            f"Usage: {sys.argv[0]} <train_manifest> <valid_manifest> "
            "<output_dir> [dataset_dir]"
        )

    train_manifest, valid_manifest, output_dir = sys.argv[1:4]
    dataset_dir = sys.argv[4] if len(sys.argv) > 4 else "dataset"

    os.makedirs(output_dir, exist_ok=True)

    # Output names are fixed because the calling script copies exactly
    # train.tsv and valid.tsv out of the output directory.
    for manifest, out_name in (
        (train_manifest, "train.tsv"),
        (valid_manifest, "valid.tsv"),
    ):
        filter_manifest(manifest, os.path.join(output_dir, out_name), dataset_dir)