kjysmu committed on
Commit
51f9444
·
verified ·
1 Parent(s): 4c53a91

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference/input/test.mp3 filter=lfs diff=lfs merge=lfs -text
inference/.DS_Store ADDED
Binary file (8.2 kB). View file
 
inference/data/base_config.yaml ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: "1.34"
2
+
3
+ lr: 1e-4
4
+ log_step: 1
5
+ split: 0
6
+ batch_size: 8
7
+ sr: 16000
8
+
9
+ datasets:
10
+ - jamendo
11
+ - emomusic
12
+ - pmemo
13
+ - deam
14
+
15
+ model:
16
+ encoder: "MERT"
17
+ layers:
18
+ - 5
19
+ - 6
20
+ classifier: "linear-mt-attn-ck"
21
+ # - linear
22
+ # - linear-attn-ck
23
+ # - linear-mt-attn-ck
24
+
25
+ kd: True
26
+ kd_weight: 0.8
27
+ kd_temperature: 1
28
+ lr: 1e-4
29
+
30
+ # audio_path: './dataset/jamendo'
31
+ # subset: 'moodtheme'
32
+
33
+ dataset:
34
+ jamendo:
35
+ root: './dataset/jamendo'
36
+ subset: 'moodtheme'
37
+ batch_size: 8
38
+ output_size : 56
39
+ split: 0
40
+ segment_type: "all" # [all,f10s,f30s,10s,30s]
41
+ num_workers: 4
42
+ deam:
43
+ root: './dataset/deam'
44
+ batch_size: 8
45
+ output_size : 2
46
+ segment_type: "all" # [all,f10s,f30s,10s,30s]
47
+ num_workers: 4
48
+ pmemo:
49
+ root: './dataset/pmemo'
50
+ batch_size: 8
51
+ output_size : 2
52
+ segment_type: "all" # [all,f10s,f30s,10s,30s]
53
+ num_workers: 4
54
+ emomusic:
55
+ root: './dataset/emomusic'
56
+ batch_size: 8
57
+ output_size : 2
58
+ segment_type: "all" # [all,f10s,f30s,10s,30s]
59
+ num_workers: 4
60
+
61
+
62
+
63
+ # --------------------------------------- #
64
+ genre_class_size: 87
65
+ mood_class_size: 56
66
+ instr_class_size: 40
67
+ dac_latents_size: 72
68
+ dac_rvq_size: 9
69
+ # --------------------------------------- #
70
+
71
+
72
+ #PMEMO BEST (0.5360 0.7772), mt: (0.5401 0.7780)
73
+ checkpoint_pmemo: "tb_logs/best/P.ckpt"
74
+
75
+ #DEAM BEST (0.5131 0.6025), mt: (0.5150 0.6125)
76
+ checkpoint_deam: "tb_logs/best/D.ckpt"
77
+
78
+ #EMOMUSIC BEST (0.5957 0.7489), mt: (0.6091 0.7525)
79
+ checkpoint_emomusic: "tb_logs/best/E.ckpt"
80
+
81
+ #JAMENDO BEST (0.1521 0.7806)
82
+ checkpoint_jamendo: "tb_logs/best/J.ckpt"
83
+
84
+
85
+
86
+
87
+ # datasets:
88
+ # - jamendo
89
+ # - pmemo
90
+ # - deam
91
+ # - emomusic
92
+ # - pmemo
93
+ # - jamendo
94
+ # datasets_val:
95
+ # - emomusic
96
+ # model_save_path: './saved_models/'
97
+ # results_save_path: './results/'
98
+ # hydra:
99
+ # job:
100
+ # chdir: True
101
+ # - MERT M2L LIBROSA Encodec DAC
102
+ # aggr_method: "mean"
103
+ # - mean
104
+ # - median
105
+ # - 80th_percentile
106
+ # - max
inference/data/btc_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c2c5db17e8c43b8a9a9da5db36ef2d667158c07a214eba16344c154c00bf54
3
+ size 12154754
inference/data/btc_model_large_voca.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1673d23f8f9a55ae7f9e8b80a51da616debb22675b8d8b67ea6ce0ef37b0ab51
3
+ size 12229576
inference/data/chord.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"N": 0, "C": 1, "C:dim": 2, "C:sus4": 3, "C:min7": 4, "C:min": 5, "C:sus2": 6, "C:aug": 7, "C:dim7": 8, "C:maj6": 9, "C:hdim7": 10, "C:7": 11, "C:min6": 12, "C:maj7": 13, "C#": 14, "C#:dim": 15, "C#:sus4": 16, "C#:min7": 17, "C#:min": 18, "C#:sus2": 19, "C#:aug": 20, "C#:dim7": 21, "C#:maj6": 22, "C#:hdim7": 23, "C#:7": 24, "C#:min6": 25, "C#:maj7": 26, "D": 27, "D:dim": 28, "D:sus4": 29, "D:min7": 30, "D:min": 31, "D:sus2": 32, "D:aug": 33, "D:dim7": 34, "D:maj6": 35, "D:hdim7": 36, "D:7": 37, "D:min6": 38, "D:maj7": 39, "D#": 40, "D#:dim": 41, "D#:sus4": 42, "D#:min7": 43, "D#:min": 44, "D#:sus2": 45, "D#:aug": 46, "D#:dim7": 47, "D#:maj6": 48, "D#:hdim7": 49, "D#:7": 50, "D#:min6": 51, "D#:maj7": 52, "E": 53, "E:dim": 54, "E:sus4": 55, "E:min7": 56, "E:min": 57, "E:sus2": 58, "E:aug": 59, "E:dim7": 60, "E:maj6": 61, "E:hdim7": 62, "E:7": 63, "E:min6": 64, "E:maj7": 65, "F": 66, "F:dim": 67, "F:sus4": 68, "F:min7": 69, "F:min": 70, "F:sus2": 71, "F:aug": 72, "F:dim7": 73, "F:maj6": 74, "F:hdim7": 75, "F:7": 76, "F:min6": 77, "F:maj7": 78, "F#": 79, "F#:dim": 80, "F#:sus4": 81, "F#:min7": 82, "F#:min": 83, "F#:sus2": 84, "F#:aug": 85, "F#:dim7": 86, "F#:maj6": 87, "F#:hdim7": 88, "F#:7": 89, "F#:min6": 90, "F#:maj7": 91, "G": 92, "G:dim": 93, "G:sus4": 94, "G:min7": 95, "G:min": 96, "G:sus2": 97, "G:aug": 98, "G:dim7": 99, "G:maj6": 100, "G:hdim7": 101, "G:7": 102, "G:min6": 103, "G:maj7": 104, "G#": 105, "G#:dim": 106, "G#:sus4": 107, "G#:min7": 108, "G#:min": 109, "G#:sus2": 110, "G#:aug": 111, "G#:dim7": 112, "G#:maj6": 113, "G#:hdim7": 114, "G#:7": 115, "G#:min6": 116, "G#:maj7": 117, "A": 118, "A:dim": 119, "A:sus4": 120, "A:min7": 121, "A:min": 122, "A:sus2": 123, "A:aug": 124, "A:dim7": 125, "A:maj6": 126, "A:hdim7": 127, "A:7": 128, "A:min6": 129, "A:maj7": 130, "A#": 131, "A#:dim": 132, "A#:sus4": 133, "A#:min7": 134, "A#:min": 135, "A#:sus2": 136, "A#:aug": 137, "A#:dim7": 138, "A#:maj6": 139, "A#:hdim7": 140, "A#:7": 141, "A#:min6": 142, "A#:maj7": 
143, "B": 144, "B:dim": 145, "B:sus4": 146, "B:min7": 147, "B:min": 148, "B:sus2": 149, "B:aug": 150, "B:dim7": 151, "B:maj6": 152, "B:hdim7": 153, "B:7": 154, "B:min6": 155, "B:maj7": 156, "X": 157 }
inference/data/chord_attr.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"N": 0, "maj": 1, "dim": 2, "sus4": 3, "min7": 4, "min": 5, "sus2": 6, "aug": 7, "dim7": 8, "maj6": 9, "hdim7": 10, "7": 11, "min6": 12, "maj7": 13}
inference/data/chord_attr_inv.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "N",
3
+ "1": "maj",
4
+ "2": "dim",
5
+ "3": "sus4",
6
+ "4": "min7",
7
+ "5": "min",
8
+ "6": "sus2",
9
+ "7": "aug",
10
+ "8": "dim7",
11
+ "9": "maj6",
12
+ "10": "hdim7",
13
+ "11": "7",
14
+ "12": "min6",
15
+ "13": "maj7"
16
+ }
inference/data/chord_inv.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "N", "1": "C", "2": "C:dim", "3": "C:sus4", "4": "C:min7", "5": "C:min", "6": "C:sus2", "7": "C:aug", "8": "C:dim7", "9": "C:maj6", "10": "C:hdim7", "11": "C:7", "12": "C:min6", "13": "C:maj7", "14": "C#", "15": "C#:dim", "16": "C#:sus4", "17": "C#:min7", "18": "C#:min", "19": "C#:sus2", "20": "C#:aug", "21": "C#:dim7", "22": "C#:maj6", "23": "C#:hdim7", "24": "C#:7", "25": "C#:min6", "26": "C#:maj7", "27": "D", "28": "D:dim", "29": "D:sus4", "30": "D:min7", "31": "D:min", "32": "D:sus2", "33": "D:aug", "34": "D:dim7", "35": "D:maj6", "36": "D:hdim7", "37": "D:7", "38": "D:min6", "39": "D:maj7", "40": "D#", "41": "D#:dim", "42": "D#:sus4", "43": "D#:min7", "44": "D#:min", "45": "D#:sus2", "46": "D#:aug", "47": "D#:dim7", "48": "D#:maj6", "49": "D#:hdim7", "50": "D#:7", "51": "D#:min6", "52": "D#:maj7", "53": "E", "54": "E:dim", "55": "E:sus4", "56": "E:min7", "57": "E:min", "58": "E:sus2", "59": "E:aug", "60": "E:dim7", "61": "E:maj6", "62": "E:hdim7", "63": "E:7", "64": "E:min6", "65": "E:maj7", "66": "F", "67": "F:dim", "68": "F:sus4", "69": "F:min7", "70": "F:min", "71": "F:sus2", "72": "F:aug", "73": "F:dim7", "74": "F:maj6", "75": "F:hdim7", "76": "F:7", "77": "F:min6", "78": "F:maj7", "79": "F#", "80": "F#:dim", "81": "F#:sus4", "82": "F#:min7", "83": "F#:min", "84": "F#:sus2", "85": "F#:aug", "86": "F#:dim7", "87": "F#:maj6", "88": "F#:hdim7", "89": "F#:7", "90": "F#:min6", "91": "F#:maj7", "92": "G", "93": "G:dim", "94": "G:sus4", "95": "G:min7", "96": "G:min", "97": "G:sus2", "98": "G:aug", "99": "G:dim7", "100": "G:maj6", "101": "G:hdim7", "102": "G:7", "103": "G:min6", "104": "G:maj7", "105": "G#", "106": "G#:dim", "107": "G#:sus4", "108": "G#:min7", "109": "G#:min", "110": "G#:sus2", "111": "G#:aug", "112": "G#:dim7", "113": "G#:maj6", "114": "G#:hdim7", "115": "G#:7", "116": "G#:min6", "117": "G#:maj7", "118": "A", "119": "A:dim", "120": "A:sus4", "121": "A:min7", "122": "A:min", "123": "A:sus2", "124": "A:aug", "125": "A:dim7", "126": 
"A:maj6", "127": "A:hdim7", "128": "A:7", "129": "A:min6", "130": "A:maj7", "131": "A#", "132": "A#:dim", "133": "A#:sus4", "134": "A#:min7", "135": "A#:min", "136": "A#:sus2", "137": "A#:aug", "138": "A#:dim7", "139": "A#:maj6", "140": "A#:hdim7", "141": "A#:7", "142": "A#:min6", "143": "A#:maj7", "144": "B", "145": "B:dim", "146": "B:sus4", "147": "B:min7", "148": "B:min", "149": "B:sus2", "150": "B:aug", "151": "B:dim7", "152": "B:maj6", "153": "B:hdim7", "154": "B:7", "155": "B:min6", "156": "B:maj7", "157": "X"}
inference/data/chord_root.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"N": 0, "C": 1, "C#": 2, "D": 3, "D#": 4, "E": 5, "F": 6, "F#": 7, "G": 8, "G#": 9, "A": 10, "A#": 11, "B": 12}
inference/data/chord_root_inv.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "0": "N",
3
+ "1": "C",
4
+ "2": "C#",
5
+ "3": "D",
6
+ "4": "D#",
7
+ "5": "E",
8
+ "6": "F",
9
+ "7": "F#",
10
+ "8": "G",
11
+ "9": "G#",
12
+ "10": "A",
13
+ "11": "A#",
14
+ "12": "B"
15
+ }
inference/data/prep_config.yaml ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ device_id: 3
2
+
3
+ is_split: True
4
+ segment_duration: 30
5
+
6
+
7
+ # --- DATASET --- #
8
+
9
+ # dataset:
10
+ # input_dir: '../dataset/jamendo/mp3'
11
+ # output_dir: '../dataset/jamendo/mert_30s'
12
+ # audio length : Full
13
+
14
+ # dataset:
15
+ # input_dir: '../dataset/dmdd/mp3'
16
+ # output_dir: '../dataset/dmdd/mert_30s'
17
+ # # audio length : ~30s
18
+
19
+ # dataset:
20
+ # input_dir: '../dataset/emomusic/mp3'
21
+ # output_dir: '../dataset/emomusic/mert_30s'
22
+ # # audio length : ~30s
23
+
24
+ # dataset:
25
+ # input_dir: '../dataset/pmemo/mp3'
26
+ # output_dir: '../dataset/pmemo/mert_30s'
27
+ # # audio length : ~30s
28
+
29
+
30
+ dataset:
31
+ input_dir: '../dataset/deam/mp3'
32
+ output_dir: '../dataset/deam/mert_30s'
33
+ # audio length : ~30s
34
+
35
+
36
+
37
+
38
+ # --- ENCODER --- #
39
+
40
+ model:
41
+ name: 'm-a-p/MERT-v1-95M'
42
+ sr: 24000
43
+
44
+ # model:
45
+ # name: 'music2latent'
46
+ # sr: 44100
47
+
48
+
49
+
50
+
51
+
inference/data/run_config.yaml ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ mp3:
2
+ song_hz: 22050
3
+ inst_len: 10.0
4
+ skip_interval: 5.0
5
+
6
+ feature:
7
+ n_bins: 144
8
+ bins_per_octave: 24
9
+ hop_length: 2048
10
+ #large_voca: False
11
+ large_voca: True
12
+
13
+ experiment:
14
+ learning_rate : 0.0001
15
+ weight_decay : 0.0
16
+ max_epoch : 100
17
+ batch_size : 128
18
+ save_step : 40
19
+ data_ratio : 0.8
20
+
21
+ model:
22
+ feature_size : 144
23
+ timestep : 108
24
+ #num_chords : 25
25
+ num_chords : 170
26
+ input_dropout : 0.2
27
+ layer_dropout : 0.2
28
+ attention_dropout : 0.2
29
+ relu_dropout : 0.2
30
+ num_layers : 8
31
+ num_heads : 4
32
+ hidden_size : 128
33
+ total_key_depth : 128
34
+ total_value_depth : 128
35
+ filter_size : 128
36
+ loss : 'ce'
37
+ probs_out : False
38
+
39
+ path:
40
+ ckpt_path : 'model'
41
+ result_path : 'result'
42
+ asset_path : '/data/music/chord_recognition/jayg996/assets'
43
+ root_path : '/data/music/chord_recognition'
inference/data/tag_list.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9510e22fca2ac817c8af9287f1fa40dbbbc10c489ead8d7bfc99191c0569d60d
3
+ size 22820
inference/data/test_config.yaml ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - base_config
3
+ - _self_
4
+
5
+ batch_size: 8
6
+ devices: [0]
7
+
8
+ trainer:
9
+ devices: ${devices}
10
+ accelerator: 'gpu'
11
+
12
+ # datasets:
13
+ # - jamendo
14
+ # - dmdd
15
+
16
+ checkpoint_latest: True
17
+
18
+ multitask: True
19
+ dataset_type: "va"
20
+ #'mood' or 'va'
21
+ ## If not True, then use following checkpoint.
22
+
23
+ checkpoint: "tb_logs/best/EJ.ckpt"
24
+
25
+ # checkpoint_J: "tb_logs/best/jamendo.ckpt"
26
+ # checkpoint_P: "tb_logs/best/pmemo.ckpt"
27
+ # checkpoint_E: "tb_logs/best/emomusic.ckpt"
28
+ # checkpoint_D: "tb_logs/best/deam.ckpt"
29
+
30
+ # checkpoint_PJ: "tb_logs/best/PJ.ckpt"
31
+ # checkpoint_EJ: "tb_logs/best/EJ.ckpt"
32
+ # checkpoint_DJ: "tb_logs/best/DJ.ckpt"
33
+
34
+ # checkpoint_JP: "tb_logs/best/JP.ckpt"
35
+ # checkpoint_JE: "tb_logs/best/JE.ckpt"
36
+ # checkpoint_JD: "tb_logs/best/JD.ckpt"
37
+
38
+ # checkpoint_ALL: "tb_logs/best/ALL.ckpt"
39
+
40
+
41
+ # checkpoint: "tb_logs/train_audio_classification/version_110/checkpoints/21-0.1202.ckpt"
inference/data/train_config.yaml ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ defaults:
2
+ - base_config
3
+ - _self_
4
+
5
+ devices: [0,1,2,3]
6
+ epochs: 500
7
+ batch_size: 8
8
+
9
+ monitor_metric: "val_loss"
10
+ monitor_metric_mood: "val_loss_mood"
11
+ monitor_metric_va: "val_loss_va"
12
+
13
+ checkpoint:
14
+ monitor: "${monitor_metric}"
15
+ filename: "{epoch:02d}-{${monitor_metric}:.4f}"
16
+ save_top_k: -1
17
+ mode: "min"
18
+ auto_insert_metric_name: False
19
+ save_last: True
20
+
21
+ checkpoint_mood:
22
+ monitor: "${monitor_metric_mood}"
23
+ filename: "mood-{epoch:02d}-{${monitor_metric_mood}:.4f}"
24
+ save_top_k: -1
25
+ mode: "min"
26
+ auto_insert_metric_name: False
27
+ save_last: True
28
+
29
+ checkpoint_va:
30
+ monitor: "${monitor_metric_va}"
31
+ filename: "va-{epoch:02d}-{${monitor_metric_va}:.4f}"
32
+ save_top_k: 5
33
+ mode: "min"
34
+ auto_insert_metric_name: False
35
+ save_last: True
36
+
37
+ earlystopping:
38
+ monitor: "${monitor_metric_mood}"
39
+ patience: 10
40
+ min_delta: 0.0001
41
+ mode: "min"
42
+
43
+ trainer:
44
+ devices: ${devices}
45
+ max_epochs: ${epochs}
46
+ accelerator: 'gpu'
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+ # strategy: 'ddp_find_unused_parameters_true'
55
+ # optimizer:
56
+ # _target_: torch.optim.AdamW
57
+ # _partial_: true
58
+ # lr: 1e-4
59
+ # weight_decay: 0.01
60
+ # scheduler:
61
+ # _target_: torch.optim.lr_scheduler.ReduceLROnPlateau
62
+ # _partial_: true
63
+ # cooldown: 5
64
+ # mode: max
65
+ # factor: 0.2
66
+ # patience: 10
67
+ # min_lr: 1.6e-7
68
+ # monitor_metric: "val_loss"
69
+ # # val_loss
70
+ # # val_loss_mood
71
+ # # val_loss_va
72
+ # checkpoint:
73
+ # monitor: "${monitor_metric}"
74
+ # filename: "{epoch:02d}-{${monitor_metric}:.4f}"
75
+ # save_top_k: 2
76
+ # mode: "min"
77
+ # auto_insert_metric_name: False
78
+ # save_last: True
79
+ # checkpoint:
80
+ # monitor: "val_loss_mood"
81
+ # filename: "{epoch:02d}-{val_loss_mood:.4f}"
82
+ # save_top_k: 2
83
+ # mode: "min"
84
+ # auto_insert_metric_name: False
85
+ # save_last: True
86
+ # earlystopping:
87
+ # monitor: 'val_loss_mood'
88
+ # patience: 10
89
+ # min_delta: 0.0001
90
+ # mode: "min"
91
+ # datasets:
92
+ # - jamendo
93
+ # - dmdd
94
+
inference/input/test.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22a56123f5adb9d061d4ab80a97aae12c84937d86a5042343c05e108b4e9fdda
3
+ size 8195178
inference/temp_out/.DS_Store ADDED
Binary file (6.15 kB). View file