kjysmu committed
Commit 4c53a91 · verified · Parent(s): 6ad6801

Upload 14 files

model/__init__.py ADDED
@@ -0,0 +1,3 @@
+ "Import all submodules"
+
+ # from model import
model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (171 Bytes)

model/__pycache__/crnn.cpython-310.pyc ADDED
Binary file (2.01 kB)

model/__pycache__/linear.cpython-310.pyc ADDED
Binary file (1.17 kB)

model/__pycache__/linear_mt.cpython-310.pyc ADDED
Binary file (989 Bytes)

model/__pycache__/linear_mt_multitask.cpython-310.pyc ADDED
Binary file (1.27 kB)

model/__pycache__/linear_multitask.cpython-310.pyc ADDED
Binary file (1.41 kB)

model/__pycache__/linear_small.cpython-310.pyc ADDED
Binary file (1.07 kB)

model/__pycache__/linear_small_multitask.cpython-310.pyc ADDED
Binary file (1.34 kB)

model/__pycache__/transformer.cpython-310.pyc ADDED
Binary file (1.62 kB)

model/__pycache__/transformer_multitask.cpython-310.pyc ADDED
Binary file (2.07 kB)
model/linear.py ADDED
@@ -0,0 +1,39 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import pytorch_lightning as pl
+ from sklearn import metrics
+ from transformers import AutoModelForAudioClassification
+ import numpy as np
+
+ class FeedforwardModel(nn.Module):
+     def __init__(self, input_size, output_size):
+         super(FeedforwardModel, self).__init__()
+         self.model = nn.Sequential(
+             nn.Linear(input_size, 1024),
+             nn.BatchNorm1d(1024),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+
+             nn.Linear(1024, 512),
+             nn.BatchNorm1d(512),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+
+             nn.Linear(512, 256),
+             nn.BatchNorm1d(256),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+
+             nn.Linear(256, 128),
+             nn.BatchNorm1d(128),
+             nn.ReLU(),
+             nn.Dropout(0.3),
+
+             nn.Linear(128, output_size),
+         )
+
+     def forward(self, x):
+         logit = self.model(x)
+         return logit
+
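For reference, a minimal usage sketch of the FeedforwardModel defined above (not part of the uploaded files). The feature dimension (768) and number of output classes (56) are illustrative assumptions, not values fixed by this commit.

# Minimal usage sketch, assuming a 768-dim pooled feature vector and 56 output classes.
import torch
from model.linear import FeedforwardModel

model = FeedforwardModel(input_size=768, output_size=56)
model.eval()  # BatchNorm1d layers require eval() (or batch size > 1) at inference

x = torch.randn(4, 768)        # (batch_size, input_size)
with torch.no_grad():
    logits = model(x)          # (batch_size, output_size), raw logits
print(logits.shape)            # torch.Size([4, 56])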
model/linear_attn_ck.py ADDED
@@ -0,0 +1,92 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import pytorch_lightning as pl
+ from sklearn import metrics
+ from transformers import AutoModelForAudioClassification
+ import numpy as np
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=100):
+         super().__init__()
+         self.encoding = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+         self.encoding[:, 0::2] = torch.sin(position * div_term)
+         self.encoding[:, 1::2] = torch.cos(position * div_term)
+         self.encoding = self.encoding.unsqueeze(0)  # Shape: (1, max_len, d_model)
+
+     def forward(self, x):
+         seq_len = x.size(1)
+         return x + self.encoding[:, :seq_len, :].to(x.device)
+
+ class FeedforwardModelAttnCK(nn.Module):
+     def __init__(self, input_size, output_size, nhead=8, num_layers=1, dropout_rate=0.1,
+                  num_key=2, num_chords=158, num_chords_root=14, num_chords_attr=14,
+                  key_emb_dim=4, chord_emb_dim=8, chord_root_emb_dim=4, chord_attr_emb_dim=4):
+         super().__init__()
+         self.d_model = 512
+
+         self.d_model_transformer = chord_root_emb_dim + chord_attr_emb_dim
+
+         # Embedding layers for chords and keys
+         self.chord_root_embedding = nn.Embedding(num_chords_root, chord_root_emb_dim)
+         self.chord_attr_embedding = nn.Embedding(num_chords_attr, chord_attr_emb_dim)
+
+         nn.init.xavier_uniform_(self.chord_root_embedding.weight)
+         nn.init.xavier_uniform_(self.chord_attr_embedding.weight)
+
+         # Positional encoding for chord progression
+         self.positional_encoding = PositionalEncoding(self.d_model_transformer)
+
+         # Transformer for chord progression modeling
+         self.chord_transformer = nn.TransformerEncoder(
+             nn.TransformerEncoderLayer(d_model=self.d_model_transformer, nhead=nhead, dim_feedforward=64, dropout=0.1, batch_first=True),
+             num_layers=2
+         )
+         # Input projection for latent features
+         self.input_proj = nn.Sequential(
+             nn.Linear(input_size + self.d_model_transformer + 1, self.d_model),
+             nn.ReLU(),
+         )
+
+         # Output projection
+         self.output_proj = nn.Sequential(
+             nn.Linear(self.d_model, 256),
+             nn.ReLU(),
+             nn.Linear(256, output_size),
+         )
+
+     def forward(self, model_input_dic):
+         x_mert = model_input_dic["x_mert"]
+         x_chord_root = model_input_dic["x_chord_root"]
+         x_chord_attr = model_input_dic["x_chord_attr"]
+         x_key = model_input_dic["x_key"]
+
+         key_embedding = x_key.float()
+
+         chord_root_embedding = self.chord_root_embedding(x_chord_root)  # Shape: (batch_size, seq_len, chord_root_emb_dim)
+         chord_attr_embedding = self.chord_attr_embedding(x_chord_attr)  # Shape: (batch_size, seq_len, chord_attr_emb_dim)
+
+         # Concatenate root and attribute embeddings
+         chord_combined_embedding = torch.cat(
+             (chord_root_embedding, chord_attr_embedding), dim=-1
+         )  # Shape: (batch_size, seq_len, chord_root_emb_dim + chord_attr_emb_dim)
+
+         # Positional encoding and chord transformer
+         chord_combined_embedding = self.positional_encoding(chord_combined_embedding)
+
+         cls_token = torch.zeros_like(chord_combined_embedding[:, :1, :])
+
+         chord_embedding_with_cls = torch.cat([cls_token, chord_combined_embedding], dim=1)  # Add CLS at the start
+         chord_embedding_transformed = self.chord_transformer(chord_embedding_with_cls)  # Shape: (batch_size, seq_len+1, d_model_transformer)
+
+         chord_embedding_cls = chord_embedding_transformed[:, 0, :]  # Shape: (batch_size, d_model_transformer)
+
+         # Combine all features
+         combined_features = torch.cat((x_mert, chord_embedding_cls, key_embedding), dim=1)
+         # Input projection
+         combined_features = self.input_proj(combined_features)  # Shape: (batch_size, d_model)
+
+         output = self.output_proj(combined_features)  # Shape: (batch_size, output_size)
+         return output
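A minimal usage sketch for FeedforwardModelAttnCK (not part of the uploaded files): the model pools the chord-progression transformer output through a zero-initialized CLS position and concatenates it with the audio features and the key flag. The audio-feature size (768), sequence length (16), and class count (56) below are assumptions for illustration.

# Minimal usage sketch, assuming 768-dim audio features, 16 chord steps, 56 classes.
import torch
from model.linear_attn_ck import FeedforwardModelAttnCK

model = FeedforwardModelAttnCK(input_size=768, output_size=56)
model.eval()

batch = {
    "x_mert": torch.randn(4, 768),                  # pooled audio features, (batch, input_size)
    "x_chord_root": torch.randint(0, 14, (4, 16)),  # chord-root indices, (batch, seq_len)
    "x_chord_attr": torch.randint(0, 14, (4, 16)),  # chord-attribute indices, (batch, seq_len)
    "x_key": torch.randint(0, 2, (4, 1)),           # key flag, cast to float inside forward
}
with torch.no_grad():
    logits = model(batch)                           # (batch, output_size)
print(logits.shape)                                 # torch.Size([4, 56])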
model/linear_mt_attn_ck.py ADDED
@@ -0,0 +1,100 @@
+ import os
+ import torch
+ import torch.nn as nn
+ import pytorch_lightning as pl
+ from sklearn import metrics
+ from transformers import AutoModelForAudioClassification
+ import numpy as np
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_len=100):
+         super().__init__()
+         self.encoding = torch.zeros(max_len, d_model)
+         position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
+         self.encoding[:, 0::2] = torch.sin(position * div_term)
+         self.encoding[:, 1::2] = torch.cos(position * div_term)
+         self.encoding = self.encoding.unsqueeze(0)  # Shape: (1, max_len, d_model)
+
+     def forward(self, x):
+         seq_len = x.size(1)
+         return x + self.encoding[:, :seq_len, :].to(x.device)
+
+ class FeedforwardModelMTAttnCK(nn.Module):
+     def __init__(self, input_size, output_size_classification, output_size_regression, nhead=8, num_layers=1, dropout_rate=0.1,
+                  num_key=2, num_chords=158, num_chords_root=14, num_chords_attr=14,
+                  key_emb_dim=4, chord_emb_dim=8, chord_root_emb_dim=4, chord_attr_emb_dim=4):
+         super().__init__()
+         self.d_model = 512
+
+         self.d_model_transformer = chord_root_emb_dim + chord_attr_emb_dim
+
+         # Embedding layers for chords and keys
+         self.chord_root_embedding = nn.Embedding(num_chords_root, chord_root_emb_dim)
+         self.chord_attr_embedding = nn.Embedding(num_chords_attr, chord_attr_emb_dim)
+
+         nn.init.xavier_uniform_(self.chord_root_embedding.weight)
+         nn.init.xavier_uniform_(self.chord_attr_embedding.weight)
+
+         # Positional encoding for chord progression
+         self.positional_encoding = PositionalEncoding(self.d_model_transformer)
+
+         # Transformer for chord progression modeling
+         self.chord_transformer = nn.TransformerEncoder(
+             nn.TransformerEncoderLayer(d_model=self.d_model_transformer, nhead=nhead, dim_feedforward=64, dropout=0.1, batch_first=True),
+             num_layers=2
+         )
+
+         self.input_proj = nn.Sequential(
+             nn.Linear(input_size + self.d_model_transformer + 1, self.d_model),
+             nn.ReLU(),
+         )
+
+         # Classification branch (e.g., Jamendo - mood classification with 56 classes)
+         self.classification_branch = nn.Sequential(
+             nn.Linear(self.d_model, 256),
+             nn.ReLU(),
+             nn.Linear(256, output_size_classification)  # Output: 56 classes
+         )
+
+         # Regression branch (e.g., DMDD - valence-arousal prediction, including std values)
+         self.regression_branch = nn.Sequential(
+             nn.Linear(self.d_model, 256),
+             nn.ReLU(),
+             nn.Linear(256, output_size_regression)  # Output: [mean, std] for valence-arousal
+         )
+
+
+     def forward(self, model_input_dic):
+         x_mert = model_input_dic["x_mert"]
+         x_chord_root = model_input_dic["x_chord_root"]
+         x_chord_attr = model_input_dic["x_chord_attr"]
+
+         x_key = model_input_dic["x_key"]
+         key_embedding = x_key.float()
+
+         chord_root_embedding = self.chord_root_embedding(x_chord_root)  # Shape: (batch_size, seq_len, chord_root_emb_dim)
+         chord_attr_embedding = self.chord_attr_embedding(x_chord_attr)  # Shape: (batch_size, seq_len, chord_attr_emb_dim)
+
+         # Concatenate root and attribute embeddings
+         chord_combined_embedding = torch.cat(
+             (chord_root_embedding, chord_attr_embedding), dim=-1
+         )  # Shape: (batch_size, seq_len, chord_root_emb_dim + chord_attr_emb_dim)
+
+         chord_combined_embedding = self.positional_encoding(chord_combined_embedding)
+         cls_token = torch.zeros_like(chord_combined_embedding[:, :1, :])
+
+         chord_embedding_with_cls = torch.cat([cls_token, chord_combined_embedding], dim=1)  # Add CLS at the start
+         chord_embedding_transformed = self.chord_transformer(chord_embedding_with_cls)  # Shape: (batch_size, seq_len+1, d_model_transformer)
+
+         chord_embedding_cls = chord_embedding_transformed[:, 0, :]  # Shape: (batch_size, d_model_transformer)
+
+         # Combine all features
+         combined_features = torch.cat((x_mert, chord_embedding_cls, key_embedding), dim=1)
+         # Input projection
+         combined_features = self.input_proj(combined_features)  # Shape: (batch_size, d_model)
+
+         classification_output = self.classification_branch(combined_features)
+         regression_output = self.regression_branch(combined_features)
+
+         return classification_output, regression_output
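Finally, a minimal usage sketch for the multitask variant FeedforwardModelMTAttnCK (not part of the uploaded files), which shares the chord/key encoder and input projection and returns one output per head. The feature size (768), sequence length (16), and head sizes below are illustrative assumptions; the 56-class mood head and valence-arousal [mean, std] regression head follow the in-code comments.

# Minimal usage sketch, assuming 768-dim audio features and two task heads.
import torch
from model.linear_mt_attn_ck import FeedforwardModelMTAttnCK

model = FeedforwardModelMTAttnCK(
    input_size=768,
    output_size_classification=56,  # e.g., Jamendo mood tags
    output_size_regression=4,       # e.g., valence/arousal means and stds
)
model.eval()

batch = {
    "x_mert": torch.randn(4, 768),
    "x_chord_root": torch.randint(0, 14, (4, 16)),
    "x_chord_attr": torch.randint(0, 14, (4, 16)),
    "x_key": torch.randint(0, 2, (4, 1)),
}
with torch.no_grad():
    cls_out, reg_out = model(batch)  # shapes (4, 56) and (4, 4)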