jamimulgrave committed
Commit c961996 · 1 Parent(s): 7b6887d

Upload 10 files

code/LyricsCommentData.py ADDED
@@ -0,0 +1,16 @@
+from dataclasses import dataclass
+import os
+
+
+@dataclass
+class LyricsCommentData(object):
+    music4all_id: str
+    songmeanings_id: str
+    lyrics: str
+    comment: str
+
+    def get_audio_path(self):  # get audio path from id
+        self.audio_path = os.path.join("Music4All/music4all/audios",
+                                       self.music4all_id + '.mp3'
+                                       )
+        return self.audio_path
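For reference, a minimal usage sketch of the dataclass above (the id values are hypothetical placeholders; get_audio_path only joins the path and does not check that the mp3 exists):

    from LyricsCommentData import LyricsCommentData

    record = LyricsCommentData(music4all_id="0a1b2c3d",
                               songmeanings_id="107822",
                               lyrics="example lyrics ...",
                               comment="example listener comment ...")
    print(record.get_audio_path())  # Music4All/music4all/audios/0a1b2c3d.mp3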
code/attention_modules.py ADDED
@@ -0,0 +1,267 @@
+# coding: utf-8
+# Code adopted from https://github.com/huggingface/pytorch-pretrained-BERT
+
+import math
+import copy
+import torch
+import torch.nn as nn
+import numpy as np
+
+
+# Gelu
+def gelu(x):
+    """Implementation of the gelu activation function.
+    For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    Also see https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+# LayerNorm
+try:
+    from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
+except ImportError:
+    # print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.")
+    class BertLayerNorm(nn.Module):
+        def __init__(self, hidden_size, eps=1e-12):
+            """Construct a layernorm module in the TF style (epsilon inside the square root).
+            """
+            super(BertLayerNorm, self).__init__()
+            self.weight = nn.Parameter(torch.ones(hidden_size))
+            self.bias = nn.Parameter(torch.zeros(hidden_size))
+            self.variance_epsilon = eps
+
+        def forward(self, x):
+            u = x.mean(-1, keepdim=True)
+            s = (x - u).pow(2).mean(-1, keepdim=True)
+            x = (x - u) / torch.sqrt(s + self.variance_epsilon)
+            return self.weight * x + self.bias
+
+
+class BertConfig(object):
+    def __init__(self,
+                 vocab_size,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 attention_probs_dropout_prob=0.1,
+                 type_vocab_size=2):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.type_vocab_size = type_vocab_size
+
+
+class BertSelfAttention(nn.Module):
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # Apply the attention mask (precomputed for all layers in BertModel forward() function)
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+        return context_layer
+
+
+class BertSelfOutput(nn.Module):
+    def __init__(self, config):
+        super(BertSelfOutput, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertAttention(nn.Module):
+    def __init__(self, config):
+        super(BertAttention, self).__init__()
+        self.self = BertSelfAttention(config)
+        self.output = BertSelfOutput(config)
+
+    def forward(self, input_tensor, attention_mask):
+        self_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(self_output, input_tensor)
+        return attention_output
+
+
+class BertIntermediate(nn.Module):
+    def __init__(self, config):
+        super(BertIntermediate, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.intermediate_act_fn = gelu
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class BertOutput(nn.Module):
+    def __init__(self, config):
+        super(BertOutput, self).__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BertLayer(nn.Module):
+    def __init__(self, config):
+        super(BertLayer, self).__init__()
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
+
+    def forward(self, hidden_states, attention_mask):
+        attention_output = self.attention(hidden_states, attention_mask)
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class BertEncoder(nn.Module):
+    def __init__(self, config):
+        super(BertEncoder, self).__init__()
+        layer = BertLayer(config)
+        self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)])
+
+    def forward(self, hidden_states, attention_mask=None, output_all_encoded_layers=True):
+        all_encoder_layers = []
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states, attention_mask)
+            if output_all_encoded_layers:
+                all_encoder_layers.append(hidden_states)
+        if not output_all_encoded_layers:
+            all_encoder_layers.append(hidden_states)
+        return all_encoder_layers
+
+
+class BertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings.
+    """
+
+    def __init__(self, config):
+        super(BertEmbeddings, self).__init__()
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, input_ids, token_type_ids=None):
+        seq_length = input_ids.size(1)
+        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+        position_ids = position_ids.unsqueeze(0).expand_as(input_ids[:, :, 0])
+
+        position_embeddings = self.position_embeddings(position_ids)
+
+        embeddings = input_ids + position_embeddings
+        # embeddings = input_ids
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class PositionalEncoding(nn.Module):
+    def __init__(self, config):
+        super(PositionalEncoding, self).__init__()
+        emb_dim = config.hidden_size
+        max_len = config.max_position_embeddings
+        self.position_enc = self.position_encoding_init(max_len, emb_dim)
+
+    @staticmethod
+    def position_encoding_init(n_position, emb_dim):
+        ''' Init the sinusoid position encoding table '''
+
+        # keep dim 0 for padding token position encoding zero vector
+        position_enc = np.array([
+            [pos / np.power(10000, 2 * (j // 2) / emb_dim) for j in range(emb_dim)]
+            if pos != 0 else np.zeros(emb_dim) for pos in range(n_position)])
+
+        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # apply sin on 0th,2nd,4th...emb_dim
+        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # apply cos on 1st,3rd,5th...emb_dim
+        return torch.from_numpy(position_enc).type(torch.FloatTensor)
+
+    def forward(self, word_seq):
+        position_encoding = self.position_enc.unsqueeze(0).expand_as(word_seq)
+        position_encoding = position_encoding.to(word_seq.device)
+        word_pos_encoded = word_seq + position_encoding
+        return word_pos_encoded
+
+
+class BertPooler(nn.Module):
+    def __init__(self, config):
+        super(BertPooler, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
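For orientation, a minimal sketch of how these modules compose; the hidden size, batch size, and sequence length below are arbitrary illustrative values, and the input is a pre-embedded feature sequence rather than token ids:

    import torch
    from attention_modules import BertConfig, BertEncoder, BertPooler

    config = BertConfig(vocab_size=50265, hidden_size=768,
                        num_hidden_layers=2, num_attention_heads=12)
    encoder = BertEncoder(config)
    pooler = BertPooler(config)

    features = torch.randn(4, 16, 768)            # (batch, seq_len, hidden_size)
    layer_outputs = encoder(features)             # one tensor per layer by default
    pooled = pooler(layer_outputs[-1])            # summary of the first position
    print(layer_outputs[-1].shape, pooled.shape)  # (4, 16, 768) and (4, 768)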
code/data.py ADDED
@@ -0,0 +1,297 @@
+import sys
+sys.path.append('..')
+
+from torch.utils.data import Dataset
+import pickle
+import random
+from . import LyricsCommentData
+
+class LyricsCommentsDataset(Dataset):
+
+    def __init__(self, random=False):
+        super(LyricsCommentsDataset, self).__init__()
+        self.random = random
+        with open("dataset.pkl", "rb") as f:
+            self.data = pickle.load(f)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        lyrics = self.data[item].lyrics
+        # if random:
+        #     comment = random.choice(self.data[item].comments)
+        # else:
+        comment = self.data[item].comments[0]
+        # the longest?
+        for i, (tmp_item, _) in enumerate(self.data[item].comments):
+            if len(tmp_item) > len(comment[0]):
+                comment = self.data[item].comments[i]
+
+        comment = comment[0]  # keep comments w/o rating
+
+        return [lyrics, comment]
+
+
+class LyricsCommentsDatasetClean(Dataset):
+
+    def __init__(self, random=False):
+        super(LyricsCommentsDatasetClean, self).__init__()
+        self.random = random
+        with open("cleaned_dataset.pkl", "rb") as f:
+            self.data = pickle.load(f)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        lyrics = self.data[item].lyrics
+        comment = self.data[item].comment
+
+        return [lyrics, comment]
+
+
+class LyricsCommentsDatasetPsuedo(Dataset):
+
+    def __init__(self, dataset_path, random=False):
+        super(LyricsCommentsDatasetPsuedo, self).__init__()
+        self.random = random
+        with open(dataset_path, "rb") as f:
+            self.data = pickle.load(f)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        lyrics = self.data[item].lyrics.replace('\n', ';')
+        comment = self.data[item].comment
+
+        return [lyrics, comment]
+
+
+class LyricsCommentsDatasetPsuedo_fusion(Dataset):
+
+    def __init__(self, dataset_path):
+        super(LyricsCommentsDatasetPsuedo_fusion, self).__init__()
+        with open(dataset_path, "rb") as f:
+            self.data = pickle.load(f)
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, item):
+        lyrics = self.data[item].lyrics.replace('\n', ';')
+        comment = self.data[item].comment
+        music_id = self.data[item].music4all_id
+
+        return [lyrics, comment, music_id]
+
+
+from torch.utils.data import Dataset, DataLoader
+import torch
+from MusicData import MusicData
+import csv
+import os
+from pydub import AudioSegment
+import matplotlib.pyplot as plt
+from scipy.io import wavfile
+from tempfile import mktemp
+from scipy import signal
+import numpy as np
+import torchaudio
+import transformers
+import nltk
+
+
+class Music4AllDataset(Dataset):
+    def __init__(self,
+                 mel_bins,
+                 audio_length,
+                 pad_length,
+                 tag_file_path=r"Music4All/music4all/id_genres.csv",
+                 augment=True):
+        self.tag_file_path = tag_file_path
+        self.allow_cache = True
+        self.mel_bins = mel_bins
+        self.audio_length = audio_length
+        self.pad_length = pad_length
+        self.augment = augment
+        # read all tags
+        tags_file = open(tag_file_path, 'r', encoding='utf-8')
+        self.tags_reader = list(csv.reader(tags_file, delimiter='\t'))[1:]
+        tags_file.close()
+        if self.augment:
+            self.data_augmentation()
+
+    def data_augmentation(self):
+        pass
+
+    def __len__(self):
+        return len(self.tags_reader)
+
+    def __getitem__(self, item):
+        """
+
+        :param item: index
+        :return: tags and mel-spectrogram.
+        """
+        id = self.tags_reader[item][0]
+        tags = self.tags_reader[item][1]  # .split(',')
+
+        # pad tags
+        # if len(tags) >= self.pad_length:
+        #     tags = tags[:self.pad_length]
+        # else:
+        #     for i in range(self.pad_length - len(tags)):
+        #         tags.append("[PAD]")
+
+        spec_path = os.path.join("Music4All/temp_data/specs/data_cache/", id + ".npy")
+        exist_cache = os.path.isfile(spec_path)
+        # search cache
+        # if exist cache, load
+        if self.allow_cache and exist_cache:
+            spectrogram = torch.Tensor(np.load(spec_path))
+        # if does not exist, calculate and save
+        else:
+            audio_path = os.path.join("Music4All/music4all/audios",
+                                      id + '.mp3'
+                                      )
+            (data, sample_rate) = torchaudio.backend.sox_io_backend.load(audio_path)
+            spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=self.mel_bins,
+                                                               n_fft=512,
+                                                               sample_rate=sample_rate,
+                                                               f_max=8000.0,
+                                                               f_min=0.0,
+                                                               )(torch.Tensor(data))
+            # TODO: There is a huge bug!
+            # cut length
+            if self.audio_length is not None:
+                spectrogram = spectrogram[:, :, :self.audio_length]
+            # to mono
+            spectrogram = spectrogram[0, :, :].unsqueeze(0)
+
+            if self.allow_cache:
+                np.save(spec_path, spectrogram.numpy())
+
+        return tags, spectrogram
+
+
+class MusCapsDataset(Dataset):
+    def __init__(self,
+                 mel_bins,
+                 audio_length,
+                 pad_length,
+                 tag_file_path=r"Music4All/music4all/id_genres.csv",
+                 augment=True):
+        self.tag_file_path = tag_file_path
+        self.allow_cache = True
+        self.mel_bins = mel_bins
+        self.audio_length = audio_length
+        self.pad_length = pad_length
+        self.augment = augment
+        # read all tags
+        tags_file = open(tag_file_path, 'r', encoding='utf-8')
+        self.tags_reader = list(csv.reader(tags_file, delimiter='\t'))[1:]
+        tags_file.close()
+        if self.augment:
+            self.data_augmentation()
+
+    def data_augmentation(self):
+        pass
+
+    def __len__(self):
+        return len(self.tags_reader)
+
+    def __getitem__(self, item):
+        """
+
+        :param item: index
+        :return: tags and mel-spectrogram.
+        """
+        id = self.tags_reader[item][0]
+        tags = self.tags_reader[item][1]  # .split(',')
+
+        # pad tags
+        # if len(tags) >= self.pad_length:
+        #     tags = tags[:self.pad_length]
+        # else:
+        #     for i in range(self.pad_length - len(tags)):
+        #         tags.append("[PAD]")
+
+        spec_path = os.path.join("Music4All/temp_data/specs/data_cache/", id + ".npy")
+        exist_cache = os.path.isfile(spec_path)
+        # search cache
+        # if exist cache, load
+        if self.allow_cache and exist_cache:
+            spectrogram = torch.Tensor(np.load(spec_path))
+        # if does not exist, calculate and save
+        else:
+            audio_path = os.path.join("Music4All/music4all/audios",
+                                      id + '.mp3'
+                                      )
+            (data, sample_rate) = torchaudio.backend.sox_io_backend.load(audio_path)
+            spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=self.mel_bins,
+                                                               n_fft=512,
+                                                               sample_rate=sample_rate,
+                                                               f_max=8000.0,
+                                                               f_min=0.0,
+                                                               )(torch.Tensor(data))
+            # cut length
+            if self.audio_length is not None:
+                spectrogram = spectrogram[:, :, :self.audio_length]
+            # to mono
+            spectrogram = spectrogram[0, :, :].unsqueeze(0)
+            np.save(spec_path, spectrogram.numpy())
+
+        return tags, spectrogram
+
+class GTZANDataset(Dataset):
+    def __init__(self, raw_dataset, is_augment=True, window=1366):
+        self.raw = raw_dataset
+        self.data = list()
+        self.mel_bins = 96
+        self.gtzan_genres = [
+            "blues",
+            "classical",
+            "country",
+            "disco",
+            "hiphop",
+            "jazz",
+            "metal",
+            "pop",
+            "reggae",
+            "rock",
+        ]
+        self.is_augment = is_augment
+        self.window = window
+        self.init()
+
+    def init(self):
+        for i, (waveform, sample_rate, label) in enumerate(self.raw):
+            spectrogram = torchaudio.transforms.MelSpectrogram(n_mels=self.mel_bins)(torch.Tensor(waveform))
+            if self.is_augment:
+                self.augment(spectrogram, label)
+            else:
+                self.data.append((spectrogram[:, :, :self.window], label))
+
+    def augment(self, spectrogram, label):
+        length = spectrogram.shape[-1]  # length
+        # augment audio with sliding window
+        hop_length = 250
+        slices = (length - self.window) // hop_length
+        for i in range(slices):
+            self.data.append((spectrogram[:, :, i * hop_length:self.window + i * hop_length], label))
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, index):
+        spectrogram, label = self.data[index]
+        label = self.gtzan_genres.index(label)
+        return spectrogram, label
+
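A minimal sketch of how the pseudo-comment datasets above are typically consumed; the pickle path matches the one used in eval.py and is assumed to contain records with lyrics, comment, and music4all_id fields:

    from torch.utils.data import DataLoader
    from data import LyricsCommentsDatasetPsuedo_fusion

    dataset = LyricsCommentsDatasetPsuedo_fusion("dataset_test.pkl")
    loader = DataLoader(dataset, batch_size=8, shuffle=True)
    for lyrics, comment, music_id in loader:  # each is a batch of 8 strings
        print(len(lyrics), music_id[0])
        break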
code/eval.py ADDED
@@ -0,0 +1,87 @@
+import torch
+from data import LyricsCommentsDatasetPsuedo_fusion
+from torch import utils, nn
+from model import CommentGenerator
+from model_fusion import CommentGenerator_fusion
+import transformers
+import datasets
+from tqdm import tqdm
+import statistics
+import os
+DATASET_PATH = "dataset_test.pkl"
+MODEL_PATH = "model/bart_fusion_full.pt"
+# MODEL_NAME = "bart"
+
+os.environ["CUDA_VISIBLE_DEVICES"] = "4"
+
+test_dataset = LyricsCommentsDatasetPsuedo_fusion(DATASET_PATH)
+dataset_length = len(test_dataset)
+
+test_dataloader = utils.data.DataLoader(test_dataset,
+                                        # batch_size=len(valid_dataset),
+                                        batch_size=32,
+                                        shuffle=False)
+
+if 'baseline' in MODEL_PATH:
+    model = CommentGenerator().cuda()
+else:
+    model = CommentGenerator_fusion().cuda()
+model.load_state_dict(torch.load(MODEL_PATH))
+
+model.eval()
+
+samples_list = list()
+# generate
+for batch_index, [lyrics, comment, music_id] in enumerate(tqdm(test_dataloader)):
+    if 'baseline' in MODEL_PATH:
+        with torch.no_grad():
+            output_samples = model.generate(lyrics)
+    else:
+        with torch.no_grad():
+            output_samples = model.generate(lyrics, music_id)
+    samples_list.append(output_samples)
+
+# ------ ROUGE ------ #
+
+metrics = datasets.load_metric('rouge')  # , 'sacrebleu', 'meteor', 'bertscore'
+
+for batch_index, [lyrics, comment, music_id] in enumerate(tqdm(test_dataloader)):
+    output_samples = samples_list[batch_index]
+    metrics.add_batch(predictions=output_samples, references=comment)
+
+score = metrics.compute()
+print(score)
+
+# ------ BLEU ------ #
+
+metrics = datasets.load_metric('sacrebleu')  # , 'sacrebleu', 'meteor', 'bertscore'
+
+for batch_index, [lyrics, comment, music_id] in enumerate(tqdm(test_dataloader)):
+    output_samples = samples_list[batch_index]
+    metrics.add_batch(predictions=output_samples, references=[[i] for i in comment])
+
+score = metrics.compute()
+print(score)
+
+# ------ BERTScore ------ #
+
+metrics = datasets.load_metric('bertscore')  # , 'sacrebleu', 'meteor', 'bertscore'
+
+for batch_index, [lyrics, comment, music_id] in enumerate(tqdm(test_dataloader)):
+    output_samples = samples_list[batch_index]
+    metrics.add_batch(predictions=output_samples, references=[[i] for i in comment])
+
+score = metrics.compute(lang='en')
+score = statistics.mean(score['f1'])
+print(score)
+
+# ------ METEOR ------ #
+
+metrics = datasets.load_metric('meteor')  # , 'sacrebleu', 'meteor', 'bertscore'
+
+for batch_index, [lyrics, comment, music_id] in enumerate(tqdm(test_dataloader)):
+    output_samples = samples_list[batch_index]
+    metrics.add_batch(predictions=output_samples, references=[[i] for i in comment])
+
+score = metrics.compute()
+print(score)
code/model.py ADDED
@@ -0,0 +1,56 @@
+import torch
+from torch import nn
+from transformers import BartTokenizer, BartForConditionalGeneration
+
+
+class CommentGenerator(nn.Module):
+    def __init__(self):
+        super(CommentGenerator, self).__init__()
+        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        self.bart = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
+        # self.bart_config = BartConfig()
+        self.condition = None
+
+    def forward(self, input_sentence_list, labels=None):
+        encoded_input = self.tokenizer(
+            input_sentence_list,
+            padding=True,
+            truncation=True,
+            max_length=512,
+            return_tensors='pt',
+        )
+        if labels is not None:
+            labels = self.tokenizer(
+                labels,
+                padding=True,
+                truncation=True,
+                max_length=512,
+                return_tensors='pt',
+            )
+        output = self.bart(input_ids=encoded_input['input_ids'].cuda(),
+                           attention_mask=encoded_input['attention_mask'].cuda(),
+                           # guard against labels=None so forward() also works without targets
+                           labels=labels['input_ids'].cuda() if labels is not None else None,
+                           )
+        return output
+
+    def generate(self, input_sentence_list, is_cuda=True):
+        encoded_input = self.tokenizer(input_sentence_list,
+                                       padding=True,
+                                       truncation=True,
+                                       return_tensors='pt',
+                                       )
+        output_ids = self.bart.generate(encoded_input['input_ids'].cuda(),
+                                        num_beams=4,
+                                        max_length=512,
+                                        early_stopping=True,
+                                        do_sample=True)
+        return ([self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+                 for g in output_ids])
+
+# tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+# encoded_input = tokenizer(['Hello all', 'Hi all'], return_tensors='pt')
+# print(encoded_input)
+
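A minimal generation sketch for the baseline model above, assuming a CUDA device is available (the class moves input tensors to GPU internally) and that facebook/bart-base can be downloaded; the lyrics string is an arbitrary placeholder:

    from model import CommentGenerator

    generator = CommentGenerator().cuda()
    lyrics = "I walk a lonely road; the only one that I have ever known"
    comments = generator.generate([lyrics])  # one generated comment per input string
    print(comments[0])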
code/model_fusion.py ADDED
@@ -0,0 +1,69 @@
+import torch
+from torch import nn
+from transformers import BartTokenizer
+from modeling_bart import BartForMultimodalGeneration
+from music_encoder import CNNSA
+
+
+
+class CommentGenerator_fusion(nn.Module):
+    def __init__(self):
+        super(CommentGenerator_fusion, self).__init__()
+        self.tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+        model_path = "best_model.pth"
+        self.music_encoder = CNNSA().cuda()
+        self.music_encoder.load_state_dict(torch.load(model_path))
+        # trial: fix music encoder's params
+        for params in self.music_encoder.parameters():
+            params.requires_grad = False
+
+        self.bart = BartForMultimodalGeneration.from_pretrained("facebook/bart-base",
+                                                                fusion_layers=[4, 5],  # [4,5]
+                                                                use_forget_gate=False,  # [True]
+                                                                dim_common=768,  # 256
+                                                                n_attn_heads=1).cuda()
+
+    def forward(self, input_sentence_list, music_ids, labels=None):
+        encoded_input = self.tokenizer(
+            input_sentence_list,
+            padding=True,
+            truncation=True,
+            max_length=512,
+            return_tensors='pt',
+        )
+        if labels is not None:
+            labels = self.tokenizer(
+                labels,
+                padding=True,
+                truncation=True,
+                max_length=512,
+                return_tensors='pt',
+            )
+        music_features = self.music_encoder(music_ids)
+        output = self.bart(input_ids=encoded_input['input_ids'].cuda(),
+                           attention_mask=encoded_input['attention_mask'].cuda(),
+                           # guard against labels=None so forward() also works without targets
+                           labels=labels['input_ids'].cuda() if labels is not None else None,
+                           music_features=music_features
+                           )
+        return output
+
+    def generate(self, input_sentence_list, music_ids, is_cuda=True):
+        encoded_input = self.tokenizer(input_sentence_list,
+                                       padding=True,
+                                       truncation=True,
+                                       return_tensors='pt',
+                                       )
+        music_features = self.music_encoder(music_ids)
+        output_ids = self.bart.generate(encoded_input['input_ids'].cuda(),
+                                        num_beams=5,
+                                        max_length=512,
+                                        early_stopping=True,
+                                        do_sample=True,
+                                        music_features=music_features)
+        return ([self.tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
+                 for g in output_ids])
+
+# tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+# encoded_input = tokenizer(['Hello all', 'Hi all'], return_tensors='pt')
+# print(encoded_input)
code/modeling_bart.py ADDED
@@ -0,0 +1,1483 @@
+# coding=utf-8
+
+# Revised by anonymous.
+
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch BART model. """
+import copy
+import math
+import random
+import warnings
+from typing import Optional, Tuple
+import numpy as np
+
+import torch.nn.functional as F
+import torch
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from transformers.activations import ACT2FN
+from transformers.file_utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+from transformers.modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+    Seq2SeqQuestionAnsweringModelOutput,
+    Seq2SeqSequenceClassifierOutput,
+)
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import logging
+from transformers.models.bart.configuration_bart import BartConfig
+
+from music_encoder import CNNSA
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "facebook/bart-large"
+_CONFIG_FOR_DOC = "BartConfig"
+_TOKENIZER_FOR_DOC = "BartTokenizer"
+
+
+BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "facebook/bart-large",
+    # See all BART models at https://huggingface.co/models?filter=bart
+]
+
+
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz, tgt_len = input_ids_shape
+    mask = torch.full((tgt_len, tgt_len), float("-inf"))
+    mask_cond = torch.arange(mask.size(-1))
+    mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
+    mask = mask.to(dtype)
+
+    if past_key_values_length > 0:
+        mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype), mask], dim=-1)
+    return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length)
+
+
+def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    bsz, src_len = mask.size()
+    tgt_len = tgt_len if tgt_len is not None else src_len
+
+    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
+
+    inverted_mask = 1.0 - expanded_mask
+
+    return inverted_mask.masked_fill(inverted_mask.bool(), torch.finfo(dtype).min)
+
+
+class BartLearnedPositionalEmbedding(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int):
+        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
+        # and adjust num_embeddings appropriately. Other models don't have this hack
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim)
+
+    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
+        """`input_ids_shape` is expected to be [bsz x seqlen]."""
+        bsz, seq_len = input_ids_shape[:2]
+        positions = torch.arange(
+            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+        )
+        return super().forward(positions + self.offset)
+
+
+class BartAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim ** -0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned aross GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class BartEncoderLayer(nn.Module):
+    def __init__(self, config: BartConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = BartAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape *(seq_len, batch, embed_dim)*
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                *(encoder_attention_heads,)*.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if hidden_states.dtype == torch.float16 and (
+            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
+        ):
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class BartDecoderLayer(nn.Module):
+    def __init__(self, config: BartConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = BartAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = BartAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`): cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                *(encoder_attention_heads,)*.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size *(decoder_attention_heads,)*.
+            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+class BartClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(
+        self,
+        input_dim: int,
+        inner_dim: int,
+        num_classes: int,
+        pooler_dropout: float,
+    ):
+        super().__init__()
+        self.dense = nn.Linear(input_dim, inner_dim)
+        self.dropout = nn.Dropout(p=pooler_dropout)
+        self.out_proj = nn.Linear(inner_dim, num_classes)
+
+    def forward(self, hidden_states: torch.Tensor):
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.dense(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.out_proj(hidden_states)
+        return hidden_states
+
+
+class BartPretrainedModel(PreTrainedModel):
+    config_class = BartConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _keys_to_ignore_on_load_unexpected = [r"encoder\.version", r"decoder\.version"]
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (BartDecoder, BartEncoder)):
+            module.gradient_checkpointing = value
+
+    @property
+    def dummy_inputs(self):
+        pad_token = self.config.pad_token_id
+        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
+        dummy_inputs = {
+            "attention_mask": input_ids.ne(pad_token),
+            "input_ids": input_ids,
+        }
+        return dummy_inputs
+
+
+class PretrainedBartModel(BartPretrainedModel):
+    def __init_subclass__(self):
+        warnings.warn(
+            "The class `PretrainedBartModel` has been depreciated, please use `BartPretrainedModel` instead.",
+            FutureWarning,
+        )
+
+
+BART_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic
+    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
+    pruning heads etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)
+    subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
+    general usage and behavior.
+
+    Parameters:
+        config ([`BartConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+BART_GENERATION_EXAMPLE = r"""
+    Summarization example::
+
+        >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
+
+        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+
+        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
+
+        >>> # Generate Summary
+        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
+        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+
+    Mask filling example::
+
+        >>> from transformers import BartTokenizer, BartForConditionalGeneration
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+        >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+        >>> logits = model(input_ids).logits
+
+        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+        >>> probs = logits[0, masked_index].softmax(dim=0)
+        >>> values, predictions = probs.topk(5)
+
+        >>> tokenizer.decode(predictions).split()
+"""
+
+BART_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`BartTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`BartTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
+            details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            For translation and summarization training, `decoder_input_ids` should be provided. If no
+            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to
+            the right for denoising pre-training following the paper.
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will
+            also be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_inputs`] and
+            modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+            `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`,
+            *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+            cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors
+            of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+            shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
643
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
644
+
645
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
646
+ (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
647
+ instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated
648
+ vectors than the model's internal embedding lookup matrix.
649
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
650
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
651
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds`
652
+ have to be input (see `past_key_values`). This is useful if you want more control over how to convert
653
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
654
+
655
+ If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds`
656
+ takes the value of `inputs_embeds`.
657
+ use_cache (`bool`, *optional*):
658
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up
659
+ decoding (see `past_key_values`).
660
+ output_attentions (`bool`, *optional*):
661
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
662
+ tensors for more detail.
663
+ output_hidden_states (`bool`, *optional*):
664
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
665
+ more detail.
666
+ return_dict (`bool`, *optional*):
667
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
668
+ """
669
+
670
+
671
+ class BartEncoder(BartPretrainedModel):
672
+ """
673
+ Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
674
+ [`BartEncoderLayer`].
675
+
676
+ Args:
677
+ config: BartConfig
678
+ embed_tokens (nn.Embedding): output embedding
679
+ """
680
+
681
+ def __init__(self, config: BartConfig,
682
+ embed_tokens: Optional[nn.Embedding] = None,
683
+ fusion_layers=[5], # 5 is the last layer
684
+ use_forget_gate=True,
685
+ dim_common=256,
686
+ n_attn_heads=1):
687
+ super().__init__(config)
688
+
689
+ self.dropout = config.dropout
690
+ self.layerdrop = config.encoder_layerdrop
691
+
692
+ embed_dim = config.d_model
693
+ self.padding_idx = config.pad_token_id
694
+ self.max_source_positions = config.max_position_embeddings
695
+ self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
696
+
697
+ if embed_tokens is not None:
698
+ self.embed_tokens = embed_tokens
699
+ else:
700
+ self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
701
+
702
+ self.embed_positions = BartLearnedPositionalEmbedding(
703
+ config.max_position_embeddings,
704
+ embed_dim,
705
+ )
706
+ self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
707
+ self.layernorm_embedding = nn.LayerNorm(embed_dim)
708
+
709
+ self.gradient_checkpointing = False
710
+
711
+ # ==================== Modification Starts ====================
712
+ # 1. params and variables
713
+ self.use_forget_gate = use_forget_gate
714
+ self.fusion_layers = fusion_layers
715
+ music_feature_dim = 256
716
+ text_feature_dim = embed_dim # 768
717
+
718
+ # 2. define attention
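+ # Shape conventions for the fusion attention defined below (batch-first): music features are
+ # assumed to arrive as (batch, S_v, 256) and text hidden states as (batch, S_t, 768);
+ # _linear_1/_linear_2 project music to K/V in dim_common, _linear_3 projects text to Q in
+ # dim_common, and the optional forget gate `fg` maps [attended music ; text] to a dim_common gate.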
719
+ self._linear_1 = nn.Linear(music_feature_dim, dim_common) # K
720
+ self._linear_2 = nn.Linear(music_feature_dim, dim_common) # V
721
+ self._linear_3 = nn.Linear(text_feature_dim, dim_common) # Q
722
+ self._multi_head_attn = nn.MultiheadAttention(dim_common, n_attn_heads)
723
+ self._linear_4 = nn.Linear(text_feature_dim + dim_common, text_feature_dim)  # TODO: currently unused -- the concat-and-project path is commented out in forward() in favour of a scaled residual
724
+ if use_forget_gate:
725
+ self.fg = nn.Linear(dim_common + text_feature_dim, dim_common)
726
+
727
+ # ==================== Modification Ends ====================
728
+ self.final_layer_norm = nn.LayerNorm(embed_dim)
729
+ self.sigmoid = nn.Sigmoid()
730
+
731
+ # Initialize weights and apply final processing
732
+ self.post_init()
733
+
734
+ def get_input_embeddings(self):
735
+ return self.embed_tokens
736
+
737
+ def set_input_embeddings(self, value):
738
+ self.embed_tokens = value
739
+
740
+ def forward(
741
+ self,
742
+ input_ids=None,
743
+ attention_mask=None,
744
+ head_mask=None,
745
+ inputs_embeds=None,
746
+ output_attentions=None,
747
+ output_hidden_states=None,
748
+ return_dict=None,
749
+ music_features=None,
750
+ music_len=None
751
+ ):
752
+ r"""
753
+ Args:
754
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
755
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
756
+ provide it.
757
+
758
+ Indices can be obtained using [`BartTokenizer`]. See
759
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
760
+ for details.
761
+
762
+ [What are input IDs?](../glossary#input-ids)
763
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
764
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
765
+
766
+ - 1 for tokens that are **not masked**,
767
+ - 0 for tokens that are **masked**.
768
+
769
+ [What are attention masks?](../glossary#attention-mask)
770
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
771
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
772
+
773
+ - 1 indicates the head is **not masked**,
774
+ - 0 indicates the head is **masked**.
775
+
776
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
777
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
778
+ representation. This is useful if you want more control over how to convert `input_ids` indices
779
+ into associated vectors than the model's internal embedding lookup matrix.
780
+ output_attentions (`bool`, *optional*):
781
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
782
+ returned tensors for more detail.
783
+ output_hidden_states (`bool`, *optional*):
784
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
785
+ for more detail.
786
+ return_dict (`bool`, *optional*):
787
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
788
+ """
789
+
790
+ # ==================== Modification Starts ====================
791
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
792
+ output_hidden_states = (
793
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
794
+ )
795
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
796
+
797
+ # retrieve input_ids and inputs_embeds
798
+ if input_ids is not None and inputs_embeds is not None:
799
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
800
+ elif input_ids is not None:
801
+ input_shape = input_ids.size()
802
+ input_ids = input_ids.view(-1, input_shape[-1])
803
+ elif inputs_embeds is not None:
804
+ input_shape = inputs_embeds.size()[:-1]
805
+ else:
806
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
807
+
808
+ if inputs_embeds is None:
809
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
810
+
811
+ embed_pos = self.embed_positions(input_shape)
812
+
813
+ hidden_states = inputs_embeds + embed_pos
814
+ hidden_states = self.layernorm_embedding(hidden_states)
815
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
816
+
817
+ # expand attention_mask
818
+ if attention_mask is not None:
819
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
820
+ attention_mask = _expand_mask(attention_mask, inputs_embeds.dtype)
821
+
822
+ encoder_states = () if output_hidden_states else None
823
+ all_attentions = () if output_attentions else None
824
+
825
+ # check if head_mask has a correct number of layers specified if desired
826
+ if head_mask is not None:
827
+ if head_mask.size()[0] != (len(self.layers)):
828
+ raise ValueError(
829
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
830
+ )
831
+
832
+ for idx, encoder_layer in enumerate(self.layers):
833
+ if output_hidden_states:
834
+ encoder_states = encoder_states + (hidden_states,)
835
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
836
+ dropout_probability = random.uniform(0, 1)
837
+ if self.training and (dropout_probability < self.layerdrop): # skip the layer
838
+ layer_outputs = (None, None)
839
+ else:
840
+ if self.gradient_checkpointing and self.training:
841
+
842
+ def create_custom_forward(module):
843
+ def custom_forward(*inputs):
844
+ return module(*inputs, output_attentions)
845
+
846
+ return custom_forward
847
+
848
+ layer_outputs = torch.utils.checkpoint.checkpoint(
849
+ create_custom_forward(encoder_layer),
850
+ hidden_states,
851
+ attention_mask,
852
+ (head_mask[idx] if head_mask is not None else None),
853
+ )
854
+ else:
855
+ layer_outputs = encoder_layer(
856
+ hidden_states,
857
+ attention_mask,
858
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
859
+ output_attentions=output_attentions,
860
+ )
861
+
862
+ hidden_states = layer_outputs[0]
863
+
864
+ # ==================== music-text fusion =====================
865
+
866
+ # NOTE: unused helper -- the same gating logic is inlined under `use_forget_gate` below.
+ def forget_gate(music_features, text_features):
867
+ forget_mask = self.fg(torch.cat((music_features, text_features), 2))
868
+ forget_mask = self.sigmoid(forget_mask)
869
+ forget_mask = F.dropout(forget_mask, p=self.dropout, training=self.training)
870
+ music_features = forget_mask.mul(music_features)
871
+ return music_features
872
+
873
+ if idx in self.fusion_layers:
874
+ '''
875
+ => K_a = linear_1(V) in (S_v, D_a)
876
+ => V_a = linear_2(V) in (S_v, D_a)
877
+ => Q_a = linear_3(T) in (S_t, D_a)
878
+ => T_out = MultiHeadAttn(Q_a, K_a, V_a) in (S_t, D_a)
879
+ => T_out = linear_4(concat(T, T_out)) in (S_t, D_t)
880
+ => T_out = T + T_out (Residual Connection)
881
+ '''
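+ # Note: nn.MultiheadAttention is used without batch_first, so it expects (seq_len, batch, dim);
+ # the transposes below move K/V (from the music features) and Q (from the text hidden states)
+ # into that layout and then bring the attended output back to batch-first.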
882
+ K = self._linear_1(music_features).transpose(0, 1)
883
+ V = self._linear_2(music_features).transpose(0, 1)
884
+ Q = self._linear_3(hidden_states).transpose(0, 1)
885
+ attn_output, _ = self._multi_head_attn(Q, K, V)
886
+ attn_output = attn_output.transpose(0, 1)
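+ # Optional forget gate: a sigmoid gate computed from [attended music ; text] rescales the
+ # attended music features element-wise before they are mixed into the text stream through the
+ # 0.1-scaled residual connection and the final layer norm below.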
887
+ if self.use_forget_gate:
888
+ forget_mask = self.fg(torch.cat((attn_output, hidden_states), 2))
889
+ forget_mask = self.sigmoid(forget_mask)
890
+ forget_mask = F.dropout(forget_mask, p=self.dropout, training=self.training)
891
+ attn_output = forget_mask.mul(attn_output)
892
+ # output = self._linear_4(torch.cat((hidden_states, attn_output), 2))
893
+
894
+ # Residual Connection
895
+ hidden_states = hidden_states + 0.1 * attn_output
896
+ hidden_states = self.final_layer_norm(hidden_states)
897
+
898
+ # ==================== music-text fusion =====================
899
+
900
+ if output_attentions:
901
+ all_attentions = all_attentions + (layer_outputs[1],)
902
+
903
+ if output_hidden_states:
904
+ encoder_states = encoder_states + (hidden_states,)
905
+
906
+ if not return_dict:
907
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
908
+ return BaseModelOutput(
909
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
910
+ )
911
+
912
+
913
+ class BartDecoder(BartPretrainedModel):
914
+ """
915
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`]
916
+
917
+ Args:
918
+ config: BartConfig
919
+ embed_tokens (nn.Embedding): output embedding
920
+ """
921
+
922
+ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None):
923
+ super().__init__(config)
924
+ self.dropout = config.dropout
925
+ self.layerdrop = config.decoder_layerdrop
926
+ self.padding_idx = config.pad_token_id
927
+ self.max_target_positions = config.max_position_embeddings
928
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
929
+
930
+ if embed_tokens is not None:
931
+ self.embed_tokens = embed_tokens
932
+ else:
933
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
934
+
935
+ self.embed_positions = BartLearnedPositionalEmbedding(
936
+ config.max_position_embeddings,
937
+ config.d_model,
938
+ )
939
+ self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
940
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
941
+
942
+ self.gradient_checkpointing = False
943
+ # Initialize weights and apply final processing
944
+ self.post_init()
945
+
946
+ def get_input_embeddings(self):
947
+ return self.embed_tokens
948
+
949
+ def set_input_embeddings(self, value):
950
+ self.embed_tokens = value
951
+
952
+ def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
953
+ # create causal mask
954
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
955
+ combined_attention_mask = None
956
+ if input_shape[-1] > 1:
957
+ combined_attention_mask = _make_causal_mask(
958
+ input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
959
+ ).to(self.device)
960
+
961
+ if attention_mask is not None:
962
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
963
+ expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
964
+ combined_attention_mask = (
965
+ expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
966
+ )
967
+
968
+ return combined_attention_mask
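+ # For intuition about the mask built above (illustrative values only): for a target length of 3
+ # with no padding, the causal part is an additive matrix of the form
+ #     [[0, -inf, -inf],
+ #      [0,    0, -inf],
+ #      [0,    0,    0]]
+ # so position i can only attend to positions <= i; padding positions from `attention_mask` are
+ # added on top as further large negative entries.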
969
+
970
+ def forward(
971
+ self,
972
+ input_ids=None,
973
+ attention_mask=None,
974
+ encoder_hidden_states=None,
975
+ encoder_attention_mask=None,
976
+ head_mask=None,
977
+ cross_attn_head_mask=None,
978
+ past_key_values=None,
979
+ inputs_embeds=None,
980
+ use_cache=None,
981
+ output_attentions=None,
982
+ output_hidden_states=None,
983
+ return_dict=None,
984
+ ):
985
+ r"""
986
+ Args:
987
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
988
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
989
+ provide it.
990
+
991
+ Indices can be obtained using [`BartTokenizer`]. See
992
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`]
993
+ for details.
994
+
995
+ [What are input IDs?](../glossary#input-ids)
996
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
997
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
998
+
999
+ - 1 for tokens that are **not masked**,
1000
+ - 0 for tokens that are **masked**.
1001
+
1002
+ [What are attention masks?](../glossary#attention-mask)
1003
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
1004
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
1005
+ of the decoder.
1006
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
1007
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
1008
+ selected in `[0, 1]`:
1009
+
1010
+ - 1 for tokens that are **not masked**,
1011
+ - 0 for tokens that are **masked**.
1012
+
1013
+ [What are attention masks?](../glossary#attention-mask)
1014
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1015
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
1016
+
1017
+ - 1 indicates the head is **not masked**,
1018
+ - 0 indicates the head is **masked**.
1019
+
1020
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
1021
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
1022
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
1023
+
1024
+ - 1 indicates the head is **not masked**,
1025
+ - 0 indicates the head is **masked**.
1026
+
1027
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
1028
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2
1029
+ tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional
1030
+ tensors of shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
1031
+
1032
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
1033
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential
1034
+ decoding.
1035
+
1036
+ If `past_key_values` are used, the user can optionally input only the last
1037
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of
1038
+ shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
1039
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert `input_ids` indices
1040
+ into associated vectors than the model's internal embedding lookup matrix.
1041
+ output_attentions (`bool`, *optional*):
1042
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
1043
+ returned tensors for more detail.
1044
+ output_hidden_states (`bool`, *optional*):
1045
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
1046
+ for more detail.
1047
+ return_dict (`bool`, *optional*):
1048
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
1049
+ """
1050
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1051
+ output_hidden_states = (
1052
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1053
+ )
1054
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1055
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1056
+
1057
+ # retrieve input_ids and inputs_embeds
1058
+ if input_ids is not None and inputs_embeds is not None:
1059
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
1060
+ elif input_ids is not None:
1061
+ input_shape = input_ids.size()
1062
+ input_ids = input_ids.view(-1, input_shape[-1])
1063
+ elif inputs_embeds is not None:
1064
+ input_shape = inputs_embeds.size()[:-1]
1065
+ else:
1066
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
1067
+
1068
+ # past_key_values_length
1069
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
1070
+
1071
+ if inputs_embeds is None:
1072
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
1073
+
1074
+ attention_mask = self._prepare_decoder_attention_mask(
1075
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
1076
+ )
1077
+
1078
+ # expand encoder attention mask
1079
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
1080
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
1081
+ encoder_attention_mask = _expand_mask(encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1])
1082
+
1083
+ # embed positions
1084
+ positions = self.embed_positions(input_shape, past_key_values_length)
1085
+
1086
+ hidden_states = inputs_embeds + positions
1087
+ hidden_states = self.layernorm_embedding(hidden_states)
1088
+
1089
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
1090
+
1091
+ # decoder layers
1092
+ all_hidden_states = () if output_hidden_states else None
1093
+ all_self_attns = () if output_attentions else None
1094
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
1095
+ next_decoder_cache = () if use_cache else None
1096
+
1097
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
1098
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
1099
+ if attn_mask is not None:
1100
+ if attn_mask.size()[0] != (len(self.layers)):
1101
+ raise ValueError(
1102
+ "The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
1103
+ )
1104
+
1105
+ for idx, decoder_layer in enumerate(self.layers):
1106
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
1107
+ if output_hidden_states:
1108
+ all_hidden_states += (hidden_states,)
1109
+ dropout_probability = random.uniform(0, 1)
1110
+ if self.training and (dropout_probability < self.layerdrop):
1111
+ continue
1112
+
1113
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
1114
+
1115
+ if self.gradient_checkpointing and self.training:
1116
+
1117
+ if use_cache:
1118
+ logger.warning(
1119
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
1120
+ )
1121
+ use_cache = False
1122
+
1123
+ def create_custom_forward(module):
1124
+ def custom_forward(*inputs):
1125
+ # None for past_key_value
1126
+ return module(*inputs, output_attentions, use_cache)
1127
+
1128
+ return custom_forward
1129
+
1130
+ layer_outputs = torch.utils.checkpoint.checkpoint(
1131
+ create_custom_forward(decoder_layer),
1132
+ hidden_states,
1133
+ attention_mask,
1134
+ encoder_hidden_states,
1135
+ encoder_attention_mask,
1136
+ head_mask[idx] if head_mask is not None else None,
1137
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
1138
+ None,
1139
+ )
1140
+ else:
1141
+
1142
+ layer_outputs = decoder_layer(
1143
+ hidden_states,
1144
+ attention_mask=attention_mask,
1145
+ encoder_hidden_states=encoder_hidden_states,
1146
+ encoder_attention_mask=encoder_attention_mask,
1147
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
1148
+ cross_attn_layer_head_mask=(
1149
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
1150
+ ),
1151
+ past_key_value=past_key_value,
1152
+ output_attentions=output_attentions,
1153
+ use_cache=use_cache,
1154
+ )
1155
+ hidden_states = layer_outputs[0]
1156
+
1157
+ if use_cache:
1158
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
1159
+
1160
+ if output_attentions:
1161
+ all_self_attns += (layer_outputs[1],)
1162
+
1163
+ if encoder_hidden_states is not None:
1164
+ all_cross_attentions += (layer_outputs[2],)
1165
+
1166
+ # add hidden states from the last decoder layer
1167
+ if output_hidden_states:
1168
+ all_hidden_states += (hidden_states,)
1169
+
1170
+ next_cache = next_decoder_cache if use_cache else None
1171
+ if not return_dict:
1172
+ return tuple(
1173
+ v
1174
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
1175
+ if v is not None
1176
+ )
1177
+ return BaseModelOutputWithPastAndCrossAttentions(
1178
+ last_hidden_state=hidden_states,
1179
+ past_key_values=next_cache,
1180
+ hidden_states=all_hidden_states,
1181
+ attentions=all_self_attns,
1182
+ cross_attentions=all_cross_attentions,
1183
+ )
1184
+
1185
+
1186
+ @add_start_docstrings(
1187
+ "The bare BART Model outputting raw hidden-states without any specific head on top.",
1188
+ BART_START_DOCSTRING,
1189
+ )
1190
+ class BartModel(BartPretrainedModel):
1191
+ def __init__(self, config: BartConfig,
1192
+ fusion_layers=None,
1193
+ use_forget_gate=None,
1194
+ dim_common=256,
1195
+ n_attn_heads=1):
1196
+ super().__init__(config)
1197
+
1198
+ padding_idx, vocab_size = config.pad_token_id, config.vocab_size
1199
+ self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
1200
+
1201
+ self.encoder = BartEncoder(config, self.shared, fusion_layers, use_forget_gate, dim_common, n_attn_heads)
1202
+ self.decoder = BartDecoder(config, self.shared)
1203
+
1204
+ # Initialize weights and apply final processing
1205
+ self.post_init()
1206
+
1207
+ def get_input_embeddings(self):
1208
+ return self.shared
1209
+
1210
+ def set_input_embeddings(self, value):
1211
+ self.shared = value
1212
+ self.encoder.embed_tokens = self.shared
1213
+ self.decoder.embed_tokens = self.shared
1214
+
1215
+ def get_encoder(self):
1216
+ return self.encoder
1217
+
1218
+ def get_decoder(self):
1219
+ return self.decoder
1220
+
1221
+ @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
1222
+ @add_code_sample_docstrings(
1223
+ processor_class=_TOKENIZER_FOR_DOC,
1224
+ checkpoint=_CHECKPOINT_FOR_DOC,
1225
+ output_type=Seq2SeqModelOutput,
1226
+ config_class=_CONFIG_FOR_DOC,
1227
+ )
1228
+ def forward(
1229
+ self,
1230
+ input_ids=None,
1231
+ attention_mask=None,
1232
+ decoder_input_ids=None,
1233
+ decoder_attention_mask=None,
1234
+ head_mask=None,
1235
+ decoder_head_mask=None,
1236
+ cross_attn_head_mask=None,
1237
+ encoder_outputs=None,
1238
+ past_key_values=None,
1239
+ inputs_embeds=None,
1240
+ decoder_inputs_embeds=None,
1241
+ use_cache=None,
1242
+ output_attentions=None,
1243
+ output_hidden_states=None,
1244
+ return_dict=None,
1245
+ music_features=None,
1246
+ music_len=None,
1247
+ ):
1248
+
1249
+ # different to other models, Bart automatically creates decoder_input_ids from
1250
+ # input_ids if no decoder_input_ids are provided
1251
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1252
+ if input_ids is None:
1253
+ raise ValueError(
1254
+ "If no `decoder_input_ids` or `decoder_inputs_embeds` are "
1255
+ "passed, `input_ids` cannot be `None`. Please pass either "
1256
+ "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`."
1257
+ )
1258
+
1259
+ decoder_input_ids = shift_tokens_right(
1260
+ input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
1261
+ )
1262
+
1263
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1264
+ output_hidden_states = (
1265
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1266
+ )
1267
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1268
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1269
+
1270
+ if encoder_outputs is None:
1271
+ encoder_outputs = self.encoder(
1272
+ input_ids=input_ids,
1273
+ attention_mask=attention_mask,
1274
+ head_mask=head_mask,
1275
+ inputs_embeds=inputs_embeds,
1276
+ output_attentions=output_attentions,
1277
+ output_hidden_states=output_hidden_states,
1278
+ return_dict=return_dict,
1279
+ music_features=music_features,
1280
+ music_len=music_len,
1281
+ )
1282
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
1283
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
1284
+ encoder_outputs = BaseModelOutput(
1285
+ last_hidden_state=encoder_outputs[0],
1286
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
1287
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
1288
+ )
1289
+
1290
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
1291
+ decoder_outputs = self.decoder(
1292
+ input_ids=decoder_input_ids,
1293
+ attention_mask=decoder_attention_mask,
1294
+ encoder_hidden_states=encoder_outputs[0],
1295
+ encoder_attention_mask=attention_mask,
1296
+ head_mask=decoder_head_mask,
1297
+ cross_attn_head_mask=cross_attn_head_mask,
1298
+ past_key_values=past_key_values,
1299
+ inputs_embeds=decoder_inputs_embeds,
1300
+ use_cache=use_cache,
1301
+ output_attentions=output_attentions,
1302
+ output_hidden_states=output_hidden_states,
1303
+ return_dict=return_dict,
1304
+ )
1305
+
1306
+ if not return_dict:
1307
+ return decoder_outputs + encoder_outputs
1308
+
1309
+ return Seq2SeqModelOutput(
1310
+ last_hidden_state=decoder_outputs.last_hidden_state,
1311
+ past_key_values=decoder_outputs.past_key_values,
1312
+ decoder_hidden_states=decoder_outputs.hidden_states,
1313
+ decoder_attentions=decoder_outputs.attentions,
1314
+ cross_attentions=decoder_outputs.cross_attentions,
1315
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
1316
+ encoder_hidden_states=encoder_outputs.hidden_states,
1317
+ encoder_attentions=encoder_outputs.attentions,
1318
+ )
1319
+
1320
+
1321
+ @add_start_docstrings(
1322
+ "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING
1323
+ )
1324
+ class BartForMultimodalGeneration(BartPretrainedModel):
1325
+ base_model_prefix = "model"
1326
+ _keys_to_ignore_on_load_missing = [r"final_logits_bias", r"lm_head\.weight"]
1327
+
1328
+ def __init__(self, config: BartConfig, fusion_layers=None, use_forget_gate=None, dim_common=256, n_attn_heads=1):
1329
+ super().__init__(config)
1330
+ self.model = BartModel(config, fusion_layers, use_forget_gate, dim_common, n_attn_heads)
1331
+ self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
1332
+ self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
1333
+
1334
+ # Initialize weights and apply final processing
1335
+ self.post_init()
1336
+
1337
+ def get_encoder(self):
1338
+ return self.model.get_encoder()
1339
+
1340
+ def get_decoder(self):
1341
+ return self.model.get_decoder()
1342
+
1343
+ def resize_token_embeddings(self, new_num_tokens: int) -> nn.Embedding:
1344
+ new_embeddings = super().resize_token_embeddings(new_num_tokens)
1345
+ self._resize_final_logits_bias(new_num_tokens)
1346
+ return new_embeddings
1347
+
1348
+ def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
1349
+ old_num_tokens = self.final_logits_bias.shape[-1]
1350
+ if new_num_tokens <= old_num_tokens:
1351
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
1352
+ else:
1353
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
1354
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
1355
+ self.register_buffer("final_logits_bias", new_bias)
1356
+
1357
+ def get_output_embeddings(self):
1358
+ return self.lm_head
1359
+
1360
+ def set_output_embeddings(self, new_embeddings):
1361
+ self.lm_head = new_embeddings
1362
+
1363
+ @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING)
1364
+ @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
1365
+ @add_end_docstrings(BART_GENERATION_EXAMPLE)
1366
+ def forward(
1367
+ self,
1368
+ input_ids=None,
1369
+ attention_mask=None,
1370
+ decoder_input_ids=None,
1371
+ decoder_attention_mask=None,
1372
+ head_mask=None,
1373
+ decoder_head_mask=None,
1374
+ cross_attn_head_mask=None,
1375
+ encoder_outputs=None,
1376
+ past_key_values=None,
1377
+ inputs_embeds=None,
1378
+ decoder_inputs_embeds=None,
1379
+ labels=None,
1380
+ use_cache=None,
1381
+ output_attentions=None,
1382
+ output_hidden_states=None,
1383
+ return_dict=None,
1384
+ music_features=None,
1385
+ music_len=None,
1386
+ ):
1387
+ r"""
1388
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1389
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1390
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1391
+
1392
+ Returns:
1393
+ """
1394
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1395
+
1396
+ if labels is not None:
1397
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
1398
+ decoder_input_ids = shift_tokens_right(
1399
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
1400
+ )
1401
+
1402
+ outputs = self.model(
1403
+ input_ids,
1404
+ attention_mask=attention_mask,
1405
+ decoder_input_ids=decoder_input_ids,
1406
+ encoder_outputs=encoder_outputs,
1407
+ decoder_attention_mask=decoder_attention_mask,
1408
+ head_mask=head_mask,
1409
+ decoder_head_mask=decoder_head_mask,
1410
+ cross_attn_head_mask=cross_attn_head_mask,
1411
+ past_key_values=past_key_values,
1412
+ inputs_embeds=inputs_embeds,
1413
+ decoder_inputs_embeds=decoder_inputs_embeds,
1414
+ use_cache=use_cache,
1415
+ output_attentions=output_attentions,
1416
+ output_hidden_states=output_hidden_states,
1417
+ return_dict=return_dict,
1418
+ music_features=music_features,
1419
+ music_len=music_len,
1420
+ )
1421
+ lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
1422
+
1423
+ masked_lm_loss = None
1424
+ if labels is not None:
1425
+ loss_fct = CrossEntropyLoss()
1426
+ masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
1427
+
1428
+ if not return_dict:
1429
+ output = (lm_logits,) + outputs[1:]
1430
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1431
+
1432
+ return Seq2SeqLMOutput(
1433
+ loss=masked_lm_loss,
1434
+ logits=lm_logits,
1435
+ past_key_values=outputs.past_key_values,
1436
+ decoder_hidden_states=outputs.decoder_hidden_states,
1437
+ decoder_attentions=outputs.decoder_attentions,
1438
+ cross_attentions=outputs.cross_attentions,
1439
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
1440
+ encoder_hidden_states=outputs.encoder_hidden_states,
1441
+ encoder_attentions=outputs.encoder_attentions,
1442
+ )
1443
+
1444
+ def prepare_inputs_for_generation(
1445
+ self,
1446
+ decoder_input_ids,
1447
+ past=None,
1448
+ attention_mask=None,
1449
+ head_mask=None,
1450
+ decoder_head_mask=None,
1451
+ cross_attn_head_mask=None,
1452
+ use_cache=None,
1453
+ encoder_outputs=None,
1454
+ **kwargs
1455
+ ):
1456
+ # cut decoder_input_ids if past is used
1457
+ if past is not None:
1458
+ decoder_input_ids = decoder_input_ids[:, -1:]
1459
+
1460
+ return {
1461
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
1462
+ "encoder_outputs": encoder_outputs,
1463
+ "past_key_values": past,
1464
+ "decoder_input_ids": decoder_input_ids,
1465
+ "attention_mask": attention_mask,
1466
+ "head_mask": head_mask,
1467
+ "decoder_head_mask": decoder_head_mask,
1468
+ "cross_attn_head_mask": cross_attn_head_mask,
1469
+ "use_cache": use_cache, # change this to avoid caching (presumably for debugging)
1470
+ }
1471
+
1472
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
1473
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
1474
+
1475
+ @staticmethod
1476
+ def _reorder_cache(past, beam_idx):
1477
+ reordered_past = ()
1478
+ for layer_past in past:
1479
+ # cached cross_attention states don't have to be reordered -> they are always the same
1480
+ reordered_past += (
1481
+ tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
1482
+ )
1483
+ return reordered_past
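+
+ # Usage sketch (illustrative only -- the checkpoint name, feature shapes and inputs below are
+ # assumptions, not part of the training scripts):
+ #
+ #     from transformers import BartTokenizer, BartConfig
+ #     config = BartConfig.from_pretrained("facebook/bart-base")
+ #     tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
+ #     model = BartForMultimodalGeneration(config, fusion_layers=[5], use_forget_gate=True)
+ #     batch = tokenizer(["some lyrics", "more lyrics"], return_tensors="pt", padding=True)
+ #     music = torch.randn(2, 100, 256)  # (batch, music_seq_len, 256); 256 matches the
+ #                                       # music_feature_dim hard-coded in BartEncoder
+ #     out = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"],
+ #                 labels=batch["input_ids"], music_features=music)
+ #     out.loss, out.logits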
code/music_encoder.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchaudio
5
+ import os
6
+ import random
7
+
8
+ from attention_modules import BertConfig, BertEncoder, BertPooler
9
+
10
+
11
+ class Conv_1d(nn.Module):
12
+ def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
13
+ super(Conv_1d, self).__init__()
14
+ self.conv = nn.Conv1d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
15
+ self.bn = nn.BatchNorm1d(output_channels)
16
+ self.relu = nn.ReLU()
17
+ self.mp = nn.MaxPool1d(pooling)
18
+ def forward(self, x):
19
+ out = self.mp(self.relu(self.bn(self.conv(x))))
20
+ return out
21
+
22
+
23
+ class Conv_2d(nn.Module):
24
+ def __init__(self, input_channels, output_channels, shape=3, stride=1, pooling=2):
25
+ super(Conv_2d, self).__init__()
26
+ self.conv = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
27
+ self.bn = nn.BatchNorm2d(output_channels)
28
+ self.relu = nn.ReLU()
29
+ self.mp = nn.MaxPool2d(pooling)
30
+ def forward(self, x):
31
+ out = self.mp(self.relu(self.bn(self.conv(x))))
32
+ return out
33
+
34
+
35
+ class Res_2d(nn.Module):
36
+ def __init__(self, input_channels, output_channels, shape=3, stride=2):
37
+ super(Res_2d, self).__init__()
38
+ # convolution
39
+ self.conv_1 = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
40
+ self.bn_1 = nn.BatchNorm2d(output_channels)
41
+ self.conv_2 = nn.Conv2d(output_channels, output_channels, shape, padding=shape//2)
42
+ self.bn_2 = nn.BatchNorm2d(output_channels)
43
+
44
+ # residual
45
+ self.diff = False
46
+ if (stride != 1) or (input_channels != output_channels):
47
+ self.conv_3 = nn.Conv2d(input_channels, output_channels, shape, stride=stride, padding=shape//2)
48
+ self.bn_3 = nn.BatchNorm2d(output_channels)
49
+ self.diff = True
50
+ self.relu = nn.ReLU()
51
+
52
+ def forward(self, x):
53
+ # convolution
54
+ out = self.bn_2(self.conv_2(self.relu(self.bn_1(self.conv_1(x)))))
55
+
56
+ # residual
57
+ if self.diff:
58
+ x = self.bn_3(self.conv_3(x))
59
+ out = x + out
60
+ out = self.relu(out)
61
+ return out
62
+
63
+
64
+ class CNNSA(nn.Module):
65
+ '''
66
+ Won et al. 2019
67
+ Toward interpretable music tagging with self-attention.
68
+ Feature extraction with CNN + temporal summary with Transformer encoder.
69
+ '''
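+ # Rough data flow for a 15 s / 16 kHz mono clip: mel spectrogram (batch, 1, 128, time) ->
+ # seven Res_2d blocks collapse the 128 mel bins to 1 and yield 2*n_channels = 256 feature maps ->
+ # squeeze + permute to (batch, time', 256) -> a fixed random [CLS] vector is prepended ->
+ # 2-layer BertEncoder; forward() returns the hidden states of the last encoder layer.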
70
+ def __init__(self,
71
+ n_channels=128,
72
+ sample_rate=16000,
73
+ n_fft=512,
74
+ f_min=0.0,
75
+ f_max=8000.0,
76
+ n_mels=128,
77
+ n_class=50):
78
+ super(CNNSA, self).__init__()
79
+
80
+ # Spectrogram
81
+ self.spec = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate,
82
+ n_fft=n_fft,
83
+ f_min=f_min,
84
+ f_max=f_max,
85
+ n_mels=n_mels)
86
+ self.to_db = torchaudio.transforms.AmplitudeToDB()
87
+ self.spec_bn = nn.BatchNorm2d(1)
88
+
89
+ # CNN
90
+ self.layer1 = Res_2d(1, n_channels, stride=2)
91
+ self.layer2 = Res_2d(n_channels, n_channels, stride=2)
92
+ self.layer3 = Res_2d(n_channels, n_channels*2, stride=2)
93
+ self.layer4 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
94
+ self.layer5 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
95
+ self.layer6 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
96
+ self.layer7 = Res_2d(n_channels*2, n_channels*2, stride=(2, 1))
97
+
98
+ # Transformer encoder
99
+ bert_config = BertConfig(vocab_size=256,
100
+ hidden_size=256,
101
+ num_hidden_layers=2,
102
+ num_attention_heads=8,
103
+ intermediate_size=1024,
104
+ hidden_act="gelu",
105
+ hidden_dropout_prob=0.4,
106
+ max_position_embeddings=700,
107
+ attention_probs_dropout_prob=0.5)
108
+ self.encoder = BertEncoder(bert_config)
109
+ self.pooler = BertPooler(bert_config)
110
+ self.vec_cls = self.get_cls(256)
111
+
112
+ # Dense
113
+ self.dropout = nn.Dropout(0.5)
114
+ self.dense = nn.Linear(256, n_class)
115
+
116
+ def get_cls(self, channel):
117
+ np.random.seed(0)
118
+ single_cls = torch.Tensor(np.random.random((1, channel)))
119
+ vec_cls = torch.cat([single_cls for _ in range(64)], dim=0)
120
+ vec_cls = vec_cls.unsqueeze(1)
121
+ return vec_cls
122
+
123
+ def append_cls(self, x):
124
+ batch, _, _ = x.size()
125
+ part_vec_cls = self.vec_cls[:batch].clone()
126
+ part_vec_cls = part_vec_cls.to(x.device)
127
+ return torch.cat([part_vec_cls, x], dim=1)
128
+
129
+ def get_spec(self, ids, audio_length=15*16000, allow_random=False):
130
+
131
+ wav_list = list()
132
+
133
+ for id in ids:
134
+ audio_path = os.path.join("/import/c4dm-datasets/Music4All/music4all/audios", id + '.mp3')
135
+ (wav, sample_rate) = torchaudio.backend.sox_io_backend.load(audio_path)
136
+
137
+ # to mono
138
+ mono_wav = torch.mean(wav, dim=0)
139
+
140
+ # cut length
141
+ if allow_random:
142
+ random_index = random.randint(0, len(mono_wav) - audio_length - 1)
143
+ else:
144
+ random_index = 0
145
+ mono_wav_cut = mono_wav[random_index: random_index + audio_length]
146
+
147
+ wav_list.append(mono_wav_cut)
148
+
149
+ # merge wav to (bs, length)
150
+ data = torch.stack(wav_list, dim=0)
151
+
152
+ # to spectrogram
153
+ spectrogram = self.spec(data.cuda())
154
+
155
+ return spectrogram
156
+
157
+ def forward(self, ids):
158
+ # Spectrogram
159
+ # for batch
160
+ spec = self.get_spec(ids)
161
+ spec_db = self.to_db(spec)
162
+ x = spec_db.unsqueeze(1) # add channel dim
163
+ x = self.spec_bn(x)
164
+
165
+ # CNN
166
+ x = self.layer1(x)
167
+ x = self.layer2(x)
168
+ x = self.layer3(x)
169
+ x = self.layer4(x)
170
+ x = self.layer5(x)
171
+ x = self.layer6(x)
172
+ x = self.layer7(x)
173
+ x = x.squeeze(2)
174
+
175
+ # Get [CLS] token
176
+ x = x.permute(0, 2, 1)
177
+ x = self.append_cls(x)
178
+
179
+ # Transformer encoder
180
+ x = self.encoder(x)
181
+ x = x[-1] # last layer
182
+ # x = self.pooler(x)
183
+ #
184
+ # # Dense
185
+ # x = self.dropout(x)
186
+ # x = self.dense(x)
187
+ # x = nn.Sigmoid()(x)
188
+
189
+ return x  # hidden states of the last encoder layer, shape (batch, 1 + time_frames, 256); index 0 is the [CLS] position
190
+
191
+
192
+ # test code
193
+ # model = CNNSA()
194
+ # model.load_state_dict(torch.load("best_model.pth"))
195
+ # id = ["wlIcjSZkgW0cgWrm", "wlIcjSZkgW0cgWrm"]
196
+ # output = model(id)
code/train.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from tqdm import tqdm
3
+ from data import LyricsCommentsDatasetPsuedo
4
+ from torch import utils, nn
5
+ from model import CommentGenerator
6
+ import transformers
7
+ import time
8
+ import statistics
9
+ import os
10
+ import random
11
+ import datasets
12
+
13
+ IS_LOAD = False
14
+ LOAD_EPOCH = 0
15
+ EPOCH = 20
16
+ BATCH_SIZE = 8
17
+ LOG_INTERVAL = 100
18
+ SAMPLE_INTERVAL = 2000
19
+ VALIDATION_INTERVAL = 2
20
+ LOG_FOLDER = "log/"
21
+ MODEL_FOLDER = "model/"
22
+ EARLY_STOPPING_INTERVAL = 5
23
+ MODEL_NAME = "bart_baseline_full_256"
24
+ CHOICE_NUMBER = 5
25
+ DATASET_PATH = "dataset_not_negative_256.pkl"
26
+
27
+ os.environ["CUDA_VISIBLE_DEVICES"] = "4"
28
+
29
+ dataset = LyricsCommentsDatasetPsuedo(dataset_path=DATASET_PATH)
30
+ dataset_length = len(dataset)
31
+
32
+ train_dataset_length = int(dataset_length * 0.9)
33
+ valid_dataset_length = dataset_length - train_dataset_length
34
+ train_dataset, valid_dataset = utils.data.random_split(dataset,
35
+ [train_dataset_length,
36
+ valid_dataset_length],
37
+ generator=torch.Generator().manual_seed(42))
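+ # The fixed generator seed keeps the 90/10 train/validation split reproducible across runs
+ # (the same seed and ratio are used in train_fusion.py), so validation scores stay comparable.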
38
+ train_dataloader = utils.data.DataLoader(train_dataset,
39
+ batch_size=BATCH_SIZE,
40
+ shuffle=True)
41
+ valid_dataloader = utils.data.DataLoader(valid_dataset,
42
+ batch_size=32,
43
+ shuffle=False)
44
+
45
+ model = CommentGenerator().cuda()
46
+
47
+ criterion = nn.CrossEntropyLoss()
48
+
49
+ optimizer = transformers.Adafactor(model.parameters(), warmup_init=False, relative_step=False,
50
+ lr=6e-4,
51
+ )
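+ # With relative_step=False (and warmup_init=False), Adafactor skips its time-dependent
+ # relative-step schedule and uses the externally supplied learning rate (6e-4) instead.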
52
+
53
+ loss_stat = list()
54
+ start_time = time.time()
55
+ start_time_local = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
56
+
57
+ early_stop_token = (0.0, 0)
58
+
59
+ model.train()
60
+ for epoch in range(1 + LOAD_EPOCH, EPOCH + 1 + LOAD_EPOCH):
61
+ for batch_index, [lyrics, comment] in enumerate(train_dataloader):
62
+ # pre-process data
63
+ input_sentences = lyrics
64
+ raw_labels = comment
65
+ output = model(input_sentences, raw_labels)
66
+ loss = output.loss
67
+
68
+ optimizer.zero_grad()
69
+ loss.backward()
70
+ optimizer.step()
71
+ loss_stat.append(loss.item())
72
+
73
+ # log
74
+ if batch_index and batch_index % LOG_INTERVAL == 0:
75
+ curr_time = time.time()
76
+ passed_time_all = curr_time - start_time
77
+ time_str = f"{int(passed_time_all / 60)}:{int(passed_time_all % 60)}"
78
+ log = f"{MODEL_NAME}\t" \
79
+ f"Time: {time_str}\t" \
80
+ f"Epoch {epoch}: {batch_index}/{int(len(train_dataloader.dataset) / BATCH_SIZE)}\t" \
81
+ f"Loss: {statistics.mean(loss_stat[-1 * BATCH_SIZE:])}\t" \
82
+ f"Avg loss: {statistics.mean(loss_stat)}"
83
+ if __debug__:
84
+ print(log)
85
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + "_" + start_time_local + ".txt"), 'a+', encoding='utf-8') as r:
86
+ r.write(log)
87
+ r.write("\n")
88
+ loss_stat = list()
89
+
90
+ if batch_index and batch_index % SAMPLE_INTERVAL == 0:
91
+
92
+ model.eval()
93
+ samples_list = random.choices(valid_dataset, k=CHOICE_NUMBER)
94
+ sample_sentence, sample_label = zip(*samples_list)
95
+ output_samples = model.generate(sample_sentence)
96
+ for sample_index in range(CHOICE_NUMBER):
97
+ log = f"Lyrics: {sample_sentence[sample_index]}\n" \
98
+ f"Sample outputs: {output_samples[sample_index]}\n" \
99
+ f"Ground Truth: {sample_label[sample_index]}"
100
+ if __debug__:
101
+ print(log)
102
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + "_" + start_time_local + ".txt"), 'a+', encoding='utf-8') as r:
103
+ r.write(log)
104
+ r.write("\n")
105
+ model.train()
106
+
107
+ if epoch and epoch % VALIDATION_INTERVAL == 0:
108
+ model.eval()
109
+ metrics = datasets.load_metric('rouge')
110
+ valid_dataloader = utils.data.DataLoader(valid_dataset,
111
+ batch_size=32,
112
+ shuffle=False)
113
+ for batch_index_valid, [lyrics_valid, comment_valid] in enumerate(valid_dataloader):
114
+ output_samples = model.generate(lyrics_valid)
115
+ metrics.add_batch(predictions=output_samples, references=comment_valid)
116
+
117
+ # control time.
118
+ if batch_index_valid > 10:
119
+ break
120
+ score = metrics.compute()
121
+ if __debug__:
122
+ print(str(score))
123
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + '_' + start_time_local + ".txt"), 'a+',
124
+ encoding='utf-8') as r:
125
+ r.write(str(score))
126
+ r.write("\n")
127
+
128
+ # save
129
+ if score['rouge1'].mid.recall > early_stop_token[0]:
130
+ early_stop_token = [score['rouge1'].mid.recall, epoch]  # record the new best score and its epoch
131
+ torch.save(model.state_dict(), os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_best.pt"))
132
+ torch.save(optimizer.state_dict(),
133
+ os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_optim_best.pt"))
134
+
135
+ if epoch:
136
+ torch.save(model.state_dict(), os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_epoch{epoch}.pt"))
137
+ torch.save(optimizer.state_dict(),
138
+ os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_optim_epoch{epoch}.pt"))
139
+
140
+ # early stopping
141
+ if score['rouge1'].mid.recall <= early_stop_token[0] and epoch > (
142
+ early_stop_token[1] + EARLY_STOPPING_INTERVAL):
143
+ print(f"Early Stopping. Best Score: {early_stop_token[0]} at Epoch {early_stop_token[1]}.")
144
+
145
+ model.train()
code/train_fusion.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from tqdm import tqdm
3
+ from data import LyricsCommentsDatasetPsuedo_fusion
4
+ from torch import utils, nn
5
+ from model_fusion import CommentGenerator_fusion
6
+ import transformers
7
+ import time
8
+ import statistics
9
+ import os
10
+ import random
11
+ import datasets
12
+
13
+ IS_LOAD = False
14
+ LOAD_EPOCH = 0
15
+ EPOCH = 50
16
+ BATCH_SIZE = 8
17
+ LOG_INTERVAL = 100
18
+ SAMPLE_INTERVAL = 1000
19
+ VALIDATION_INTERVAL = 2
20
+ LOG_FOLDER = "log/"
21
+ MODEL_FOLDER = "model/"
22
+ SAVE_INTERVAL = 2
23
+ EARLY_STOPPING_INTERVAL = 5
24
+ MODEL_NAME = "bart_fusion_full_256"
25
+ CHOICE_NUMBER = 2
26
+ DATASET_PATH = "/homes/yz007/multimodal-transformer/comment_generator/dataset_full_256.pkl"
27
+
28
+ os.environ["CUDA_VISIBLE_DEVICES"] = "2"
29
+
30
+ dataset = LyricsCommentsDatasetPsuedo_fusion(dataset_path=DATASET_PATH)
31
+ dataset_length = len(dataset)
32
+
33
+ train_dataset_length = int(dataset_length * 0.9)
34
+ valid_dataset_length = dataset_length - train_dataset_length
35
+ train_dataset, valid_dataset = utils.data.random_split(dataset,
36
+ [train_dataset_length,
37
+ valid_dataset_length],
38
+ generator=torch.Generator().manual_seed(42))
39
+ train_dataloader = utils.data.DataLoader(train_dataset,
40
+ batch_size=BATCH_SIZE,
41
+ shuffle=True)
42
+ # valid_dataloader = utils.data.DataLoader(valid_dataset,
43
+ # batch_size=32,
44
+ # shuffle=False)
45
+
46
+ model = CommentGenerator_fusion().cuda()
47
+
48
+ criterion = nn.CrossEntropyLoss()
49
+
50
+
51
+
52
+ # optimizer = transformers.Adafactor(filter(lambda p: p.requires_grad, model.parameters()),
53
+ # lr=6e-4,
54
+ # )
55
+ optimizer = transformers.Adafactor(model.parameters(), warmup_init=False, relative_step=False,
56
+ lr=6e-4,
57
+ )
58
+
59
+ if IS_LOAD:
60
+ model.load_state_dict(torch.load("/homes/yz007/multimodal-transformer/comment_generator/model/bart_fusion_positive_256_6e-4_epoch6.pt"))
61
+ optimizer.load_state_dict(torch.load("/homes/yz007/multimodal-transformer/comment_generator/model/bart_fusion_positive_256_6e-4_optim_epoch6.pt"))
62
+
63
+ loss_stat = list()
64
+ start_time = time.time()
65
+ start_time_local = time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime())
66
+
67
+ early_stop_token = [0.0, 0]
68
+ validation_loss_history = list()
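+ # Two signals are tracked for stopping/checkpointing: early_stop_token holds the best ROUGE-1
+ # recall seen so far and its epoch (used to save the *_best checkpoints), while
+ # validation_loss_history is later checked for a plateau to trigger early stopping.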
69
+
70
+ model.train()
71
+ for epoch in range(1 + LOAD_EPOCH, EPOCH + 1 + LOAD_EPOCH):
72
+ for batch_index, [lyrics, comment, music_id] in enumerate(train_dataloader):
73
+ # pre-process data
74
+ input_sentences = lyrics
75
+ raw_labels = comment
76
+ output = model(input_sentences, music_id, raw_labels)
77
+ loss = output.loss
78
+
79
+ optimizer.zero_grad()
80
+ loss.backward()
81
+ optimizer.step()
82
+ loss_stat.append(loss.item())
83
+
84
+ # log
85
+ if batch_index and batch_index % LOG_INTERVAL == 0:
86
+ curr_time = time.time()
87
+ passed_time_all = curr_time - start_time
88
+ time_str = f"{int(passed_time_all / 60)}:{int(passed_time_all % 60)}"
89
+ log = f"{MODEL_NAME}\t" \
90
+ f"Time: {time_str}\t" \
91
+ f"Epoch {epoch}: {batch_index}/{int(len(train_dataloader.dataset) / BATCH_SIZE)}\t" \
92
+ f"Loss: {statistics.mean(loss_stat[-1 * LOG_INTERVAL * BATCH_SIZE:])}\t" \
93
+ f"Avg loss: {statistics.mean(loss_stat)}"
94
+ if __debug__:
95
+ print(log)
96
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + '_' + start_time_local + ".txt"), 'a+',
97
+ encoding='utf-8') as r:
98
+ r.write(log)
99
+ r.write("\n")
100
+ loss_stat = list()
101
+
102
+ if batch_index and batch_index % SAMPLE_INTERVAL == 0:
103
+ # make samples
104
+ model.eval()
105
+ samples_list = random.choices(valid_dataset, k=CHOICE_NUMBER)
106
+ sample_sentence, sample_label, music_ids = zip(*samples_list)
107
+ with torch.no_grad():
108
+ output_samples = model.generate(sample_sentence, music_ids)
109
+ for sample_index in range(CHOICE_NUMBER):
110
+ log = f"Lyrics: {sample_sentence[sample_index]}\n" \
111
+ f"Sample outputs: {output_samples[sample_index]}\n" \
112
+ f"Ground Truth: {sample_label[sample_index]}"
113
+ if __debug__:
114
+ print(log)
115
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + '_' + start_time_local + ".txt"), 'a+',
116
+ encoding='utf-8') as r:
117
+ r.write(log)
118
+ r.write("\n")
119
+
120
+ # validation loss
121
+ valid_dataloader = utils.data.DataLoader(valid_dataset,
122
+ batch_size=8,
123
+ shuffle=False)
124
+ valid_loss_stat = list()
125
+ for batch_index_valid, [lyrics_valid, comment_valid, music_id_valid] in enumerate(valid_dataloader):
126
+ with torch.no_grad():
127
+ output_valid = model(lyrics_valid, music_id_valid, comment_valid)
128
+ valid_loss = output_valid.loss.item()
129
+ valid_loss_stat.append(valid_loss)
130
+ if batch_index_valid > 15:
131
+ break
132
+ valid_loss_mean = statistics.mean(valid_loss_stat)
133
+ validation_loss_history.append(valid_loss_mean)
134
+ log = f"{MODEL_NAME}\t" \
135
+ f"Time: {time_str}\t" \
136
+ f"Epoch {epoch}: {batch_index}/{int(len(train_dataloader.dataset) / BATCH_SIZE)}\t" \
137
+ f"Validation Loss: {valid_loss_mean}\t"
138
+ if __debug__:
139
+ print(log)
140
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + '_' + start_time_local + ".txt"), 'a+',
141
+ encoding='utf-8') as r:
142
+ r.write(log)
143
+ r.write("\n")
144
+
145
+ # back to train
146
+ model.train()
147
+
148
+ if epoch and epoch % VALIDATION_INTERVAL == 0:
149
+ model.eval()
150
+ metrics = datasets.load_metric('rouge')
151
+ valid_dataloader = utils.data.DataLoader(valid_dataset,
152
+ batch_size=8,
153
+ shuffle=False)
154
+ for batch_index_valid, [lyrics_valid, comment_valid, music_id_valid] in enumerate(valid_dataloader):
155
+ with torch.no_grad():
156
+ output_samples = model.generate(lyrics_valid, music_id_valid)
157
+ metrics.add_batch(predictions=output_samples, references=comment_valid)
158
+ # control time.
159
+ if batch_index_valid > 10:
160
+ break
161
+ score = metrics.compute()
162
+ if __debug__:
163
+ print(str(score))
164
+ with open(os.path.join(LOG_FOLDER, MODEL_NAME + '_' + start_time_local + ".txt"), 'a+',
165
+ encoding='utf-8') as r:
166
+ r.write(str(score))
167
+ r.write("\n")
168
+
169
+ # save
170
+ if score['rouge1'].mid.recall > early_stop_token[0]:
171
+ early_stop_token = [score['rouge1'].mid.recall, epoch]  # record the new best score and its epoch
172
+ torch.save(model.state_dict(), os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_best.pt"))
173
+ torch.save(optimizer.state_dict(),
174
+ os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_optim_best.pt"))
175
+
176
+ # save
177
+ if epoch and epoch % SAVE_INTERVAL == 0:
178
+ torch.save(model.state_dict(), os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_epoch{epoch}.pt"))
179
+ torch.save(optimizer.state_dict(),
180
+ os.path.join(MODEL_FOLDER, f"{MODEL_NAME}_optim_epoch{epoch}.pt"))
181
+
182
+ # early stopping
183
+ if len(validation_loss_history) > EARLY_STOPPING_INTERVAL:
184
+ if min(validation_loss_history[-2 * EARLY_STOPPING_INTERVAL:]) == validation_loss_history[-2 * EARLY_STOPPING_INTERVAL]:
185
+ print(f"Early Stopping. Best Score: {early_stop_token[0]} at Epoch {early_stop_token[1]}.")
186
+ break
187
+ if score['rouge1'].mid.recall <= early_stop_token[0] and epoch > (
188
+ early_stop_token[1] + EARLY_STOPPING_INTERVAL):
189
+ print(f"Early Stopping. Best Score: {early_stop_token[0]} at Epoch {early_stop_token[1]}.")
190
+ break
191
+ model.train()
192
+
193
+ print(f"Training Complete. Best Score: {early_stop_token[0]} at Epoch {early_stop_token[1]}.")