from typing import Optional, Tuple

from transformers import PretrainedConfig


class AVHubertConfig(PretrainedConfig):
    """Configuration for the AV-HuBERT audio-visual encoder.

    Groups the feature-extractor, transformer encoder/decoder, masking, and
    modality-fusion hyperparameters behind the standard ``PretrainedConfig``
    interface.
    """

    model_type = "av_hubert"

    def __init__(
        self,
        # Inputs
        label_rate: int = 25,
        sample_rate: int = 25,
        input_modality: str = "video",
        extractor_mode: str = "default",
        # Encoder
        encoder_layers: int = 24,
        encoder_embed_dim: int = 1024,
        encoder_ffn_embed_dim: int = 4096,
        encoder_attention_heads: int = 16,
        activation_fn: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        encoder_layerdrop: float = 0.0,
        dropout_input: float = 0.0,
        dropout_features: float = 0.0,
        final_dim: int = 256,
        untie_final_proj: bool = False,
        layer_norm_first: bool = False,
        conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
        conv_bias: bool = False,
        logit_temp: float = 0.1,
        target_glu: bool = False,
        feature_grad_mult: float = 1.0,
        # Masking
        mask_length_audio: int = 10,
        mask_prob_audio: float = 0.65,
        mask_length_image: int = 10,
        mask_prob_image: float = 0.65,
        mask_selection: str = "static",
        mask_other: float = 0.0,
        no_mask_overlap: bool = False,
        mask_min_space: int = 1,
        mask_channel_length: int = 64,
        mask_channel_prob: float = 0.5,
        mask_channel_selection: str = "static",
        mask_channel_other: float = 0.0,
        no_mask_channel_overlap: bool = False,
        mask_channel_min_space: int = 1,
        # Positional conv / pre-training
        conv_pos: int = 128,
        conv_pos_groups: int = 16,
        latent_temp: Tuple[float, float, float] = (2.0, 0.5, 0.999995),
        skip_masked: bool = False,
        skip_nomask: bool = False,
        # Modality front-ends and fusion
        resnet_relu_type: str = "prelu",
        resnet_weights: Optional[str] = None,
        sim_type: str = "cosine",
        sub_encoder_layers: int = 0,
        audio_feat_dim: int = 104,
        modality_dropout: float = 0.0,
        audio_dropout: float = 0.0,
        modality_fuse: str = "concat",
        selection_type: str = "same_other_seq",
        masking_type: str = "input",
        # Decoder
        decoder_embed_dim: int = 2560,
        decoder_ffn_embed_dim: int = 3072,
        decoder_layers: int = 6,
        decoder_layerdrop: float = 0.0,
        decoder_attention_heads: int = 4,
        decoder_learned_pos: bool = False,
        decoder_normalize_before: bool = False,
        no_token_positional_embeddings: bool = False,
        decoder_dropout: float = 0.1,
        decoder_attention_dropout: float = 0.1,
        decoder_activation_dropout: float = 0.0,
        max_target_positions: int = 2048,
        share_decoder_input_output_embed: bool = False,
        no_scale_embedding: bool = True,
        num_classes: int = 2004,
        feature_ds_rate: int = 1,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.label_rate = label_rate
        self.sample_rate = sample_rate
        self.input_modality = input_modality
        self.extractor_mode = extractor_mode
        self.encoder_layers = encoder_layers
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_ffn_embed_dim = encoder_ffn_embed_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.activation_fn = activation_fn
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.dropout_input = dropout_input
        self.dropout_features = dropout_features
        self.final_dim = final_dim
        self.untie_final_proj = untie_final_proj
        self.layer_norm_first = layer_norm_first
        self.conv_feature_layers = conv_feature_layers
        self.conv_bias = conv_bias
        self.logit_temp = logit_temp
        self.target_glu = target_glu
        self.feature_grad_mult = feature_grad_mult
        self.mask_length_audio = mask_length_audio
        self.mask_prob_audio = mask_prob_audio
        self.mask_length_image = mask_length_image
        self.mask_prob_image = mask_prob_image
        self.mask_selection = mask_selection
        self.mask_other = mask_other
        self.no_mask_overlap = no_mask_overlap
        self.mask_min_space = mask_min_space
        self.mask_channel_length = mask_channel_length
        self.mask_channel_prob = mask_channel_prob
        self.mask_channel_selection = mask_channel_selection
        self.mask_channel_other = mask_channel_other
        self.no_mask_channel_overlap = no_mask_channel_overlap
        self.mask_channel_min_space = mask_channel_min_space
        self.conv_pos = conv_pos
        self.conv_pos_groups = conv_pos_groups
        self.latent_temp = latent_temp
        self.skip_masked = skip_masked
        self.skip_nomask = skip_nomask
        self.resnet_relu_type = resnet_relu_type
        self.resnet_weights = resnet_weights
        self.sim_type = sim_type
        self.sub_encoder_layers = sub_encoder_layers
        self.audio_feat_dim = audio_feat_dim
        self.modality_dropout = modality_dropout
        self.audio_dropout = audio_dropout
        self.modality_fuse = modality_fuse
        self.selection_type = selection_type
        self.masking_type = masking_type
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_learned_pos = decoder_learned_pos
        self.decoder_normalize_before = decoder_normalize_before
        self.no_token_positional_embeddings = no_token_positional_embeddings
        self.decoder_dropout = decoder_dropout
        self.decoder_attention_dropout = decoder_attention_dropout
        self.decoder_activation_dropout = decoder_activation_dropout
        self.max_target_positions = max_target_positions
        self.share_decoder_input_output_embed = share_decoder_input_output_embed
        self.no_scale_embedding = no_scale_embedding
        self.num_classes = num_classes
        self.feature_ds_rate = feature_ds_rate
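# Usage sketch (illustrative values only, not recommended settings): since this is a
# standard PretrainedConfig subclass, fields can be overridden at construction time and
# the config can be serialized with the usual helpers, e.g.
#   cfg = AVHubertConfig(encoder_layers=12, encoder_embed_dim=768)
#   cfg.save_pretrained("checkpoints/av_hubert")  # hypothetical output directory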


class AVSPLLMConfig(AVHubertConfig):
    """Configuration for AVSP-LLM: :class:`AVHubertConfig` plus the LLM checkpoint
    and fine-tuning options."""

    model_type = "avsp_llm"

    def __init__(
        self,
        llm_ckpt_path: str = "vilm/vinallama-2.7b",
        cache_dir: str = "models/huggingface",
        no_pretrained_weights: bool = False,
        final_dropout: float = 0.1,
        apply_mask: bool = False,
        mask_length: int = 10,
        mask_prob: float = 0.5,
        masking_updates: int = 0,
        layerdrop: float = 0.0,
        normalize: bool = False,
        data: Optional[str] = None,
        w2v_args: Optional[dict] = None,
        freeze_finetune_updates: int = 0,
        km_path: str = "model.km",
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.llm_ckpt_path = llm_ckpt_path
        self.cache_dir = cache_dir
        self.no_pretrained_weights = no_pretrained_weights
        self.final_dropout = final_dropout
        self.apply_mask = apply_mask
        self.mask_length = mask_length
        self.mask_prob = mask_prob
        self.masking_updates = masking_updates
        self.layerdrop = layerdrop
        self.normalize = normalize
        self.data = data
        self.w2v_args = w2v_args
        self.freeze_finetune_updates = freeze_finetune_updates
        self.km_path = km_path
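

if __name__ == "__main__":
    # Minimal smoke test / usage sketch (illustrative overrides, not recommended
    # settings): build both configs and round-trip one through the standard
    # PretrainedConfig dict serialization.
    encoder_config = AVHubertConfig(encoder_layers=12, encoder_embed_dim=768)
    print(encoder_config.model_type, encoder_config.encoder_layers)

    avsp_llm_config = AVSPLLMConfig(llm_ckpt_path="vilm/vinallama-2.7b")
    restored = AVSPLLMConfig.from_dict(avsp_llm_config.to_dict())
    print(restored.model_type, restored.llm_ckpt_path)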