# Training configuration for the `vjepa` app.
#
# NOTE(review): this file was recovered from a flattened copy in which every
# line had lost its indentation and carried a trailing " | |" residue. The
# nesting below was reconstructed from the key layout (keys without an inline
# value are treated as parents of the keys/items that follow them). Confirm the
# section boundaries against the code that consumes this file.

app: vjepa
# Job geometry. Presumably nodes x tasks_per_node = 128 worker processes -- confirm with the launcher.
nodes: 16
tasks_per_node: 8

data:
  dataset_type: VideoDataset
  # Placeholder paths: replace each entry with the real CSV index file.
  datasets:
    - /your_path_to_kinetics710_csv_file_index.csv
    - /your_path_to_ssv2_csv_file_index.csv
    - /your_path_to_howto100m_csv_file_index.csv
  decode_one_clip: true
  batch_size: 24  # presumably per-process rather than global -- TODO confirm
  num_clips: 1
  num_frames: 16
  tubelet_size: 2
  sampling_rate: 4
  crop_size: 224
  patch_size: 16
  pin_mem: true
  num_workers: 12
  filter_short_videos: false
  clip_duration: null

data_aug:
  auto_augment: false
  motion_shift: false
  # Two-element lists; presumably [min, max] ranges -- TODO confirm.
  random_resize_aspect_ratio:
    - 0.75
    - 1.35
  random_resize_scale:
    - 0.3
    - 1.0
  reprob: 0.0

logging:
  # Placeholder path: replace with a real absolute directory.
  folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/
  write_tag: jepa

loss:
  loss_exp: 1.0
  reg_coeff: 0.0

# Two mask specifications with identical keys; they differ only in
# num_blocks (8 vs 2) and spatial_scale (0.15 vs 0.7).
mask:
  - aspect_ratio:
      - 0.75
      - 1.5
    num_blocks: 8
    spatial_scale:
      - 0.15
      - 0.15
    temporal_scale:
      - 1.0
      - 1.0
    max_temporal_keep: 1.0
    max_keep: null
  - aspect_ratio:
      - 0.75
      - 1.5
    num_blocks: 2
    spatial_scale:
      - 0.7
      - 0.7
    temporal_scale:
      - 1.0
      - 1.0
    max_temporal_keep: 1.0
    max_keep: null

meta:
  load_checkpoint: false
  read_checkpoint: null
  seed: 234
  eval_freq: 100
  use_sdpa: true
  dtype: bfloat16

model:
  model_name: vit_large
  pred_depth: 12
  pred_embed_dim: 384
  uniform_power: true
  use_mask_tokens: true
  zero_init_mask_tokens: true

optimization:
  ipe: 300  # presumably iterations per epoch -- TODO confirm
  ipe_scale: 1.25
  clip_grad: 10.0
  weight_decay: 0.04
  final_weight_decay: 0.4
  epochs: 300
  warmup: 40
  start_lr: 0.0002
  lr: 0.000625
  final_lr: 1.0e-06
  # Two-element list; presumably [start, end] of an EMA momentum schedule -- TODO confirm.
  ema:
    - 0.998
    - 1.0