sayakpaul
/

vjepa-ckpts

Model card · Files and versions · Community

sayakpaul (HF staff) committed 8 days ago

Commit

2d00d92

verified ·

1 Parent(s): 38743dc

Upload folder using huggingface_hub

Browse files

Files changed (4) hide show

vith16.pth.tar +3 -0
vith16.yaml +90 -0
vitl16.pth.tar +3 -0
vitl16.yaml +90 -0

vith16.pth.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa02687973a8337c00d999ee3a17d539a0224c4c89b862150e65cbdb2f67b61
+size 10393043714

vith16.yaml ADDED Viewed

	@@ -0,0 +1,90 @@

+app: vjepa
+nodes: 16  # 16 nodes x 8 tasks_per_node = 128 tasks in total
+tasks_per_node: 8
+data:
+  dataset_type: VideoDataset
+  datasets:  # placeholder paths; replace each with a real CSV index file
+    - /your_path_to_kinetics710_csv_file_index.csv
+    - /your_path_to_ssv2_csv_file_index.csv
+    - /your_path_to_howto100m_csv_file_index.csv
+  decode_one_clip: true
+  batch_size: 24  # NOTE(review): presumably per task, not global - confirm against the loader
+  num_clips: 1
+  num_frames: 16
+  tubelet_size: 2
+  sampling_rate: 4
+  crop_size: 224  # 224 / patch_size 16 = 14, assuming the crop is split into square patches - TODO confirm
+  patch_size: 16
+  pin_mem: true
+  num_workers: 12
+  filter_short_videos: false
+  clip_duration: null
+data_aug:
+  auto_augment: false
+  motion_shift: false
+  random_resize_aspect_ratio:  # two-element list; presumably [min, max] - confirm
+  - 0.75
+  - 1.35
+  random_resize_scale:  # two-element list; presumably [min, max] - confirm
+  - 0.3
+  - 1.0
+  reprob: 0.0
+logging:
+  folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/  # placeholder; replace with a real absolute path
+  write_tag: jepa
+loss:
+  loss_exp: 1.0
+  reg_coeff: 0.0
+mask:  # list of two mask entries that differ only in num_blocks and spatial_scale
+  - aspect_ratio:  # entry 1 of 2: num_blocks 8, spatial_scale 0.15
+      - 0.75
+      - 1.5
+    num_blocks: 8
+    spatial_scale:  # both values equal; presumably a fixed scale rather than a sampled range - confirm
+      - 0.15
+      - 0.15
+    temporal_scale:
+      - 1.0
+      - 1.0
+    max_temporal_keep: 1.0
+    max_keep: null
+  - aspect_ratio:  # entry 2 of 2: num_blocks 2, spatial_scale 0.7
+      - 0.75
+      - 1.5
+    num_blocks: 2
+    spatial_scale:  # both values equal; presumably a fixed scale rather than a sampled range - confirm
+      - 0.7
+      - 0.7
+    temporal_scale:
+      - 1.0
+      - 1.0
+    max_temporal_keep: 1.0
+    max_keep: null
+meta:
+  load_checkpoint: false  # no checkpoint path is given either: read_checkpoint below is null
+  read_checkpoint: null
+  seed: 234
+  eval_freq: 100
+  use_sdpa: true
+  dtype: bfloat16
+model:
+  model_name: vit_huge  # the only value that differs from the vitl16.yaml shown in the same commit (vit_large there)
+  pred_depth: 12
+  pred_embed_dim: 384
+  uniform_power: true
+  use_mask_tokens: true
+  zero_init_mask_tokens: true
+optimization:
+  ipe: 300
+  ipe_scale: 1.25
+  clip_grad: 10.0
+  weight_decay: 0.04  # final_weight_decay below is 10x this value
+  final_weight_decay: 0.4
+  epochs: 300
+  warmup: 40  # NOTE(review): presumably counted in epochs (out of 300) - confirm
+  start_lr: 0.0002  # start_lr < lr > final_lr: presumably warm up to lr, then decay to final_lr - confirm
+  lr: 0.000625
+  final_lr: 1.0e-06  # keep the decimal point and signed exponent: YAML 1.1 loaders need both to read a float, not a string
+  ema:  # two-element list; presumably [start, end] momentum - confirm
+  - 0.998
+  - 1.0

vitl16.pth.tar ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cd3bfce97d04891a31eac70d573ef94f423d858a6b0dc0b3ddf5f8d00b33fe4
+size 5143105394

vitl16.yaml ADDED Viewed

	@@ -0,0 +1,90 @@

+app: vjepa
+nodes: 16  # 16 nodes x 8 tasks_per_node = 128 tasks in total
+tasks_per_node: 8
+data:
+  dataset_type: VideoDataset
+  datasets:  # placeholder paths; replace each with a real CSV index file
+    - /your_path_to_kinetics710_csv_file_index.csv
+    - /your_path_to_ssv2_csv_file_index.csv
+    - /your_path_to_howto100m_csv_file_index.csv
+  decode_one_clip: true
+  batch_size: 24  # NOTE(review): presumably per task, not global - confirm against the loader
+  num_clips: 1
+  num_frames: 16
+  tubelet_size: 2
+  sampling_rate: 4
+  crop_size: 224  # 224 / patch_size 16 = 14, assuming the crop is split into square patches - TODO confirm
+  patch_size: 16
+  pin_mem: true
+  num_workers: 12
+  filter_short_videos: false
+  clip_duration: null
+data_aug:
+  auto_augment: false
+  motion_shift: false
+  random_resize_aspect_ratio:  # two-element list; presumably [min, max] - confirm
+  - 0.75
+  - 1.35
+  random_resize_scale:  # two-element list; presumably [min, max] - confirm
+  - 0.3
+  - 1.0
+  reprob: 0.0
+logging:
+  folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/  # placeholder; replace with a real absolute path
+  write_tag: jepa
+loss:
+  loss_exp: 1.0
+  reg_coeff: 0.0
+mask:  # list of two mask entries that differ only in num_blocks and spatial_scale
+  - aspect_ratio:  # entry 1 of 2: num_blocks 8, spatial_scale 0.15
+      - 0.75
+      - 1.5
+    num_blocks: 8
+    spatial_scale:  # both values equal; presumably a fixed scale rather than a sampled range - confirm
+      - 0.15
+      - 0.15
+    temporal_scale:
+      - 1.0
+      - 1.0
+    max_temporal_keep: 1.0
+    max_keep: null
+  - aspect_ratio:  # entry 2 of 2: num_blocks 2, spatial_scale 0.7
+      - 0.75
+      - 1.5
+    num_blocks: 2
+    spatial_scale:  # both values equal; presumably a fixed scale rather than a sampled range - confirm
+      - 0.7
+      - 0.7
+    temporal_scale:
+      - 1.0
+      - 1.0
+    max_temporal_keep: 1.0
+    max_keep: null
+meta:
+  load_checkpoint: false  # no checkpoint path is given either: read_checkpoint below is null
+  read_checkpoint: null
+  seed: 234
+  eval_freq: 100
+  use_sdpa: true
+  dtype: bfloat16
+model:
+  model_name: vit_large  # the only value that differs from the vith16.yaml shown in the same commit (vit_huge there)
+  pred_depth: 12
+  pred_embed_dim: 384
+  uniform_power: true
+  use_mask_tokens: true
+  zero_init_mask_tokens: true
+optimization:
+  ipe: 300
+  ipe_scale: 1.25
+  clip_grad: 10.0
+  weight_decay: 0.04  # final_weight_decay below is 10x this value
+  final_weight_decay: 0.4
+  epochs: 300
+  warmup: 40  # NOTE(review): presumably counted in epochs (out of 300) - confirm
+  start_lr: 0.0002  # start_lr < lr > final_lr: presumably warm up to lr, then decay to final_lr - confirm
+  lr: 0.000625
+  final_lr: 1.0e-06  # keep the decimal point and signed exponent: YAML 1.1 loaders need both to read a float, not a string
+  ema:  # two-element list; presumably [start, end] momentum - confirm
+  - 0.998
+  - 1.0