sayakpaul HF staff committed on
Commit
2d00d92
·
verified ·
1 Parent(s): 38743dc

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. vith16.pth.tar +3 -0
  2. vith16.yaml +90 -0
  3. vitl16.pth.tar +3 -0
  4. vitl16.yaml +90 -0
vith16.pth.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6fa02687973a8337c00d999ee3a17d539a0224c4c89b862150e65cbdb2f67b61
3
+ size 10393043714
vith16.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ app: vjepa
2
+ nodes: 16
3
+ tasks_per_node: 8
4
+ data:
5
+ dataset_type: VideoDataset
6
+ datasets:
7
+ - /your_path_to_kinetics710_csv_file_index.csv
8
+ - /your_path_to_ssv2_csv_file_index.csv
9
+ - /your_path_to_howto100m_csv_file_index.csv
10
+ decode_one_clip: true
11
+ batch_size: 24
12
+ num_clips: 1
13
+ num_frames: 16
14
+ tubelet_size: 2
15
+ sampling_rate: 4
16
+ crop_size: 224
17
+ patch_size: 16
18
+ pin_mem: true
19
+ num_workers: 12
20
+ filter_short_videos: false
21
+ clip_duration: null
22
+ data_aug:
23
+ auto_augment: false
24
+ motion_shift: false
25
+ random_resize_aspect_ratio:
26
+ - 0.75
27
+ - 1.35
28
+ random_resize_scale:
29
+ - 0.3
30
+ - 1.0
31
+ reprob: 0.0
32
+ logging:
33
+ folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/
34
+ write_tag: jepa
35
+ loss:
36
+ loss_exp: 1.0
37
+ reg_coeff: 0.0
38
+ mask:
39
+ - aspect_ratio:
40
+ - 0.75
41
+ - 1.5
42
+ num_blocks: 8
43
+ spatial_scale:
44
+ - 0.15
45
+ - 0.15
46
+ temporal_scale:
47
+ - 1.0
48
+ - 1.0
49
+ max_temporal_keep: 1.0
50
+ max_keep: null
51
+ - aspect_ratio:
52
+ - 0.75
53
+ - 1.5
54
+ num_blocks: 2
55
+ spatial_scale:
56
+ - 0.7
57
+ - 0.7
58
+ temporal_scale:
59
+ - 1.0
60
+ - 1.0
61
+ max_temporal_keep: 1.0
62
+ max_keep: null
63
+ meta:
64
+ load_checkpoint: false
65
+ read_checkpoint: null
66
+ seed: 234
67
+ eval_freq: 100
68
+ use_sdpa: true
69
+ dtype: bfloat16
70
+ model:
71
+ model_name: vit_huge
72
+ pred_depth: 12
73
+ pred_embed_dim: 384
74
+ uniform_power: true
75
+ use_mask_tokens: true
76
+ zero_init_mask_tokens: true
77
+ optimization:
78
+ ipe: 300
79
+ ipe_scale: 1.25
80
+ clip_grad: 10.0
81
+ weight_decay: 0.04
82
+ final_weight_decay: 0.4
83
+ epochs: 300
84
+ warmup: 40
85
+ start_lr: 0.0002
86
+ lr: 0.000625
87
+ final_lr: 1.0e-06
88
+ ema:
89
+ - 0.998
90
+ - 1.0
vitl16.pth.tar ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cd3bfce97d04891a31eac70d573ef94f423d858a6b0dc0b3ddf5f8d00b33fe4
3
+ size 5143105394
vitl16.yaml ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ app: vjepa
2
+ nodes: 16
3
+ tasks_per_node: 8
4
+ data:
5
+ dataset_type: VideoDataset
6
+ datasets:
7
+ - /your_path_to_kinetics710_csv_file_index.csv
8
+ - /your_path_to_ssv2_csv_file_index.csv
9
+ - /your_path_to_howto100m_csv_file_index.csv
10
+ decode_one_clip: true
11
+ batch_size: 24
12
+ num_clips: 1
13
+ num_frames: 16
14
+ tubelet_size: 2
15
+ sampling_rate: 4
16
+ crop_size: 224
17
+ patch_size: 16
18
+ pin_mem: true
19
+ num_workers: 12
20
+ filter_short_videos: false
21
+ clip_duration: null
22
+ data_aug:
23
+ auto_augment: false
24
+ motion_shift: false
25
+ random_resize_aspect_ratio:
26
+ - 0.75
27
+ - 1.35
28
+ random_resize_scale:
29
+ - 0.3
30
+ - 1.0
31
+ reprob: 0.0
32
+ logging:
33
+ folder: /your_absolute_file_path_for_saving_logs_and_checkpoints/
34
+ write_tag: jepa
35
+ loss:
36
+ loss_exp: 1.0
37
+ reg_coeff: 0.0
38
+ mask:
39
+ - aspect_ratio:
40
+ - 0.75
41
+ - 1.5
42
+ num_blocks: 8
43
+ spatial_scale:
44
+ - 0.15
45
+ - 0.15
46
+ temporal_scale:
47
+ - 1.0
48
+ - 1.0
49
+ max_temporal_keep: 1.0
50
+ max_keep: null
51
+ - aspect_ratio:
52
+ - 0.75
53
+ - 1.5
54
+ num_blocks: 2
55
+ spatial_scale:
56
+ - 0.7
57
+ - 0.7
58
+ temporal_scale:
59
+ - 1.0
60
+ - 1.0
61
+ max_temporal_keep: 1.0
62
+ max_keep: null
63
+ meta:
64
+ load_checkpoint: false
65
+ read_checkpoint: null
66
+ seed: 234
67
+ eval_freq: 100
68
+ use_sdpa: true
69
+ dtype: bfloat16
70
+ model:
71
+ model_name: vit_large
72
+ pred_depth: 12
73
+ pred_embed_dim: 384
74
+ uniform_power: true
75
+ use_mask_tokens: true
76
+ zero_init_mask_tokens: true
77
+ optimization:
78
+ ipe: 300
79
+ ipe_scale: 1.25
80
+ clip_grad: 10.0
81
+ weight_decay: 0.04
82
+ final_weight_decay: 0.4
83
+ epochs: 300
84
+ warmup: 40
85
+ start_lr: 0.0002
86
+ lr: 0.000625
87
+ final_lr: 1.0e-06
88
+ ema:
89
+ - 0.998
90
+ - 1.0