# Self-supervised training configuration: teacher/student networks with
# `dino` and `ibot` loss sections, plus optimizer, crop and evaluation settings.
#
# NOTE(review): this file was recovered from a flattened export in which every
# line carried a trailing "|" and all indentation had been lost. The nesting
# below is reconstructed from key order (a key with no value opens a mapping);
# confirm it against the schema the consuming code expects.

MODEL:
  WEIGHTS: ''

# Sharding strategy and mixed-precision dtypes, given separately for each
# sub-module (backbone, dino_head, ibot_head) of the teacher and the student.
# NOTE(review): the key names match PyTorch FSDP's ShardingStrategy and
# MixedPrecision fields — presumably forwarded to FSDP; confirm in the loader.
compute_precision:
  grad_scaler: true
  teacher:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
  student:
    backbone:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp16
        buffer_dtype: fp32
    # NOTE(review): the two student heads use reduce_dtype fp32 while every
    # other module in this section uses fp16 — looks deliberate; confirm.
    dino_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32
    ibot_head:
      sharding_strategy: SHARD_GRAD_OP
      mixed_precision:
        param_dtype: fp16
        reduce_dtype: fp32
        buffer_dtype: fp32

# DINO loss weight and head dimensions.
dino:
  loss_weight: 1.0
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048
  koleo_loss_weight: 0.1

# iBOT loss weight, masking parameters and head dimensions.
ibot:
  loss_weight: 1.0
  mask_sample_probability: 0.5
  # Two-element [min, max] pair, as the key name indicates.
  mask_ratio_min_max:
    - 0.1
    - 0.5
  separate_head: false
  head_n_prototypes: 65536
  head_bottleneck_dim: 256
  head_nlayers: 3
  head_hidden_dim: 2048

# Data loading, checkpointing and run-level settings.
train:
  batch_size_per_gpu: 64
  dataset_path: ImageNet:split=TRAIN
  output_dir: .
  saveckp_freq: 20
  seed: 0
  num_workers: 10
  # NOTE(review): presumably the number of iterations counted as one epoch;
  # confirm against the training loop.
  OFFICIAL_EPOCH_LENGTH: 1250
  cache_dataset: true
  centering: "centering"

# Student network architecture.
student:
  arch: vit_large
  patch_size: 16
  drop_path_rate: 0.3
  layerscale: 1.0e-05
  drop_path_uniform: true
  pretrained_weights: ''
  ffn_layer: "mlp"
  block_chunks: 0
  qkv_bias: true
  proj_bias: true
  ffn_bias: true

# Teacher momentum and temperature settings.
teacher:
  momentum_teacher: 0.992
  final_momentum_teacher: 1
  warmup_teacher_temp: 0.04
  teacher_temp: 0.07
  warmup_teacher_temp_epochs: 30

# Optimizer and schedule settings.
optim:
  epochs: 100
  weight_decay: 0.04
  weight_decay_end: 0.4
  base_lr: 0.004
  # NOTE(review): lr is zero here while base_lr and scaling_rule are set —
  # presumably a placeholder computed at runtime; confirm before relying on it.
  lr: 0.
  warmup_epochs: 10
  min_lr: 1.0e-06
  clip_grad: 3.0
  freeze_last_layer_epochs: 1
  scaling_rule: sqrt_wrt_1024
  patch_embed_lr_mult: 0.2
  layerwise_decay: 0.9
  adamw_beta1: 0.9
  adamw_beta2: 0.999

# Crop settings: scale ranges are two-element pairs; sizes are single integers.
crops:
  global_crops_scale:
    - 0.32
    - 1.0
  local_crops_number: 8
  local_crops_scale:
    - 0.05
    - 0.32
  global_crops_size: 224
  local_crops_size: 96

# Evaluation cadence, expressed in iterations (per the key name).
evaluation:
  eval_period_iterations: 12500
|