Adding Viper models
Browse files
viper/glip/checkpoints/glip_large_model.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dcb0178264b3193901a11560aa7f31b05821a5bf15225eeb0eeebbedaaa27791
|
3 |
+
size 6896153761
|
viper/glip/configs/glip_Swin_L.yaml
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MODEL:
|
2 |
+
META_ARCHITECTURE: "GeneralizedVLRCNN"
|
3 |
+
WEIGHT: "swin_large_patch4_window12_384_22k.pth"
|
4 |
+
RPN_ONLY: True
|
5 |
+
RPN_ARCHITECTURE: "VLDYHEAD"
|
6 |
+
|
7 |
+
BACKBONE:
|
8 |
+
CONV_BODY: "SWINT-FPN-RETINANET"
|
9 |
+
OUT_CHANNELS: 256
|
10 |
+
|
11 |
+
SWINT:
|
12 |
+
EMBED_DIM: 192
|
13 |
+
DEPTHS: (2, 2, 18, 2)
|
14 |
+
NUM_HEADS: (6, 12, 24, 48)
|
15 |
+
WINDOW_SIZE: 12
|
16 |
+
OUT_CHANNELS: (192, 384, 768, 1536)
|
17 |
+
DROP_PATH_RATE: 0.4
|
18 |
+
|
19 |
+
LANGUAGE_BACKBONE:
|
20 |
+
FREEZE: False
|
21 |
+
MODEL_TYPE: "bert-base-uncased" # other options: "roberta-base", "clip"
|
22 |
+
MASK_SPECIAL: False
|
23 |
+
|
24 |
+
RPN:
|
25 |
+
USE_FPN: True
|
26 |
+
ANCHOR_SIZES: (64, 128, 256, 512, 1024)
|
27 |
+
ANCHOR_STRIDE: (8, 16, 32, 64, 128)
|
28 |
+
ASPECT_RATIOS: (1.0,)
|
29 |
+
SCALES_PER_OCTAVE: 1
|
30 |
+
|
31 |
+
DYHEAD:
|
32 |
+
CHANNELS: 256
|
33 |
+
NUM_CONVS: 8
|
34 |
+
USE_GN: True
|
35 |
+
USE_DYRELU: True
|
36 |
+
USE_DFCONV: True
|
37 |
+
USE_DYFUSE: True
|
38 |
+
TOPK: 9 # topk for selecting candidate positive samples from each level
|
39 |
+
SCORE_AGG: "MEAN"
|
40 |
+
LOG_SCALE: 0.0
|
41 |
+
|
42 |
+
USE_CHECKPOINT: True
|
43 |
+
FUSE_CONFIG:
|
44 |
+
USE_FUSED_FEATURES_DOT_PRODUCT: True
|
45 |
+
EARLY_FUSE_ON: True
|
46 |
+
TYPE: "MHA-B"
|
47 |
+
USE_CLASSIFICATION_LOSS: False
|
48 |
+
USE_TOKEN_LOSS: False
|
49 |
+
USE_CONTRASTIVE_ALIGN_LOSS: False
|
50 |
+
CONTRASTIVE_HIDDEN_DIM: 64
|
51 |
+
USE_DOT_PRODUCT_TOKEN_LOSS: True
|
52 |
+
USE_LAYER_SCALE: True
|
53 |
+
CLAMP_MIN_FOR_UNDERFLOW: True
|
54 |
+
CLAMP_MAX_FOR_OVERFLOW: True
|
55 |
+
CLAMP_BERTATTN_MIN_FOR_UNDERFLOW: True
|
56 |
+
CLAMP_BERTATTN_MAX_FOR_OVERFLOW: True
|
57 |
+
CLAMP_DOT_PRODUCT: True
|
58 |
+
|
59 |
+
DATASETS:
|
60 |
+
|
61 |
+
TRAIN: ("mixed_train_no_coco",) # Placeholder dataset for now; to be updated in the next version.
|
62 |
+
TEST: ("coco_2017_val", )
|
63 |
+
|
64 |
+
ONE_HOT: False
|
65 |
+
FLICKR_COPY: 8 # 0.15 * 8 = ~1.2M
|
66 |
+
MIXED_COPY: 4 # 0.6 * 4 = ~2.4M
|
67 |
+
OBJECT365_COPY: 2 # 1.4 * 2 = ~2.8M
|
68 |
+
VG_COPY: 3 # 0.4 * 3 = ~1.2M
|
69 |
+
IN_COPY: 2 # 0.67 * 2 = ~1.33M
|
70 |
+
OI_COPY: 1 # 2.0 * 1 = ~2M
|
71 |
+
|
72 |
+
DISABLE_SHUFFLE: False
|
73 |
+
ADD_DET_PROMPT: False
|
74 |
+
RANDOM_SAMPLE_NEG: 85
|
75 |
+
CONTROL_PROB: (0.0, 0.0, 0.5, 0.0)
|
76 |
+
FURTHER_SCREEN: True
|
77 |
+
CAPTION_CONF: 0.5
|
78 |
+
CAPTION_NMS: -1.0
|
79 |
+
CAPTION_MIN_BOX: 1
|
80 |
+
|
81 |
+
SEPARATION_TOKENS: ". "
|
82 |
+
|
83 |
+
PACK_RANDOM_CAPTION_NUMBER: 20
|
84 |
+
NO_RANDOM_PACK_PROBABILITY: 0.4
|
85 |
+
RANDOM_PACK_PROB: 0.5
|
86 |
+
CAPTION_FORMAT_VERSION: "v2"
|
87 |
+
|
88 |
+
INPUT:
|
89 |
+
PIXEL_MEAN: [ 103.530, 116.280, 123.675 ]
|
90 |
+
PIXEL_STD: [ 57.375, 57.120, 58.395 ]
|
91 |
+
MIN_SIZE_TRAIN: 800
|
92 |
+
MAX_SIZE_TRAIN: 1333
|
93 |
+
MIN_SIZE_TEST: 800
|
94 |
+
MAX_SIZE_TEST: 1333
|
95 |
+
|
96 |
+
AUGMENT:
|
97 |
+
MULT_MIN_SIZE_TRAIN: (480,560,640,720,800)
|
98 |
+
|
99 |
+
DATALOADER:
|
100 |
+
SIZE_DIVISIBILITY: 32
|
101 |
+
|
102 |
+
SOLVER:
|
103 |
+
OPTIMIZER: ADAMW
|
104 |
+
BASE_LR: 0.0001
|
105 |
+
LANG_LR: 0.00001
|
106 |
+
WEIGHT_DECAY: 0.01
|
107 |
+
WEIGHT_DECAY_SCHEDULE: True
|
108 |
+
STEPS: (0.67, 0.89)
|
109 |
+
MAX_ITER: 1000000
|
110 |
+
IMS_PER_BATCH: 64
|
111 |
+
WARMUP_ITERS: 2000
|
112 |
+
WARMUP_FACTOR: 0.001
|
113 |
+
|
114 |
+
FIND_UNUSED_PARAMETERS: False
|
115 |
+
|
116 |
+
CLIP_GRADIENTS:
|
117 |
+
ENABLED: True
|
118 |
+
CLIP_TYPE: "full_model"
|
119 |
+
CLIP_VALUE: 1.0
|
120 |
+
NORM_TYPE: 2.0
|
viper/xvlm/retrieval_mscoco_checkpoint_9.pth
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:aa979bde985e2b91ce58e33a385ead52cc8edc249cc68e4f83e7bf3878effd1d
|
3 |
+
size 869714238
|