{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "scooter", "1": "behind", "2": "player", "3": "branches", "4": "fence", "5": "spoon", "6": "wood", "7": "brick", "8": "harbor", "9": "lettuce", "10": "van", "11": "sword", "12": "bus", "13": "black", "14": "backpack", "15": "apple", "16": "light blue", "17": "napkin", "18": "bushes", "19": "cabinet", "20": "doll", "21": "tractor", "22": "purple", "23": "skiing", "24": "outdoors", "25": "people", "26": "drawers", "27": "yes", "28": "man", "29": "street", "30": "hot dog", "31": "small", "32": "desk", "33": "goggles", "34": "giraffe", "35": "walkway", "36": "remote control", "37": "sweater", "38": "no", "39": "chair", "40": "refrigerator", "41": "child", "42": "orange", "43": "building", "44": "wetsuit", "45": "train", "46": "passengers", "47": "catcher", "48": "skis", "49": "color", "50": "toilet", "51": "coffee table", "52": "bricks", "53": "woman", "54": "short", "55": "cat", "56": "cap", "57": "dress", "58": "wall", "59": "hat", "60": "sauce", "61": "bed", "62": "standing", "63": "green", "64": "indoors", "65": "parrot", "66": "shelves", "67": "boy", "68": "right", "69": "console", "70": "statue", "71": "sandwich", "72": "pizza", "73": "broccoli", "74": "street light", "75": "table", "76": "bull", "77": "shore", "78": "onions", "79": "left", "80": "book" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "apple": 15, "backpack": 14, "bed": 61, "behind": 1, "black": 13, "book": 80, "boy": 67, "branches": 3, "brick": 7, "bricks": 52, "broccoli": 73, "building": 43, "bull": 76, "bus": 12, "bushes": 18, "cabinet": 19, "cap": 56, "cat": 55, "catcher": 47, "chair": 39, "child": 41, "coffee table": 51, "color": 49, "console": 69, "desk": 32, "doll": 20, "drawers": 26, "dress": 57, "fence": 4, "giraffe": 34, "goggles": 33, "green": 63, "harbor": 8, "hat": 59, "hot dog": 30, "indoors": 64, "left": 79, "lettuce": 9, "light blue": 16, "man": 28, "napkin": 17, "no": 38, "onions": 78, "orange": 42, "outdoors": 24, "parrot": 65, "passengers": 46, "people": 25, "pizza": 72, "player": 2, "purple": 22, "refrigerator": 40, "remote control": 36, "right": 68, "sandwich": 71, "sauce": 60, "scooter": 0, "shelves": 66, "shore": 77, "short": 54, "skiing": 23, "skis": 48, "small": 31, "spoon": 5, "standing": 62, "statue": 70, "street": 29, "street light": 74, "sweater": 37, "sword": 11, "table": 75, "toilet": 50, "tractor": 21, "train": 45, "van": 10, "walkway": 35, "wall": 58, "wetsuit": 44, "woman": 53, "wood": 6, "yes": 27 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.34.1", "type_vocab_size": 2, "vocab_size": 30522 }