Add files using upload-large-folder tool
- config.json +0 -8
- config_molmoe.py +48 -291
- example.py +55 -0
- modeling_molmoe.py +14 -94
config.json
CHANGED
@@ -95,14 +95,6 @@
   "rope_theta": 10000.0,
   "scale_logits": false,
   "system_prompt_kind": "demo_or_style",
-  "tokenizer": {
-    "identifier": "allenai/gpt-neox-olmo-dolma-v1_5",
-    "olmo_bos_token_id": null,
-    "olmo_eos_token_id": null,
-    "tokenizer_adds_space": false,
-    "tokenizer_dir": null,
-    "truncate_direction": "right"
-  },
   "transformers_version": "4.45.0.dev0",
   "unconditioned": false,
   "use_cache": true,
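Note: with the nested "tokenizer" block gone, tokenizer settings are no longer carried in config.json; they presumably live with the checkpoint's processor/tokenizer files instead. A minimal sketch of loading the trimmed config, assuming a local checkpoint directory and transformers with trust_remote_code (the "." path and the hasattr check are illustrative assumptions, not part of this commit):

from transformers import AutoConfig

config = AutoConfig.from_pretrained(".", trust_remote_code=True)  # "." = local checkpoint dir (assumption)
print(config.rope_theta)             # 10000.0, kept from the values shown above
print(hasattr(config, "tokenizer"))  # expected False once the nested block is removed (assumption)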
config_molmoe.py
CHANGED
@@ -2,7 +2,9 @@ from __future__ import annotations
 
 import logging
 from dataclasses import asdict, dataclass, field
+from enum import Enum
 from glob import glob
+from os import PathLike
 from pathlib import Path
 from typing import (
     Any,
@@ -17,168 +19,36 @@ from typing import (
     cast,
 )
 
-import torch
 from transformers import PretrainedConfig
-
-from omegaconf import OmegaConf as om
-from omegaconf.errors import OmegaConfBaseException
-from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
-import gin
-
-#from olmo.aliases import PathOrStr
-from .aliases import PathOrStr
-#from olmo.exceptions import OLMoConfigurationError
-from .exceptions import OLMoConfigurationError
-#from olmo.util import StrEnum, resource_path
-from .util import StrEnum, resource_path
-
-#from olmo.mm_data.data_utils import build_tokenizer
-from .data_utils import build_tokenizer
-#from olmo.multimodal_preprocessor import MultiModalPreprocessor
-from .multimodal_preprocessor import MultiModalPreprocessor
-
-__all__ = [
-    "ActivationType",
-    "ActivationCheckpointingStrategy",
-    "BlockType",
-    "LayerNormType",
-    "VisionBackboneType",
-    "VisionBackboneConfig",
-    "InitFnType",
-    "ModelConfig",
-    "OptimizerType",
-    "OptimizerConfig",
-    "SchedulerType",
-    "SchedulerConfig",
-    "DataConfig",
-    "InstanceFilterConfig",
-    "EvaluatorConfig",
-    "TokenizerConfig",
-    "TrainConfig",
-    "PaddingDirection",
-    "TruncationDirection",
-    "SpeedMonitorConfig",
-    "WandbConfig",
-    "CompilerConfig",
-    "WandbConfig",
-    "FSDPPrecision",
-    "FSDPWrapStrategy",
-    "FSDPConfig",
-    "CheckpointType",
-]
+
 
 C = TypeVar("C", bound="BaseConfig")
 D = TypeVar("D", bound="DictConfig|ListConfig")
 
 
+PathOrStr = Union[str, PathLike]
+
+
+class StrEnum(str, Enum):
+    """
+    This is equivalent to Python's :class:`enum.StrEnum` since version 3.11.
+    We include this here for compatibility with older versions of Python.
+    """
+
+    def __str__(self) -> str:
+        return self.value
+
+    def __repr__(self) -> str:
+        return f"'{str(self)}'"
+
+
+
 class AttentionType(StrEnum):
     sdpa = "sdpa"
     direct = "direct"
     flash = "flash"
 
 
-class BaseConfig:
-    @classmethod
-    def _register_resolvers(cls, validate_paths: bool = True):
-        # Expands path globs into a list.
-        def path_glob(*paths) -> List[str]:
-            out = []
-            for path in paths:
-                matches = sorted(glob(path))
-                if not matches and validate_paths:
-                    raise FileNotFoundError(f"{path} does not match any files or dirs")
-                out.extend(matches)
-            return out
-
-        # Chooses the first path in the arguments that exists.
-        def path_choose(*paths) -> str:
-            from .util import is_url
-
-            for path in paths:
-                if is_url(path) or Path(path).exists():
-                    return path
-            if validate_paths:
-                raise FileNotFoundError(", ".join(paths))
-            else:
-                return ""
-
-        # Finds the latest checkpoint in a folder.
-        def path_last_checkpoint(path) -> str:
-            from .util import find_latest_checkpoint
-
-            latest_checkpoint = find_latest_checkpoint(path)
-            if latest_checkpoint is None:
-                if validate_paths:
-                    raise FileNotFoundError(f"Could not find a latest checkpoint at {path}")
-                else:
-                    return ""
-            else:
-                return str(latest_checkpoint)
-
-        om.register_new_resolver("path.glob", path_glob, replace=True)
-        om.register_new_resolver("path.choose", path_choose, replace=True)
-        om.register_new_resolver("path.last_checkpoint", path_last_checkpoint, replace=True)
-
-    @classmethod
-    def update_legacy_settings(cls, config: D) -> D:
-        """
-        Update the legacy config settings whose schemas have undergone backwards-incompatible changes.
-        """
-        return config
-
-    @classmethod
-    def new(cls: Type[C], **kwargs) -> C:
-        cls._register_resolvers()
-        conf = om.structured(cls)
-        try:
-            if kwargs:
-                conf = om.merge(conf, kwargs)
-            return cast(C, om.to_object(conf))
-        except OmegaConfBaseException as e:
-            raise OLMoConfigurationError(str(e))
-
-    @classmethod
-    def load(
-        cls: Type[C],
-        path: PathOrStr,
-        overrides: Optional[List[str]] = None,
-        key: Optional[str] = None,
-        validate_paths: bool = True,
-    ) -> C:
-        """Load from a YAML file."""
-        cls._register_resolvers(validate_paths=validate_paths)
-        schema = om.structured(cls)
-        try:
-            raw = om.load(str(path))
-
-            # Backwards compatibility hack, we need this here not in `update_legacy_settings`
-            # since it has to be applied before selecting with `key`
-            if "tokenizer" in raw and "model" in raw:
-                raw["model"]["tokenizer"] = raw.pop("tokenizer")
-
-            if key is not None:
-                raw = raw[key]  # type: ignore
-            raw = cls.update_legacy_settings(raw)
-            conf = om.merge(schema, raw)
-            if overrides:
-                conf = om.merge(conf, om.from_dotlist(overrides))
-            return cast(C, om.to_object(conf))
-        except OmegaConfBaseException as e:
-            raise OLMoConfigurationError(str(e))
-
-    def save(self, path: PathOrStr) -> None:
-        """Save to a YAML file."""
-        om.save(config=self, f=str(path))
-
-    def asdict(self, exclude: Optional[Iterable[str]] = None) -> Dict[str, Any]:
-        out = asdict(self)  # type: ignore
-        if exclude is not None:
-            for name in exclude:
-                if name in out:
-                    del out[name]
-        return out
-
-
 class LayerNormType(StrEnum):
     default = "default"
     """
@@ -290,7 +160,7 @@ class ImageProjectType(StrEnum):
 
 
 @dataclass
-class VisionBackboneConfig(BaseConfig):
+class VisionBackboneConfig:
     image_model_type: VisionBackboneType = VisionBackboneType.openai
     image_default_input_size: Tuple[int, int] = (336, 336)
     image_patch_size: int = 14
@@ -328,18 +198,7 @@ class TruncationDirection(StrEnum):
 
 
 @dataclass
-class TokenizerConfig(BaseConfig):
-    identifier: str = "gpt2"
-    truncate_direction: TruncationDirection = TruncationDirection.right
-    # Does the tokenizer automatically start input text with a space
-    tokenizer_adds_space: Optional[bool] = False
-    tokenizer_dir: Optional[str] = None  # tokenizer directory if using a seqio tokenizer
-    olmo_bos_token_id: Optional[int] = None
-    olmo_eos_token_id: Optional[int] = None
-
-
-@dataclass
-class ModelConfig(BaseConfig):
+class ModelConfig:
     """
    OLMo (model) configuration.
    """
@@ -429,11 +288,6 @@ class ModelConfig(BaseConfig):
 
    rope_impl: str = "cockatoo"
 
-    vision_backbone: Optional[VisionBackboneConfig] = None
-    """
-    Vision backbone settings for multi-modal models.
-    """
-
    vit_load_path: Optional[str] = None
    """
    Use this to load the vit model.
@@ -749,129 +603,10 @@
    Used for Gemma-2.
    """
 
-    tokenizer: TokenizerConfig = field(default_factory=TokenizerConfig)
-    """
-    Tokenizer configuration.
-    """
-
    loss_token_weighting: Optional[str] = None
 
    gin_bindings: Optional[str] = None
 
-    def get_tokenizer(self):
-        tokenizer_cfg = self.tokenizer
-        assert tokenizer_cfg.identifier.startswith("mm:")
-        kargs = {}
-        if tokenizer_cfg.identifier[3:].startswith("olmo-"):
-            kargs["olmo_bos_token_id"] = tokenizer_cfg.olmo_bos_token_id
-            kargs["olmo_eos_token_id"] = tokenizer_cfg.olmo_eos_token_id
-        return build_tokenizer(
-            tokenizer_cfg.identifier[3:],
-            adds_space=tokenizer_cfg.tokenizer_adds_space,
-            tokenizer_dir=tokenizer_cfg.tokenizer_dir,
-            pad_tokenizer_to=self.vocab_size if self.pad_tokenizer else None,
-            **kargs
-        )
-
-    def get_preprocessor(self):
-        vision_cfg = self.vision_backbone
-        h, w = self.llm_patches_per_crop()
-
-        return MultiModalPreprocessor(
-            loss_token_weighting=self.loss_token_weighting,
-            always_start_with_space=self.always_start_with_space,
-            tokenizer=self.get_tokenizer(),
-            prompt_override=self.prompt_override,
-            fix_image_input_idx=self.fix_image_input_idx,
-            prompt_templates=self.prompt_type,
-            system_prompt=self.system_prompt_kind,
-            default_inference_len=self.default_inference_len,
-            message_format=self.message_formatting,
-            unconditioned=self.unconditioned,
-            crop_mode=self.crop_mode,
-            max_crops=self.max_crops,
-            do_random_scale=self.do_random_scale,
-            base_image_input_size=vision_cfg.image_default_input_size,
-            image_patch_size=vision_cfg.image_patch_size,
-            image_token_length_h=h,
-            image_token_length_w=w,
-            use_col_tokens=self.use_col_tokens,
-            overlap_margins=self.overlap_margins,
-            image_padding_mask=self.image_padding_embed is not None
-        )
-
-    def __post_init__(self):
-        self.vit_layers = tuple(self.vit_layers)  # type: ignore[assignment]
-
-    @classmethod
-    def update_legacy_settings(cls, config: D) -> D:
-        """
-        Update the legacy config settings whose schemas have undergone backwards-incompatible changes.
-        """
-        if "flash_attention" in config:
-            is_flash = config.flash_attention
-            del config.flash_attention
-            config.attention_type = AttentionType.flash if is_flash else AttentionType.sdpa
-
-        if "bos_token_id" in config:
-            config.tokenizer.olmo_bos_token_id = config.pop("bos_token_id")
-            config.tokenizer.olmo_eos_token_id = config.pop("eos_token_id")
-
-        if "image_padding_mask" in config:
-            assert not config["image_padding_mask"]
-            del config["image_padding_mask"]
-            config["image_padding_embed"] = None
-        elif "image_padding_embed" not in config:
-            config["image_padding_embed"] = None
-        return config
-
-    @property
-    def effective_n_kv_heads(self) -> int:
-        if self.n_kv_heads is None:
-            if self.multi_query_attention is True:
-                return 1
-            else:
-                return self.n_heads
-        else:
-            if self.multi_query_attention is None:
-                return self.n_kv_heads
-            if self.multi_query_attention:
-                n_kv_heads_should_be = 1
-            else:
-                n_kv_heads_should_be = self.n_heads
-            if self.n_kv_heads == n_kv_heads_should_be:
-                return n_kv_heads_should_be
-            else:
-                raise OLMoConfigurationError(
-                    "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
-                )
-
-    @property
-    def image_num_patch(self):
-        assert self.vision_backbone is not None
-        return self.vision_backbone.image_num_patch
-
-    @property
-    def image_patch_size(self):
-        assert self.vision_backbone is not None
-        return self.vision_backbone.image_patch_size
-
-    def llm_patches_per_crop(self):
-        h, w = self.image_num_patch
-        # Round up in case we need to pad the image features for pooling
-        h = (h + self.image_pooling_h - 1) // self.image_pooling_h
-        w = (w + self.image_pooling_w - 1) // self.image_pooling_w
-        return h, w
-
-    def get_max_crops(self) -> int:
-        """Max number of crops that can be built for one image"""
-        if self.crop_mode == "resize":
-            return 1
-        elif "resize" in self.crop_mode:
-            return 1 + self.max_crops
-        else:
-            return self.max_crops
-
 
 class MolmoConfig(PretrainedConfig):
     model_type = "molmo"
@@ -879,7 +614,7 @@ class MolmoConfig(PretrainedConfig):
 
     def __init__(self, use_cache: bool = False, **kwargs):
         model_config = ModelConfig()
-        all_kwargs = model_config.asdict()
+        all_kwargs = asdict(model_config)
         all_kwargs.update(kwargs)
         all_kwargs.update({"use_cache": use_cache})
         all_kwargs.update(
@@ -901,8 +636,8 @@ class MolmoConfig(PretrainedConfig):
 
     @property
     def image_num_patch(self):
-
-        return
+        h, w = (336, 336)
+        return h // 14, w // 14
 
     @property
     def llm_patches_per_crop(self):
@@ -910,4 +645,26 @@ class MolmoConfig(PretrainedConfig):
         # Round up in case we need to pad the image features for pooling
         h = (h + self.image_pooling_h - 1) // self.image_pooling_h
         w = (w + self.image_pooling_w - 1) // self.image_pooling_w
-        return h, w
+        return h, w
+
+    @property
+    def effective_n_kv_heads(self) -> int:
+        if self.n_kv_heads is None:
+            if self.multi_query_attention is True:
+                return 1
+            else:
+                return self.n_heads
+        else:
+            if self.multi_query_attention is None:
+                return self.n_kv_heads
+            if self.multi_query_attention:
+                n_kv_heads_should_be = 1
+            else:
+                n_kv_heads_should_be = self.n_heads
+            if self.n_kv_heads == n_kv_heads_should_be:
+                return n_kv_heads_should_be
+            else:
+                raise ValueError(
+                    "You can't set `multi_query_attention` and `n_kv_heads` at the same time."
+                )
+
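Taken together, these changes make config_molmoe.py standalone: the OmegaConf-backed BaseConfig, TokenizerConfig, and the training-time helpers are dropped, StrEnum and PathOrStr are defined locally, and MolmoConfig now seeds its keyword arguments from the plain ModelConfig dataclass via dataclasses.asdict. A minimal sketch of that initialization pattern, using stand-in class names rather than the repo's own (illustrative only, not the actual implementation):

from dataclasses import dataclass, asdict

@dataclass
class ExampleModelConfig:      # stand-in for the ModelConfig dataclass above
    d_model: int = 2048
    n_heads: int = 16
    rope_theta: float = 10000.0

class ExampleHFConfig:         # stand-in for MolmoConfig(PretrainedConfig)
    def __init__(self, use_cache: bool = False, **kwargs):
        all_kwargs = asdict(ExampleModelConfig())   # dataclass defaults first...
        all_kwargs.update(kwargs)                   # ...then overrides from config.json / kwargs
        all_kwargs.update({"use_cache": use_cache})
        for k, v in all_kwargs.items():
            setattr(self, k, v)

cfg = ExampleHFConfig(n_heads=32)
print(cfg.d_model, cfg.n_heads, cfg.use_cache)      # 2048 32 False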
example.py
ADDED
@@ -0,0 +1,55 @@
+from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig
+from PIL import Image
+import requests
+
+
+def main():
+    load_path = "."
+
+    # load the processor
+    print("Loading processor")
+    processor = AutoProcessor.from_pretrained(
+        load_path,
+        trust_remote_code=True,
+        torch_dtype='auto',
+        device_map='auto'
+    )
+
+    # load the model
+    print("Loading model")
+    model = AutoModelForCausalLM.from_pretrained(
+        load_path,
+        trust_remote_code=True,
+        torch_dtype='auto',
+        device_map='auto'
+    )
+
+    # process the image and text
+    print("Processing...")
+    inputs = processor.process(
+        images=[Image.open(requests.get("https://picsum.photos/id/237/536/354", stream=True).raw)],
+        text="Describe this image."
+    )
+
+    # move inputs to the correct device and make a batch of size 1
+    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+
+    # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
+    print("Generating....")
+    output = model.generate_from_batch(
+        inputs,
+        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+        tokenizer=processor.tokenizer
+    )
+
+    # only get generated tokens; decode them to text
+    generated_tokens = output[0, inputs['input_ids'].size(1):]
+    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+
+    # print the generated text
+    print(generated_text)
+
+
+
+if __name__ == '__main__':
+    main()
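The new example.py reads everything from load_path = ".", i.e. it is meant to be run from inside the checkpoint directory (for instance with python example.py). A hedged variant that points at a Hub repository instead; the repo ID below is a placeholder, not something this commit defines:

from transformers import AutoProcessor, AutoModelForCausalLM

repo_id = "<org>/<molmo-checkpoint>"   # placeholder Hub ID, substitute the real repository
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True, torch_dtype="auto", device_map="auto")
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True, torch_dtype="auto", device_map="auto")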
modeling_molmoe.py
CHANGED
@@ -27,7 +27,7 @@ from typing import (
     Set,
     Tuple,
     cast,
-    Union,
+    Union, Any,
 )
 from copy import deepcopy
 import torch
@@ -36,17 +36,10 @@ import torch.nn as nn
 import torch.nn.functional as F
 from torch import einsum
 import einops
-from transformers import PreTrainedModel
+from transformers import PreTrainedModel, GenerationConfig, Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from .
-from .beam_search import (
-    BeamSearch,
-    Constraint,
-    FinalSequenceScorer,
-    Sampler
-)
-from .config import (
+from .config_molmoe import (
     ActivationType,
     BlockType,
     LayerNormType,
@@ -56,10 +49,10 @@ from .config import (
     AttentionType,
 )
 
-
+
 from .config_molmoe import (
     MolmoConfig,
-    VisionBackboneConfig
+    VisionBackboneConfig, ModelConfig
 )
 
 if sys.version_info.minor > 8:
@@ -69,26 +62,14 @@ elif sys.version_info.minor == 8:
 else:
     raise SystemExit("This script supports Python 3.8 or higher")
 
-__all__ = [
-    "LayerNormBase",
-    "LayerNorm",
-    "RMSLayerNorm",
-    "RotaryEmbedding",
-    "Activation",
-    "GELU",
-    "ReLU",
-    "SwiGLU",
-    "OLMoBlock",
-    "OLMoSequentialBlock",
-    "OLMo",
-    "OLMoOutput",
-    "OLMoGenerateOutput",
-]
-
 
 log = logging.getLogger(__name__)
 
 
+class OLMoConfigurationError(Exception):
+    pass
+
+
 def activation_checkpoint_function(cfg: ModelConfig):
     preserve_rng_state = not (
         (cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and
@@ -114,20 +95,6 @@ def ensure_finite_(x: torch.Tensor, check_neg_inf: bool = True, check_pos_inf: b
         x.masked_fill_(x == float("inf"), torch.finfo(x.dtype).max)
 
 
-def activation_checkpoint_function(cfg: MolmoConfig):
-    preserve_rng_state = not (
-        (cfg.attention_dropout == 0.0) and (cfg.embedding_dropout == 0.0) and
-        (cfg.residual_dropout == 0.0) and (cfg.response_residual_dropout == 0.0)
-    )
-    from torch.utils.checkpoint import checkpoint
-
-    return partial(
-        checkpoint,
-        preserve_rng_state=True,
-        use_reentrant=False,
-    )
-
-
 def vit_activation_checkpoint_function(cfg: MolmoConfig):
     v_cfg = cfg.vision_backbone
     preserve_rng_state = (
@@ -142,22 +109,6 @@ def vit_activation_checkpoint_function(cfg: MolmoConfig):
     )
 
 
-def should_checkpoint_block(strategy: Optional[ActivationCheckpointingStrategy], block_idx: int) -> bool:
-    if strategy is None:
-        return False
-    elif (
-        (strategy == ActivationCheckpointingStrategy.whole_layer)
-        or (strategy == ActivationCheckpointingStrategy.one_in_two and block_idx % 2 == 0)
-        or (strategy == ActivationCheckpointingStrategy.one_in_three and block_idx % 3 == 0)
-        or (strategy == ActivationCheckpointingStrategy.one_in_four and block_idx % 4 == 0)
-        or (strategy == ActivationCheckpointingStrategy.two_in_three and block_idx % 3 != 0)
-        or (strategy == ActivationCheckpointingStrategy.three_in_four and block_idx % 4 != 0)
-    ):
-        return True
-    else:
-        return False
-
-
 class BufferCache(dict, MutableMapping[str, torch.Tensor]):
     """
     Cache for attention biases and other things that would normally be stored as buffers.
@@ -1557,15 +1508,11 @@ class MolmoVisionBackbone(nn.Module):
         self.image_feature_dropout = Dropout(config.image_feature_dropout)
 
     @classmethod
-    def build(cls, config: MolmoConfig)
+    def build(cls, config: MolmoConfig):
         v_cfg = config.vision_backbone
         assert v_cfg is not None
         return MolmoPretrainedVisionBackbone(config)
 
-    @abstractmethod
-    def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
-        raise NotImplementedError()
-
     def reset_parameters(self):
         if self.image_pooling_2d is not None:
             self.image_pooling_2d.reset_parameters()
@@ -1583,9 +1530,9 @@ class MolmoVisionBackbone(nn.Module):
 
 
 class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
-    def __init__(self, config:
+    def __init__(self, config: MolmoConfig):
         super().__init__(config)
-        v_cfg = config.vision_backbone
+        v_cfg = VisionBackboneConfig()
 
         if v_cfg.image_model_type == VisionBackboneType.openai:
             self.image_vit = VisionTransformer(config)
@@ -1640,11 +1587,6 @@ class MolmoPretrainedVisionBackbone(MolmoVisionBackbone):
         if self.config.use_cls_feature:
             nn.init.xavier_uniform_(self.cls_projector.weight)
 
-    def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
-        self.grad_checkpointing = True
-        if strategy in (ActivationCheckpointingStrategy.whole_layer, ActivationCheckpointingStrategy.vit_only):
-            self.image_vit.set_grad_checkpointing()
-
     def encode_image(self, images: torch.Tensor) -> torch.Tensor:
         """
         : param images: (batch_size, num_crops, num_patch, n_pixels)
@@ -1802,9 +1744,6 @@ class MolmoModel(MolmoPretrainedModel):
                 "Embedding size is not a multiple of 128! This could hurt throughput performance.", UserWarning
             )
 
-        self.activation_checkpointing_strategy: Optional[ActivationCheckpointingStrategy] = None
-        self._activation_checkpoint_fn: Callable = activation_checkpoint_function(self.config)
-
         if not (
             0 < self.config.block_group_size <= self.config.n_layers
             and self.config.n_layers % self.config.block_group_size == 0
@@ -1846,25 +1785,14 @@ class MolmoModel(MolmoPretrainedModel):
         ]
         self.transformer.update({"blocks": nn.ModuleList(layers)})
 
-        self.vision_backbone: Optional[
+        self.vision_backbone: Optional[MolmoVisionBackbone] = None
         if config.vision_backbone is not None:
             self.vision_backbone = MolmoVisionBackbone.build(config)
 
         if self.vision_backbone is not None:
             self.vision_backbone.reset_with_pretrained_weights()
 
-    def set_activation_checkpointing(self, strategy: Optional[ActivationCheckpointingStrategy]):
-        self.activation_checkpointing_strategy = strategy
-        if self.config.block_group_size != 1:
-            for block_group in self.transformer.block_groups:
-                block_group.set_activation_checkpointing(strategy)
-        else:
-            for block in self.transformer.blocks:
-                block.set_activation_checkpointing(strategy)
 
-        if self.vision_backbone is not None:
-            self.vision_backbone.set_activation_checkpointing(strategy)
-
     @property
     def device(self) -> torch.device:
         device: torch.device = self.transformer.wte.weight.device  # type: ignore
@@ -1873,7 +1801,6 @@ class MolmoModel(MolmoPretrainedModel):
         else:
             return device
 
-
     def forward(
         self,
         input_ids: torch.LongTensor,
@@ -2069,14 +1996,7 @@ class MolmoModel(MolmoPretrainedModel):
                 all_hidden_states.append(x)
 
             layer_past = None if past_key_values is None else past_key_values[block_idx]
-            if should_checkpoint_block(self.activation_checkpointing_strategy, block_idx):
-                # shape: (batch_size, seq_len, d_model)
-                x, cache = self._activation_checkpoint_fn(
-                    layer, x, attention_bias=attention_bias, position_ids=position_ids, drop_mask=response_mask, layer_past=layer_past, use_cache=use_cache
-                )
-            else:
-                # shape: (batch_size, seq_len, d_model)
-                x, cache = layer(x, attention_bias=attention_bias, position_ids=position_ids, drop_mask=response_mask, layer_past=layer_past, use_cache=use_cache)
+            x, cache = layer(x, attention_bias=attention_bias, position_ids=position_ids, drop_mask=response_mask, layer_past=layer_past, use_cache=use_cache)
 
             if attn_key_values is not None:
                 assert cache is not None
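The modeling diff is an inference-only cleanup: the beam-search imports, the __all__ list, the duplicate activation_checkpoint_function, should_checkpoint_block, and the set_activation_checkpointing hooks are removed, and the block loop now always calls each layer directly. For reference, a self-contained sketch of the checkpoint-selection policy that was deleted (the enum below is a stand-in for the original ActivationCheckpointingStrategy, not an import from this repo):

from enum import Enum
from typing import Optional

class CheckpointStrategy(str, Enum):   # stand-in for ActivationCheckpointingStrategy
    whole_layer = "whole_layer"
    one_in_two = "one_in_two"
    one_in_three = "one_in_three"
    one_in_four = "one_in_four"
    two_in_three = "two_in_three"
    three_in_four = "three_in_four"

def should_checkpoint_block(strategy: Optional[CheckpointStrategy], block_idx: int) -> bool:
    # Checkpoint every block, every n-th block, or all but every n-th block, as in the removed helper.
    if strategy is None:
        return False
    return (
        strategy == CheckpointStrategy.whole_layer
        or (strategy == CheckpointStrategy.one_in_two and block_idx % 2 == 0)
        or (strategy == CheckpointStrategy.one_in_three and block_idx % 3 == 0)
        or (strategy == CheckpointStrategy.one_in_four and block_idx % 4 == 0)
        or (strategy == CheckpointStrategy.two_in_three and block_idx % 3 != 0)
        or (strategy == CheckpointStrategy.three_in_four and block_idx % 4 != 0)
    )

print([should_checkpoint_block(CheckpointStrategy.one_in_two, i) for i in range(4)])  # [True, False, True, False]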