vikhyatk-moondream1.1old / configuration_moondream.py
sujitvasanth's picture
Duplicate from vikhyatk/moondream1
ac396ab verified
from transformers import PretrainedConfig
from typing import Optional
import math
class PhiConfig(PretrainedConfig):
    """Hyper-parameter container for the Phi text model (``model_type`` "phi-msft").

    Apart from the four attributes set directly in ``__init__``
    (``rotary_dim``, ``flash_attn``, ``flash_rotary``, ``fused_dense``),
    every argument is handed to ``PretrainedConfig.__init__`` as a keyword.

    Args:
        vocab_size: Requested vocabulary size. It is rounded *up* to the next
            multiple of ``pad_vocab_size_multiple`` before being passed on.
        n_positions, n_embd, n_layer, n_inner, n_head, n_head_kv,
        activation_function, attn_pdrop, embd_pdrop, resid_pdrop,
        layer_norm_epsilon, initializer_range, tie_word_embeddings,
        gradient_checkpointing: Forwarded unchanged to the base initialiser.
        rotary_dim: Stored on the instance after being capped at
            ``n_embd // n_head``.
        flash_attn, flash_rotary, fused_dense: Stored on the instance as given.
        pad_vocab_size_multiple: Granularity used to pad ``vocab_size``; also
            forwarded to the base initialiser.
        **kwargs: Any further keywords, forwarded to the base initialiser.
    """

    model_type = "phi-msft"

    # Name pairs: key -> field name used by this class.
    # Presumably consumed by PretrainedConfig's attribute aliasing so that
    # e.g. ``hidden_size`` resolves to ``n_embd`` -- confirm against the
    # installed transformers version.
    attribute_map = {
        "max_position_embeddings": "n_positions",
        "hidden_size": "n_embd",
        "num_attention_heads": "n_head",
        "num_hidden_layers": "n_layer",
    }

    def __init__(
        self,
        vocab_size: int = 51200,
        n_positions: int = 2048,
        n_embd: int = 2048,
        n_layer: int = 24,
        n_inner: Optional[int] = None,
        n_head: int = 32,
        n_head_kv: Optional[int] = None,
        rotary_dim: Optional[int] = 32,
        activation_function: Optional[str] = "gelu_new",
        flash_attn: bool = False,
        flash_rotary: bool = False,
        fused_dense: bool = False,
        attn_pdrop: float = 0.0,
        embd_pdrop: float = 0.0,
        resid_pdrop: float = 0.0,
        layer_norm_epsilon: float = 1e-5,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = False,
        pad_vocab_size_multiple: int = 64,
        gradient_checkpointing: bool = False,
        **kwargs,
    ):
        # Round the vocabulary up to a whole number of padding units.
        padding_units = math.ceil(vocab_size / pad_vocab_size_multiple)
        padded_vocab_size = padding_units * pad_vocab_size_multiple

        super().__init__(
            vocab_size=padded_vocab_size,
            n_positions=n_positions,
            n_embd=n_embd,
            n_layer=n_layer,
            n_inner=n_inner,
            n_head=n_head,
            n_head_kv=n_head_kv,
            activation_function=activation_function,
            attn_pdrop=attn_pdrop,
            embd_pdrop=embd_pdrop,
            resid_pdrop=resid_pdrop,
            layer_norm_epsilon=layer_norm_epsilon,
            initializer_range=initializer_range,
            pad_vocab_size_multiple=pad_vocab_size_multiple,
            tie_word_embeddings=tie_word_embeddings,
            gradient_checkpointing=gradient_checkpointing,
            **kwargs,
        )

        # The rotary dimension may not exceed the per-head width.
        # NOTE(review): although annotated Optional, rotary_dim=None makes
        # min() raise TypeError here -- decide what None should mean.
        per_head_width = n_embd // n_head
        self.rotary_dim = min(rotary_dim, per_head_width)

        self.flash_attn = flash_attn
        self.flash_rotary = flash_rotary
        self.fused_dense = fused_dense
class MoondreamConfig(PretrainedConfig):
    """Top-level Moondream configuration that owns a nested ``PhiConfig``.

    The nested config is exposed as ``self.phi_config``.

    Args:
        **kwargs: Keyword settings. They are forwarded both to ``PhiConfig``
            (to build the nested text-model config) and to
            ``PretrainedConfig.__init__``. An optional ``phi_config`` entry
            is consumed here and never reaches the base initialiser; it may
            be either a ready ``PhiConfig`` instance, which is used as is, or
            a mapping of ``PhiConfig`` keywords (the form a nested config
            takes after being serialised), which is overlaid on the other
            keywords.

    Raises:
        TypeError: If ``phi_config`` is given but is neither a ``PhiConfig``
            nor a mapping.
    """

    model_type = "moondream1"

    def __init__(self, **kwargs):
        # Take "phi_config" out of kwargs before the base initialiser sees it:
        # PretrainedConfig.__init__ assigns leftover keywords as attributes,
        # which would replace the PhiConfig object built below with the raw
        # serialised dict when a saved config is loaded again.
        nested = kwargs.pop("phi_config", None)

        if nested is None:
            # No nested section supplied: build it from the flat keywords,
            # exactly as before.
            self.phi_config = PhiConfig(**kwargs)
        elif isinstance(nested, PhiConfig):
            self.phi_config = nested
        else:
            # Mapping form: nested values win over same-named flat keywords.
            self.phi_config = PhiConfig(**{**kwargs, **nested})

        super().__init__(**kwargs)