Commit bc8b948
Manli committed
1 Parent(s): 66fbbf4

update model weights to bf16

config.json CHANGED
@@ -13,8 +13,8 @@
     "sliding_window": 2047,
     "torch_dtype": "bfloat16"
   },
-  "torch_dtype": "float32",
-  "transformers_version": "4.41.1",
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
   "vision_encoder_config": {
     "anyres_patch_sampling": true,
     "image_aspect_ratio": "anyres",
generation_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
   "bos_token_id": 1,
-  "eos_token_id": 32000,
+  "eos_token_id": 32007,
   "pad_token_id": 32000,
-  "transformers_version": "4.41.1"
+  "transformers_version": "4.44.2"
 }
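Together with the tokenizer-side edits further down (eos_token becomes <|end|>), these two config changes mean a plain from_pretrained now returns a bfloat16 model whose generation stops at the Phi-3 turn terminator <|end|> (id 32007) instead of <|endoftext|> (id 32000). A minimal loading sketch; the repo id and the AutoModelForVision2Seq entry point are assumptions, not part of this commit, so substitute whatever the checkpoint's model card documents:

    import torch
    from transformers import AutoModelForVision2Seq, AutoTokenizer

    # Assumption: repo id of this checkpoint; replace with the actual path.
    model_id = "Salesforce/xgen-mm-phi3-mini-instruct-r-v1"

    # torch_dtype="auto" follows config.json, which this commit flips from float32 to bfloat16.
    model = AutoModelForVision2Seq.from_pretrained(
        model_id, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    assert model.dtype == torch.bfloat16
    # generation_config.json now points eos at <|end|> (32007), the token the chat
    # template emits after every turn, rather than <|endoftext|> (32000).
    assert model.generation_config.eos_token_id == 32007
    assert tokenizer.eos_token == "<|end|>"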
model-00001-of-00004.safetensors → model-00001-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2394080dd0bac25f59461eba060e937989ab2336edffb415db3a36d7f3fc371e
-size 4962660968
+oid sha256:21e0452442b5c189d7f0a1078d243a4ad705036e12703f25f81f0711ae478d70
+size 4972926984
model-00002-of-00004.safetensors → model-00002-of-00002.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d63cfd9676a01932ea6674b0103a88a3863c3c8028b66d6716d4bd36945b66ed
-size 4983112136
+oid sha256:9a5e8bd3fbe75d20605d6760268337170a544f04bc4dfac00c2cba65981d7deb
+size 3745680670
model-00003-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cf6be012e8534481a9ca4e8dd8f3e482de42af52cb86b23c463b1e55ab5a40a2
-size 4983112168
model-00004-of-00004.safetensors DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:71e9607ebf4884b3913619693b547dd4ae9bb66d9e6de42e80bcb5357de36914
-size 2508236156
model.safetensors.index.json CHANGED
The diff for this file is too large to render. See raw diff
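The shard layout changes because bfloat16 weights take half the bytes of float32: the four old shards total 4,962,660,968 + 4,983,112,136 + 4,983,112,168 + 2,508,236,156 ≈ 17.44 GB, while the two new shards total 4,972,926,984 + 3,745,680,670 ≈ 8.72 GB, almost exactly half (2 bytes per parameter instead of 4). That is also why model.safetensors.index.json has to be rewritten: every tensor now maps to one of two files instead of four.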
 
modeling_xgenmm.py CHANGED
@@ -162,6 +162,7 @@ class XGenMMConfig(PretrainedConfig):
                 "task_specific_params": None,
                 "problem_type": None,
                 "model_type": "phi3",
+                "_attn_implementation": "flash_attention_2",
             }
             logger.info(
                 "text_config is None. Initializing the text config with default values (`Phi3Config`)."
@@ -1031,7 +1032,7 @@ class VLM(nn.Module):
             num_additional_embeddings=len(self.special_tokens),
             _weight=self.lang_model.get_input_embeddings().weight,
             pad_token_id=self.pad_token_id,
-        )
+        ).to(self.lang_model.dtype)
         if hasattr(input_embeds, "additional_embedding"):
             input_embeds.additional_embedding.weight.data.normal_(
                 mean=0.0,
@@ -1052,7 +1053,7 @@ class VLM(nn.Module):
                 if hasattr(self.lang_model.get_output_embeddings(), "bias")
                 else None
             ),
-        )
+        ).to(self.lang_model.dtype)
         if hasattr(out_embeds, "additional_fc"):
             out_embeds.additional_fc.weight.data.normal_(
                 mean=0.0,
@@ -1642,7 +1643,8 @@ class VLMWithLanguageStream(VLM):
             if has_labels:
                 new_label = labels[i].clone()

-            for img_num, img_idx in enumerate(image_token_idxs):
+            for img_num in range(len(image_token_idxs)):
+                img_idx = image_token_idxs[img_num]
                 # Get vision token attention mask for padded llava-style any resolution image tokens.
                 if self.image_aspect_ratio == "anyres":
                     num_vis_tokens = vision_tokens[i][img_num].shape[0]
@@ -1662,6 +1664,10 @@ class VLMWithLanguageStream(VLM):
                     vis_attention_mask = torch.ones(
                         num_vis_tokens, dtype=torch.long
                     ).to(attention_mask.device)
+
+                # Offset the rest of image tokens with current num_vis_tokens
+                for j in range(img_num+1, len(image_token_idxs)):
+                    image_token_idxs[j] += (num_vis_tokens - 1)

                 new_embed = torch.cat(
                     (
@@ -2029,11 +2035,15 @@ class XGenMMModelForConditionalGeneration(PreTrainedModel):

         # vision encoder initialization
         vision_encoder = AutoModel.from_pretrained(
-            config.vision_encoder_config.model_name
+            config.vision_encoder_config.model_name,
+            torch_dtype=config.text_config.torch_dtype,
         ).vision_model

         # language model initialization
-        language_model = AutoModelForCausalLM.from_config(config.text_config)
+        language_model = AutoModelForCausalLM.from_config(
+            config.text_config,
+            torch_dtype=config.text_config.torch_dtype,
+        )
         check_embedding_fns(language_model)
         # Update _tied_weights_keys using the base model used.
         if language_model._tied_weights_keys is not None:
@@ -2052,7 +2062,7 @@ class XGenMMModelForConditionalGeneration(PreTrainedModel):
                 f"Warning: The language embedding dimension in the vision tokenizer config is different from the language model's embedding dimension. Overwriting the language embedding dimension in the vision tokenizer config to {overwrite}."
             )

-        vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model
+        vision_tokenizer = XGenMMVisionTokenizer(config.vision_tokenizer_config).model.to(language_model.dtype)

         self.vlm = XGenMMPerceiver(
             vision_encoder=vision_encoder,
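Besides casting the newly built input/output embeddings and the vision tokenizer to the language model's dtype and defaulting the Phi-3 text config to flash_attention_2, the behavioral change here is bookkeeping in the multi-image path: once one image placeholder is expanded into num_vis_tokens vision embeddings, every later placeholder shifts right, so its recorded index must be offset by num_vis_tokens - 1. A toy sketch of that logic with made-up tokens, not the repo's actual tensors:

    # Toy version of the index-offset fix: each placeholder expands into several
    # vision tokens, which shifts every later placeholder to the right.
    def splice_vision_tokens(text_tokens, image_token_id, vision_chunks):
        image_token_idxs = [i for i, t in enumerate(text_tokens) if t == image_token_id]
        out = list(text_tokens)
        for img_num in range(len(image_token_idxs)):
            img_idx = image_token_idxs[img_num]
            chunk = vision_chunks[img_num]
            out[img_idx:img_idx + 1] = chunk  # 1 placeholder -> len(chunk) vision tokens
            # Same adjustment the commit adds: image_token_idxs[j] += (num_vis_tokens - 1)
            for j in range(img_num + 1, len(image_token_idxs)):
                image_token_idxs[j] += len(chunk) - 1
        return out

    # Two images with different numbers of vision tokens (anyres-style):
    print(splice_vision_tokens([1, -1, 2, -1, 3], image_token_id=-1,
                               vision_chunks=[["a1", "a2", "a3"], ["b1", "b2"]]))
    # -> [1, 'a1', 'a2', 'a3', 2, 'b1', 'b2', 3]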
setup.sh CHANGED
@@ -2,6 +2,6 @@ pip install torch==2.2.1 torchvision==0.17.1 torchaudio==2.2.1 --index-url https
 pip install open_clip_torch==2.24.0
 pip install einops
 pip install einops-exts
-pip install transformers==4.41.1
+pip install transformers==4.44.2
 # optional
 pip install ipywidgets
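The regenerated config and tokenizer files record transformers_version 4.44.2 (note the new add_prefix_space field below), so it is worth confirming the runtime matches the new pin; a one-line sanity check:

    import transformers

    # setup.sh now pins 4.44.2; an older 4.41.x environment may not round-trip the updated configs.
    assert transformers.__version__ == "4.44.2", transformers.__version__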
special_tokens_map.json CHANGED
@@ -6,13 +6,7 @@
     "rstrip": false,
     "single_word": false
   },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
+  "eos_token": "<|end|>",
   "pad_token": {
     "content": "<pad>",
     "lstrip": false,
tokenizer_config.json CHANGED
@@ -1,6 +1,7 @@
 {
   "add_bos_token": false,
   "add_eos_token": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
     "0": {
       "content": "<unk>",
@@ -126,7 +127,7 @@
   "bos_token": "<s>",
   "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
   "clean_up_tokenization_spaces": false,
-  "eos_token": "<|endoftext|>",
+  "eos_token": "<|end|>",
   "legacy": false,
   "model_max_length": 4096,
   "pad_token": "<pad>",