Update modeling_colqwenstella.py
modeling_colqwenstella.py  CHANGED  (+2 -55)
@@ -1431,12 +1431,6 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
 )
 
 
-####################################################################################################################
-####################################################################################################################
-####################################################################################################################
-####################################################################################################################
-### codes for jasper
-####################################################################################################################
 from transformers.models.qwen2_vl import Qwen2VLConfig, Qwen2VLForConditionalGeneration
 from transformers import PretrainedConfig
 
@@ -1446,8 +1440,6 @@ class ColStellaVLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        # vector_dropout_p: float = 0.2,
-        # num_img_tokens: int = 300,
         img_start_token_id: int = 151644,
         img_start_token: str = "<|im_start|>",
         img_token_id: int = 151646,
@@ -1459,12 +1451,6 @@ class ColStellaVLConfig(PretrainedConfig):
         **kwargs
     ):
         super().__init__(**kwargs)
-        # if vector_dim not in (12288, 1024, 512, 256):
-        #     raise ValueError("vector_dim must be 12288, 1024, 512, 256")
-        # self.vector_dim = vector_dim
-        # self.vector_dropout_p = vector_dropout_p
-
-        # self.num_img_tokens = num_img_tokens
 
         self.img_start_token_id = img_start_token_id
         self.img_start_token = img_start_token
@@ -1499,9 +1485,6 @@ class ColStellaVLConfig(PretrainedConfig):
 
 
 
-# ColStellaVLConfig.from_pretrained("/home/nane.saroyan99/colpali/models/stella_transformer")
-
-
 class ColQwenStella(PreTrainedModel):
     config_class = ColStellaVLConfig
     _supports_sdpa = True
@@ -1514,34 +1497,11 @@ class ColQwenStella(PreTrainedModel):
         super().__init__(config)
         self.model = Qwen2Model(config.text_config)
         self.config = config
-        # if not config.is_text_encoder:
-        # self.vision_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct").visual
-        # qwen =
-        # print(config.vision_config)
         self.vision_model = Qwen2VLForConditionalGeneration(config.vision_config).visual
-        # self.get_rope_index = self.vision_model.get_rope_index
-        # self.vision_model = self.vision_model.visual
-        if torch.cuda.is_available():
-            print("deleting all else")
-            torch.cuda.empty_cache()
-        # self.adaptive_avg_pool2d = torch.nn.AdaptiveAvgPool2d(
-        #     (self.config.num_img_tokens, config.text_config.hidden_size)
-        # )
-
-        # self.vector_linear_12288 = nn.Linear(config.text_config.hidden_size, 12288, bias=True)
         self.vector_linear_1024 = nn.Linear(config.text_config.hidden_size, 1024, bias=True)
-        # self.vector_linear_512 = nn.Linear(config.text_config.hidden_size, 512, bias=True)
-        # self.vector_linear_256 = nn.Linear(config.text_config.hidden_size, 256, bias=True)
-        # Initialize weights and apply final processing
-        # self.model.resize_token_embeddings(151647)
         self.post_init()
-        # self.model.resize_token_embeddings(151647)
 
-    # def get_input_embeddings(self):
-    #     return self.model.embed_tokens
 
-    # def set_input_embeddings(self, value):
-    #     self.model.embed_tokens = value
 
     def inner_forward(
         self,
@@ -1588,12 +1548,7 @@ class ColQwenStella(PreTrainedModel):
 
 
 
-    def forward(self,
-                # input_ids: torch.LongTensor = None,
-                # attention_mask: Optional[torch.Tensor] = None,
-                # pixel_values: Optional[torch.Tensor] = None,
-                *args,
-                **kwargs) -> torch.Tensor:
+    def forward(self, *args, **kwargs) -> torch.Tensor:
         # Delete output_hidden_states from kwargs
         kwargs.pop("output_hidden_states", None)
 
@@ -1625,12 +1580,4 @@ class ColQwenStella(PreTrainedModel):
 
     @property
     def spatial_merge_size(self) -> int:
-        return self.vision_model.config.spatial_merge_size
-
-
-# from models.qwenstella_base.modeling_colqwenstella import ColQwenStella, ColStellaVLConfig
-
-# ColStellaVLConfig.__module__ = "transformers_modules.qwenstella_base.modeling_colqwenstella"
-
-# AutoConfig.register("colstella_vl", ColStellaVLConfig)
-# AutoModel.register(ColStellaVLConfig, ColQwenStella)
+        return self.vision_model.config.spatial_merge_size
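
After this commit, the only projection head left in __init__ is vector_linear_1024; the commented-out 12288/512/256 heads are gone. A minimal standalone sketch of that head, assuming a hidden size of 1536 (illustrative only; the real value comes from config.text_config.hidden_size):

import torch
import torch.nn as nn

# Retained projection head: one linear layer mapping per-token hidden states
# to 1024-d vectors. hidden_size = 1536 is an assumed value for illustration.
hidden_size = 1536
vector_linear_1024 = nn.Linear(hidden_size, 1024, bias=True)

hidden_states = torch.randn(2, 7, hidden_size)  # (batch, seq_len, hidden)
vectors = vector_linear_1024(hidden_states)     # -> (2, 7, 1024)
print(vectors.shape)                            # torch.Size([2, 7, 1024])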
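
The deleted trailer comments sketched how the custom classes plug into the Auto* API. For reference, a hedged version of that registration flow (the import path and checkpoint path below are placeholders, not part of this commit):

from transformers import AutoConfig, AutoModel

# Local module; adjust the import path to wherever modeling_colqwenstella.py lives.
from modeling_colqwenstella import ColQwenStella, ColStellaVLConfig

AutoConfig.register("colstella_vl", ColStellaVLConfig)  # assumes config.model_type == "colstella_vl"
AutoModel.register(ColStellaVLConfig, ColQwenStella)

model = AutoModel.from_pretrained("path/to/checkpoint")  # placeholder checkpoint path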