mrloh committed
Commit ec72405 · verified · 1 Parent(s): fec36b2

Add new SentenceTransformer model

README.md CHANGED
The diff for this file is too large to render. See raw diff
 
config.json CHANGED
@@ -6,7 +6,7 @@
   ],
   "attn_implementation": null,
   "auto_map": {
-    "AutoConfig": "misc.ContextualModelConfig",
+    "AutoConfig": "jxm/cde-small-v1--misc.ContextualModelConfig",
     "AutoModel": "jxm/cde-small-v1--model.DatasetTransformer"
   },
   "biencoder_pooling_strategy": "mean",
@@ -14,17 +14,14 @@
   "config_name": null,
   "disable_dropout": true,
   "disable_transductive_rotary_embedding": true,
-  "document_prompt": "search_document: ",
   "embedder": "nomic-ai/nomic-bert-2048",
   "embedder_rerank": "sentence-transformers/gtr-t5-base",
   "embedding_output_dim": null,
-  "embedding_size": 768,
   "limit_layers": null,
   "limit_layers_first_stage": null,
   "logit_scale": 50.0,
   "max_seq_length": 512,
   "model_revision": "main",
-  "query_prompt": "search_query: ",
   "tokenizer_name": null,
   "torch_dtype": "float32",
   "transductive_corpus_size": 512,
config_sentence_transformers.json ADDED
@@ -0,0 +1,13 @@
+{
+  "__version__": {
+    "sentence_transformers": "3.2.1",
+    "transformers": "4.46.0",
+    "pytorch": "2.5.0"
+  },
+  "prompts": {
+    "query": "search_query: ",
+    "document": "search_document: "
+  },
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}
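The "prompts" table is what Sentence Transformers uses to prepend "search_query: " / "search_document: " when encode() is called with a prompt_name, and "similarity_fn_name" selects cosine similarity. A rough usage sketch (the "." path and the example texts are placeholders; the full two-stage usage with dataset_embeddings is sketched after sentence_transformers_impl.py below):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(".", trust_remote_code=True)

# prompt_name picks an entry from the "prompts" table above, so the text gets
# "search_query: " / "search_document: " prepended before tokenization.
q_emb = model.encode(["what is a contextual embedding?"], prompt_name="query")
d_emb = model.encode(["CDE embeds documents conditioned on a corpus."], prompt_name="document")
print(model.similarity(q_emb, d_emb))  # cosine, per similarity_fn_name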
modules.json ADDED
@@ -0,0 +1,8 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers_impl.Transformer"
+  }
+]
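modules.json registers a single module ("0", stored at the repo root) backed by the sentence_transformers_impl.Transformer class added further down, which is why loading this model requires trust_remote_code. A small sketch of instantiating that module directly, assuming the repository is checked out locally so the file is importable:

from sentence_transformers_impl import Transformer

# __init__ refuses to load unless trust_remote_code is passed via model_args.
module = Transformer(".", model_args={"trust_remote_code": True})
print(module.get_word_embedding_dimension())  # hidden_size of the underlying model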
sentence_bert_config.json ADDED
@@ -0,0 +1 @@
+{}
sentence_transformers_impl.py ADDED
@@ -0,0 +1,156 @@
+from __future__ import annotations
+
+import json
+import logging
+import os
+from typing import Any, Optional
+
+import torch
+from torch import nn
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+
+logger = logging.getLogger(__name__)
+
+
+class Transformer(nn.Module):
+    """Hugging Face AutoModel to generate token embeddings.
+    Loads the correct class, e.g. BERT / RoBERTa etc.
+
+    Args:
+        model_name_or_path: Hugging Face models name
+            (https://huggingface.co/models)
+        max_seq_length: Truncate any inputs longer than max_seq_length
+        model_args: Keyword arguments passed to the Hugging Face
+            Transformers model
+        tokenizer_args: Keyword arguments passed to the Hugging Face
+            Transformers tokenizer
+        config_args: Keyword arguments passed to the Hugging Face
+            Transformers config
+        cache_dir: Cache dir for Hugging Face Transformers to store/load
+            models
+        do_lower_case: If true, lowercases the input (independent if the
+            model is cased or not)
+        tokenizer_name_or_path: Name or path of the tokenizer. When
+            None, then model_name_or_path is used
+        backend: Backend used for model inference. Can be `torch`, `onnx`,
+            or `openvino`. Default is `torch`.
+    """
+
+    save_in_root: bool = True
+
+    def __init__(
+        self,
+        model_name_or_path: str,
+        model_args: dict[str, Any] | None = None,
+        tokenizer_args: dict[str, Any] | None = None,
+        config_args: dict[str, Any] | None = None,
+        cache_dir: str | None = None,
+        **kwargs,
+    ) -> None:
+        super().__init__()
+        if model_args is None:
+            model_args = {}
+        if tokenizer_args is None:
+            tokenizer_args = {}
+        if config_args is None:
+            config_args = {}
+
+        if not model_args.get("trust_remote_code", False):
+            raise ValueError(
+                "You need to set `trust_remote_code=True` to load this model."
+            )
+
+        self.config = AutoConfig.from_pretrained(model_name_or_path, **config_args, cache_dir=cache_dir)
+        self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=self.config, cache_dir=cache_dir, **model_args)
+
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "bert-base-uncased",
+            cache_dir=cache_dir,
+            **tokenizer_args,
+        )
+
+    def __repr__(self) -> str:
+        return f"Transformer({self.get_config_dict()}) with Transformer model: {self.auto_model.__class__.__name__} "
+
+    def forward(self, features: dict[str, torch.Tensor], dataset_embeddings: Optional[torch.Tensor] = None, **kwargs) -> dict[str, torch.Tensor]:
+        """Returns token_embeddings, cls_token"""
+        # If we don't have embeddings, then run the 1st stage model.
+        # If we do, then run the 2nd stage model.
+        if dataset_embeddings is None:
+            sentence_embedding = self.auto_model.first_stage_model(
+                input_ids=features["input_ids"],
+                attention_mask=features["attention_mask"],
+            )
+        else:
+            sentence_embedding = self.auto_model.second_stage_model(
+                input_ids=features["input_ids"],
+                attention_mask=features["attention_mask"],
+                dataset_embeddings=dataset_embeddings,
+            )
+
+        features["sentence_embedding"] = sentence_embedding
+        return features
+
+    def get_word_embedding_dimension(self) -> int:
+        return self.auto_model.config.hidden_size
+
+    def tokenize(
+        self, texts: list[str] | list[dict] | list[tuple[str, str]], padding: str | bool = True
+    ) -> dict[str, torch.Tensor]:
+        """Tokenizes a text and maps tokens to token-ids"""
+        output = {}
+        if isinstance(texts[0], str):
+            to_tokenize = [texts]
+        elif isinstance(texts[0], dict):
+            to_tokenize = []
+            output["text_keys"] = []
+            for lookup in texts:
+                text_key, text = next(iter(lookup.items()))
+                to_tokenize.append(text)
+                output["text_keys"].append(text_key)
+            to_tokenize = [to_tokenize]
+        else:
+            batch1, batch2 = [], []
+            for text_tuple in texts:
+                batch1.append(text_tuple[0])
+                batch2.append(text_tuple[1])
+            to_tokenize = [batch1, batch2]
+
+        max_seq_length = self.config.max_seq_length
+        output.update(
+            self.tokenizer(
+                *to_tokenize,
+                padding=padding,
+                truncation="longest_first",
+                return_tensors="pt",
+                max_length=max_seq_length,
+            )
+        )
+        return output
+
+    def get_config_dict(self) -> dict[str, Any]:
+        return {}
+
+    def save(self, output_path: str, safe_serialization: bool = True) -> None:
+        self.auto_model.save_pretrained(output_path, safe_serialization=safe_serialization)
+        self.tokenizer.save_pretrained(output_path)
+
+        with open(os.path.join(output_path, "sentence_bert_config.json"), "w") as fOut:
+            json.dump(self.get_config_dict(), fOut, indent=2)
+
+    @classmethod
+    def load(cls, input_path: str) -> Transformer:
+        sbert_config_path = os.path.join(input_path, "sentence_bert_config.json")
+        if not os.path.exists(sbert_config_path):
+            return cls(model_name_or_path=input_path)
+
+        with open(sbert_config_path) as fIn:
+            config = json.load(fIn)
+        # Don't allow configs to set trust_remote_code
+        if "model_args" in config and "trust_remote_code" in config["model_args"]:
+            config["model_args"].pop("trust_remote_code")
+        if "tokenizer_args" in config and "trust_remote_code" in config["tokenizer_args"]:
+            config["tokenizer_args"].pop("trust_remote_code")
+        if "config_args" in config and "trust_remote_code" in config["config_args"]:
+            config["config_args"].pop("trust_remote_code")
+        return cls(model_name_or_path=input_path, **config)
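forward() is where the two-stage contextual (CDE) behaviour lives: without dataset_embeddings it calls first_stage_model to embed a corpus sample, and with them it calls second_stage_model to produce corpus-conditioned embeddings. A rough end-to-end sketch, assuming the installed sentence-transformers (3.2+, per config_sentence_transformers.json) forwards the extra dataset_embeddings keyword from encode() down to this forward(), as the upstream cde-small-v1 examples do; the corpus texts and the "." path are placeholders:

from sentence_transformers import SentenceTransformer

model = SentenceTransformer(".", trust_remote_code=True)

# Stage 1: embed a sample of the target corpus. config.json sets
# transductive_corpus_size to 512, so repeat/sample the corpus to that size.
corpus = ["first placeholder document", "second placeholder document"]
minicorpus = (corpus * 256)[:512]
dataset_embeddings = model.encode(
    minicorpus, prompt_name="document", convert_to_tensor=True
)

# Stage 2: embed documents and queries conditioned on the corpus sample.
doc_emb = model.encode(
    corpus, prompt_name="document",
    dataset_embeddings=dataset_embeddings, convert_to_tensor=True,
)
query_emb = model.encode(
    ["which document mentions 'first'?"], prompt_name="query",
    dataset_embeddings=dataset_embeddings, convert_to_tensor=True,
)
print(model.similarity(query_emb, doc_emb))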
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
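The tokenizer added here is a standard lowercasing BertTokenizer (uncased WordPiece vocabulary, 512-token limit, the usual [CLS]/[SEP]/[PAD]/[UNK]/[MASK] specials). A quick sketch, with "." again standing in for the repository path:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
enc = tok("search_query: What is a contextual embedding?", truncation=True)
# Lowercased WordPiece tokens wrapped in [CLS] ... [SEP], per tokenizer_config.json.
print(tok.convert_ids_to_tokens(enc["input_ids"]))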
vocab.txt ADDED
The diff for this file is too large to render. See raw diff