Upload folder using huggingface_hub
Files changed:
- .gitattributes (+1, -0)
- 1_Pooling/config.json (+10, -0)
- README.md (+409, -0)
- config.json (+37, -0)
- config.py (+150, -0)
- config_sentence_transformers.json (+10, -0)
- model.safetensors (+3, -0)
- modules.json (+20, -0)
- sentence_bert_config.json (+4, -0)
- sentencepiece.bpe.model (+3, -0)
- special_tokens_map.json (+51, -0)
- tokenizer.json (+3, -0)
- tokenizer_config.json (+63, -0)
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "word_embedding_dimension": 1024,
+  "pooling_mode_cls_token": false,
+  "pooling_mode_mean_tokens": true,
+  "pooling_mode_max_tokens": false,
+  "pooling_mode_mean_sqrt_len_tokens": false,
+  "pooling_mode_weightedmean_tokens": false,
+  "pooling_mode_lasttoken": false,
+  "include_prompt": true
+}
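This config selects attention-mask-weighted mean pooling over the 1024-dimensional token embeddings. A minimal sketch (not part of the upload) of what that pooling computes:

```python
import torch

def mean_pool(token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """Average token embeddings, ignoring padding (pooling_mode_mean_tokens)."""
    mask = attention_mask.unsqueeze(-1).float()      # (batch, seq, 1)
    summed = (token_embeddings * mask).sum(dim=1)    # sum over real tokens only
    counts = mask.sum(dim=1).clamp(min=1e-9)         # number of real tokens
    return summed / counts                           # (batch, 1024)
```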
README.md
ADDED
@@ -0,0 +1,409 @@
+---
+tags:
+- sentence-transformers
+- sentence-similarity
+- feature-extraction
+- generated_from_trainer
+- dataset_size:21769
+- loss:MultipleNegativesRankingLoss
+base_model: am-azadi/bilingual-embedding-large_Fine_Tuned_2e
+widget:
+- source_sentence: Amen.. This Quran was found at the bottom of the sea Has become
+    a rock but still intact subhanallah, hopefully those who like it, comment amen
+    and share this post sincerely the sustenance tomorrow morning will be abundant
+    from the opposite direction unexpected.amen اهيه
+  sentences:
+  - Mexico deserves an Oscar for the coffin dance The video of uniformed men doing
+    "the coffin dance" was recorded in Colombia, not in Mexico
+  - The Koran was found at the bottom of the sea already turned into a rock but still
+    intact This is a dictionary covered in crystal and is a work of art by an American
+    artist
+  - Video purported to be a video celebrating the inauguration of Hamas' new office
+    in the Indian state of Kerala False, this claim is a video celebrating the inauguration
+    of the Hamas office in India
+- source_sentence: ' P Stay alert ! 6710 A Japanese man killed his friend just because
+    he didn give him 6x scope in PUBG TAG A PUBG LOVER'
+  sentences:
+  - Japanese man killed friend over video game Japanese man killed his friend in row
+    over video game?
+  - This photo shows the Glastonbury festival after Greta Thunberg's participation
+    in 2022 This Glastonbury festival photo is from 2015, not 2022 after Greta Thunberg's
+    speech
+  - Footage of damaged building was shot in Russia in 2018 Footage shows Ukraine in
+    2022, not Russia in 2018
+- source_sentence: This is Manoj Tiwari, MP - North East Delhi I am busy bursting
+    firecrackers, after bursting firecrackers all night, I wake up in the morning
+    and say, "Today my eyes are burning in Delhi". Manoj Tiwari Today my eyes are
+    burning in Delhi, and yours? ,
+  sentences:
+  - Images show recent unrest and brutality in Uganda None of these images are related
+    to Uganda’s ongoing political troubles
+  - The photo shows Indian politician Manoj Tiwari lighting fireworks in Delhi during
+    smog crisis. This image of an Indian lawmaker lighting a firecracker has circulated
+    in reports since 2014
+  - World Economic Forum tweet asks if age of consent should be lowered to 13 Fabricated
+    World Economic Forum tweet about 'lowering age of consent' misleads online
+- source_sentence: ' : He Yunshi was arrested, as expected, but better than expected
+    even faster. . . 6-1 LICEN Fang Bomei BOOT UML'
+  sentences:
+  - In Chile they have just expropriated pensions It is not true that in Chile “the
+    pensions have just been expropriated”
+  - Four British Airways airline pilots have died from the covid-19 vaccine British
+    Airways ruled out link between pilot deaths and vaccinations
+  - Hong Kong Pro-democracy artist Denise Ho arrested in September 2021 Old photos
+    of Hong Kong pro-democracy activist shared in false 'news' of her arrest
+- source_sentence: 'Uuuuu mepa that they killed the real bald guy EXCLUSIVE What are
+    you doing bald, go getting into it jonca that the 12 wants to take pictures with
+    you at any time 14:04 ✓ they found 2 contact cards for this number re add them
+    to your contacts? T SEE CONTACT CARDS CELL PHONES TURNED OFF THEY LOOK FOR THEM
+    EVERYWHERE THE "12" IS LOOKING FOR THEM "serne HD MRASSIA LEGAL: 11 2159 6256
+    FOR POLICE COMPLAINTS: 11 2159 6256 HERE FOR POLICE COMPLAINTS: 11-'
+  sentences:
+  - The elected mayor of Medellín does not like ESMAD. WHY WILL IT BE? The original
+    video shows Daniel Quintero in a demonstration against violence in Bogotá
+  - Warning in Paris about stroke in children in post-covid vaccine era The stroke
+    campaign in France is not about vaccinating children against covid-19
+  - They find Diego Molina murdered in his apartment, the skinny from the funeral
+    home who took photos with Diego Armando Maradona The images of a lacerated body
+    are not of the person who was photographed with the corpse of Maradona
+pipeline_tag: sentence-similarity
+library_name: sentence-transformers
+---
+
+# SentenceTransformer based on am-azadi/bilingual-embedding-large_Fine_Tuned_2e
+
+This is a [sentence-transformers](https://www.SBERT.net) model finetuned from [am-azadi/bilingual-embedding-large_Fine_Tuned_2e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_2e). It maps sentences & paragraphs to a 1024-dimensional dense vector space and can be used for semantic textual similarity, semantic search, paraphrase mining, text classification, clustering, and more.
+
+## Model Details
+
+### Model Description
+- **Model Type:** Sentence Transformer
+- **Base model:** [am-azadi/bilingual-embedding-large_Fine_Tuned_2e](https://huggingface.co/am-azadi/bilingual-embedding-large_Fine_Tuned_2e) <!-- at revision a857baa1cb45f6fa6969fb23817866d6810f6c7a -->
+- **Maximum Sequence Length:** 512 tokens
+- **Output Dimensionality:** 1024 dimensions
+- **Similarity Function:** Cosine Similarity
+<!-- - **Training Dataset:** Unknown -->
+<!-- - **Language:** Unknown -->
+<!-- - **License:** Unknown -->
+
+### Model Sources
+
+- **Documentation:** [Sentence Transformers Documentation](https://sbert.net)
+- **Repository:** [Sentence Transformers on GitHub](https://github.com/UKPLab/sentence-transformers)
+- **Hugging Face:** [Sentence Transformers on Hugging Face](https://huggingface.co/models?library=sentence-transformers)
+
+### Full Model Architecture
+
+```
+SentenceTransformer(
+  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BilingualModel
+  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
+  (2): Normalize()
+)
+```
+
+## Usage
+
+### Direct Usage (Sentence Transformers)
+
+First install the Sentence Transformers library:
+
+```bash
+pip install -U sentence-transformers
+```
+
+Then you can load this model and run inference.
+```python
+from sentence_transformers import SentenceTransformer
+
+# Download from the 🤗 Hub
+model = SentenceTransformer("sentence_transformers_model_id")
+# Run inference
+sentences = [
+    'Uuuuu mepa that they killed the real bald guy EXCLUSIVE What are you doing bald, go getting into it jonca that the 12 wants to take pictures with you at any time 14:04 ✓ they found 2 contact cards for this number re add them to your contacts? T SEE CONTACT CARDS CELL PHONES TURNED OFF THEY LOOK FOR THEM EVERYWHERE THE "12" IS LOOKING FOR THEM "serne HD MRASSIA LEGAL: 11 2159 6256 FOR POLICE COMPLAINTS: 11 2159 6256 HERE FOR POLICE COMPLAINTS: 11-',
+    'They find Diego Molina murdered in his apartment, the skinny from the funeral home who took photos with Diego Armando Maradona The images of a lacerated body are not of the person who was photographed with the corpse of Maradona',
+    'The elected mayor of Medellín does not like ESMAD. WHY WILL IT BE? The original video shows Daniel Quintero in a demonstration against violence in Bogotá',
+]
+embeddings = model.encode(sentences)
+print(embeddings.shape)
+# [3, 1024]
+
+# Get the similarity scores for the embeddings
+similarities = model.similarity(embeddings, embeddings)
+print(similarities.shape)
+# [3, 3]
+```
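Note that config.json (further down in this commit) routes the architecture through custom `BilingualModel` code via `auto_map`, so loading will likely require opting in to remote code; a hedged variant of the load call above:

```python
# Assumption: repos whose config.json uses auto_map to custom modeling code
# need trust_remote_code=True when loaded through sentence-transformers.
model = SentenceTransformer("sentence_transformers_model_id", trust_remote_code=True)
```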
+
+<!--
+### Direct Usage (Transformers)
+
+<details><summary>Click to see the direct usage in Transformers</summary>
+
+</details>
+-->
+
+<!--
+### Downstream Usage (Sentence Transformers)
+
+You can finetune this model on your own dataset.
+
+<details><summary>Click to expand</summary>
+
+</details>
+-->
+
+<!--
+### Out-of-Scope Use
+
+*List how the model may foreseeably be misused and address what users ought not to do with the model.*
+-->
+
+<!--
+## Bias, Risks and Limitations
+
+*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
+-->
+
+<!--
+### Recommendations
+
+*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
+-->
+
+## Training Details
+
+### Training Dataset
+
+#### Unnamed Dataset
+
+* Size: 21,769 training samples
+* Columns: <code>sentence_0</code> and <code>sentence_1</code>
+* Approximate statistics based on the first 1000 samples:
+  |         | sentence_0 | sentence_1 |
+  |:--------|:-----------|:-----------|
+  | type    | string     | string     |
+  | details | <ul><li>min: 4 tokens</li><li>mean: 122.97 tokens</li><li>max: 512 tokens</li></ul> | <ul><li>min: 17 tokens</li><li>mean: 38.24 tokens</li><li>max: 109 tokens</li></ul> |
+* Samples:
+  | sentence_0 | sentence_1 |
+  |:-----------|:-----------|
+  | <code>NEW HANDLING OF ALERT While the achieves 6,101,968 votes (i.e. 26.8%), the Ministry of the Interior only gives it 5,836,202 votes (i.e. 25.7%) to artificially make 's party appear in the lead . Hello Council of State? </code> | <code>The Ministry of the Interior manipulated the results of the legislative elections Legislative: why are the results of the 1st round contested by the Nupes?</code> |
+  | <code><3<3... Civil Registry Offices in Brazil: The only source that does not lie, as it issues all death certificates daily, for all reasons. This source cannot be disputed by anyone. Only they can say for sure, how many people die each day, and the reason for death. The rest is fake news. Via Jose Mendes Junior Updating... Deaths in Brazil: July 2019 - 119,390 (without pandemic) July 2020 - 113,475 (with pandemic) Source: transparencia.registrocivil.org.br... Now what are they going to say????</code> | <code>More deaths were recorded in Brazil in July 2019, before the pandemic, than in July 2020, during the new coronavirus pandemic. Publications use partial data on deaths recorded in July 2020</code> |
+  | <code>Zimbabwe Police are taking disciplinary action with a church that refused to take closure instructions to prevent the spread of Coronavirus. </code> | <code>Worshipers beaten in Zimbabwe for failing to comply with coronavirus assembly ban No, worshipers have not been beaten by police in Zimbabwe for gathering during the coronavirus outbreak</code> |
+* Loss: [<code>MultipleNegativesRankingLoss</code>](https://sbert.net/docs/package_reference/sentence_transformer/losses.html#multiplenegativesrankingloss) with these parameters:
+  ```json
+  {
+      "scale": 20.0,
+      "similarity_fct": "cos_sim"
+  }
+  ```
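For orientation, a minimal sketch (variable names assumed, not taken from this upload) of fine-tuning on such `(sentence_0, sentence_1)` pairs with `MultipleNegativesRankingLoss`, where every other in-batch pair serves as a negative:

```python
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

model = SentenceTransformer("am-azadi/bilingual-embedding-large_Fine_Tuned_2e", trust_remote_code=True)

# pairs: an assumed list of (claim_text, fact_check_title) tuples
train_examples = [InputExample(texts=[s0, s1]) for s0, s1 in pairs]
loader = DataLoader(train_examples, shuffle=True, batch_size=2)

loss = losses.MultipleNegativesRankingLoss(model, scale=20.0)  # cosine similarity scaled by 20
model.fit(train_objectives=[(loader, loss)], epochs=1)
```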
+
+### Training Hyperparameters
+#### Non-Default Hyperparameters
+
+- `per_device_train_batch_size`: 2
+- `per_device_eval_batch_size`: 2
+- `num_train_epochs`: 1
+- `multi_dataset_batch_sampler`: round_robin
+
+#### All Hyperparameters
+<details><summary>Click to expand</summary>
+
+- `overwrite_output_dir`: False
+- `do_predict`: False
+- `eval_strategy`: no
+- `prediction_loss_only`: True
+- `per_device_train_batch_size`: 2
+- `per_device_eval_batch_size`: 2
+- `per_gpu_train_batch_size`: None
+- `per_gpu_eval_batch_size`: None
+- `gradient_accumulation_steps`: 1
+- `eval_accumulation_steps`: None
+- `torch_empty_cache_steps`: None
+- `learning_rate`: 5e-05
+- `weight_decay`: 0.0
+- `adam_beta1`: 0.9
+- `adam_beta2`: 0.999
+- `adam_epsilon`: 1e-08
+- `max_grad_norm`: 1
+- `num_train_epochs`: 1
+- `max_steps`: -1
+- `lr_scheduler_type`: linear
+- `lr_scheduler_kwargs`: {}
+- `warmup_ratio`: 0.0
+- `warmup_steps`: 0
+- `log_level`: passive
+- `log_level_replica`: warning
+- `log_on_each_node`: True
+- `logging_nan_inf_filter`: True
+- `save_safetensors`: True
+- `save_on_each_node`: False
+- `save_only_model`: False
+- `restore_callback_states_from_checkpoint`: False
+- `no_cuda`: False
+- `use_cpu`: False
+- `use_mps_device`: False
+- `seed`: 42
+- `data_seed`: None
+- `jit_mode_eval`: False
+- `use_ipex`: False
+- `bf16`: False
+- `fp16`: False
+- `fp16_opt_level`: O1
+- `half_precision_backend`: auto
+- `bf16_full_eval`: False
+- `fp16_full_eval`: False
+- `tf32`: None
+- `local_rank`: 0
+- `ddp_backend`: None
+- `tpu_num_cores`: None
+- `tpu_metrics_debug`: False
+- `debug`: []
+- `dataloader_drop_last`: False
+- `dataloader_num_workers`: 0
+- `dataloader_prefetch_factor`: None
+- `past_index`: -1
+- `disable_tqdm`: False
+- `remove_unused_columns`: True
+- `label_names`: None
+- `load_best_model_at_end`: False
+- `ignore_data_skip`: False
+- `fsdp`: []
+- `fsdp_min_num_params`: 0
+- `fsdp_config`: {'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False}
+- `fsdp_transformer_layer_cls_to_wrap`: None
+- `accelerator_config`: {'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None}
+- `deepspeed`: None
+- `label_smoothing_factor`: 0.0
+- `optim`: adamw_torch
+- `optim_args`: None
+- `adafactor`: False
+- `group_by_length`: False
+- `length_column_name`: length
+- `ddp_find_unused_parameters`: None
+- `ddp_bucket_cap_mb`: None
+- `ddp_broadcast_buffers`: False
+- `dataloader_pin_memory`: True
+- `dataloader_persistent_workers`: False
+- `skip_memory_metrics`: True
+- `use_legacy_prediction_loop`: False
+- `push_to_hub`: False
+- `resume_from_checkpoint`: None
+- `hub_model_id`: None
+- `hub_strategy`: every_save
+- `hub_private_repo`: None
+- `hub_always_push`: False
+- `gradient_checkpointing`: False
+- `gradient_checkpointing_kwargs`: None
+- `include_inputs_for_metrics`: False
+- `include_for_metrics`: []
+- `eval_do_concat_batches`: True
+- `fp16_backend`: auto
+- `push_to_hub_model_id`: None
+- `push_to_hub_organization`: None
+- `mp_parameters`:
+- `auto_find_batch_size`: False
+- `full_determinism`: False
+- `torchdynamo`: None
+- `ray_scope`: last
+- `ddp_timeout`: 1800
+- `torch_compile`: False
+- `torch_compile_backend`: None
+- `torch_compile_mode`: None
+- `dispatch_batches`: None
+- `split_batches`: None
+- `include_tokens_per_second`: False
+- `include_num_input_tokens_seen`: False
+- `neftune_noise_alpha`: None
+- `optim_target_modules`: None
+- `batch_eval_metrics`: False
+- `eval_on_start`: False
+- `use_liger_kernel`: False
+- `eval_use_gather_object`: False
+- `average_tokens_across_devices`: False
+- `prompts`: None
+- `batch_sampler`: batch_sampler
+- `multi_dataset_batch_sampler`: round_robin
+
+</details>
+
+### Training Logs
+| Epoch  | Step  | Training Loss |
+|:------:|:-----:|:-------------:|
+| 0.0459 | 500   | 0.0148        |
+| 0.0919 | 1000  | 0.0066        |
+| 0.1378 | 1500  | 0.0245        |
+| 0.1837 | 2000  | 0.0184        |
+| 0.2297 | 2500  | 0.0174        |
+| 0.2756 | 3000  | 0.0053        |
+| 0.3215 | 3500  | 0.025         |
+| 0.3675 | 4000  | 0.0105        |
+| 0.4134 | 4500  | 0.0054        |
+| 0.4593 | 5000  | 0.0076        |
+| 0.5053 | 5500  | 0.0085        |
+| 0.5512 | 6000  | 0.0104        |
+| 0.5972 | 6500  | 0.0208        |
+| 0.6431 | 7000  | 0.0072        |
+| 0.6890 | 7500  | 0.0084        |
+| 0.7350 | 8000  | 0.0053        |
+| 0.7809 | 8500  | 0.0052        |
+| 0.8268 | 9000  | 0.0064        |
+| 0.8728 | 9500  | 0.0074        |
+| 0.9187 | 10000 | 0.0083        |
+| 0.9646 | 10500 | 0.008         |
+
+
+### Framework Versions
+- Python: 3.11.11
+- Sentence Transformers: 3.4.1
+- Transformers: 4.48.3
+- PyTorch: 2.5.1+cu124
+- Accelerate: 1.3.0
+- Datasets: 3.3.2
+- Tokenizers: 0.21.0
+
+## Citation
+
+### BibTeX
+
+#### Sentence Transformers
+```bibtex
+@inproceedings{reimers-2019-sentence-bert,
+    title = "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks",
+    author = "Reimers, Nils and Gurevych, Iryna",
+    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing",
+    month = "11",
+    year = "2019",
+    publisher = "Association for Computational Linguistics",
+    url = "https://arxiv.org/abs/1908.10084",
+}
+```
+
+#### MultipleNegativesRankingLoss
+```bibtex
+@misc{henderson2017efficient,
+    title={Efficient Natural Language Response Suggestion for Smart Reply},
+    author={Matthew Henderson and Rami Al-Rfou and Brian Strope and Yun-hsuan Sung and Laszlo Lukacs and Ruiqi Guo and Sanjiv Kumar and Balint Miklos and Ray Kurzweil},
+    year={2017},
+    eprint={1705.00652},
+    archivePrefix={arXiv},
+    primaryClass={cs.CL}
+}
+```
+
+<!--
+## Glossary
+
+*Clearly define terms in order to be accessible across audiences.*
+-->
+
+<!--
+## Model Card Authors
+
+*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
+-->
+
+<!--
+## Model Card Contact
+
+*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
+-->
config.json
ADDED
@@ -0,0 +1,37 @@
+{
+  "_name_or_path": "am-azadi/bilingual-embedding-large_Fine_Tuned_2e",
+  "architectures": [
+    "BilingualModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "config.BilingualConfig",
+    "AutoModel": "dangvantuan/bilingual_impl--modeling.BilingualModel",
+    "AutoModelForMaskedLM": "dangvantuan/bilingual_impl--modeling.BilingualForMaskedLM",
+    "AutoModelForMultipleChoice": "dangvantuan/bilingual_impl--modeling.BilingualForMultipleChoice",
+    "AutoModelForQuestionAnswering": "dangvantuan/bilingual_impl--modeling.BilingualForQuestionAnswering",
+    "AutoModelForSequenceClassification": "dangvantuan/bilingual_impl--modeling.BilingualForSequenceClassification",
+    "AutoModelForTokenClassification": "dangvantuan/bilingual_impl--modeling.BilingualForTokenClassification"
+  },
+  "bos_token_id": 0,
+  "classifier_dropout": null,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "bilingual",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "output_past": true,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.3",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 250002
+}
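The `auto_map` entries resolve to modeling code hosted in `dangvantuan/bilingual_impl`, so the plain `transformers` route is roughly the following sketch (reusing the README's placeholder id):

```python
from transformers import AutoConfig, AutoModel

# trust_remote_code is assumed to be required because auto_map points at
# custom BilingualConfig/BilingualModel classes rather than built-ins.
config = AutoConfig.from_pretrained("sentence_transformers_model_id", trust_remote_code=True)
model = AutoModel.from_pretrained("sentence_transformers_model_id", trust_remote_code=True)
print(config.hidden_size)  # 1024, matching the pooling config's word_embedding_dimension
```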
config.py
ADDED
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Bilingual configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BilingualConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BilingualModel`] or a [`TFBilingualModel`]. It
+    is used to instantiate a Bilingual model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Bilingual
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Bilingual model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`BilingualModel`] or [`TFBilingualModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BilingualModel`] or
+            [`TFBilingualModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BilingualConfig, BilingualModel
+
+    >>> configuration = BilingualConfig()
+
+    >>> model = BilingualModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bilingual"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class BilingualOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
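A hypothetical usage sketch for `BilingualOnnxConfig` (assuming the default feature-extraction task), showing the dynamic axes an ONNX export would use:

```python
# Hypothetical: inspect the dynamic axes declared for ONNX export.
cfg = BilingualConfig()
onnx_cfg = BilingualOnnxConfig(cfg, task="default")
print(onnx_cfg.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```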
config_sentence_transformers.json
ADDED
@@ -0,0 +1,10 @@
+{
+  "__version__": {
+    "sentence_transformers": "3.4.1",
+    "transformers": "4.48.3",
+    "pytorch": "2.5.1+cu124"
+  },
+  "prompts": {},
+  "default_prompt_name": null,
+  "similarity_fn_name": "cosine"
+}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0400716269de0eb09bc71960142ed26515bc9a94d65f6172d059ce01f75e11cf
+size 2239607176
modules.json
ADDED
@@ -0,0 +1,20 @@
+[
+  {
+    "idx": 0,
+    "name": "0",
+    "path": "",
+    "type": "sentence_transformers.models.Transformer"
+  },
+  {
+    "idx": 1,
+    "name": "1",
+    "path": "1_Pooling",
+    "type": "sentence_transformers.models.Pooling"
+  },
+  {
+    "idx": 2,
+    "name": "2",
+    "path": "2_Normalize",
+    "type": "sentence_transformers.models.Normalize"
+  }
+]
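These three modules mirror the Transformer → Pooling → Normalize stack printed in the README. A hypothetical manual re-assembly of the same pipeline (the custom BilingualModel code would additionally need trust_remote_code passed through the module's model/config args):

```python
from sentence_transformers import SentenceTransformer, models

# Hypothetical: rebuild the modules.json pipeline by hand.
word = models.Transformer("sentence_transformers_model_id", max_seq_length=512)
pool = models.Pooling(word.get_word_embedding_dimension(), pooling_mode="mean")
norm = models.Normalize()
model = SentenceTransformer(modules=[word, pool, norm])
```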
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
+{
+  "max_seq_length": 512,
+  "do_lower_case": false
+}
sentencepiece.bpe.model
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
+size 5069051
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}
tokenizer.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:883b037111086fd4dfebbbc9b7cee11e1517b5e0c0514879478661440f137085
+size 17082987
tokenizer_config.json
ADDED
@@ -0,0 +1,63 @@
+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "250001": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "extra_special_tokens": {},
+  "mask_token": "<mask>",
+  "max_length": 512,
+  "model_max_length": 512,
+  "pad_to_multiple_of": null,
+  "pad_token": "<pad>",
+  "pad_token_type_id": 0,
+  "padding_side": "right",
+  "sep_token": "</s>",
+  "stride": 0,
+  "tokenizer_class": "XLMRobertaTokenizer",
+  "truncation_side": "right",
+  "truncation_strategy": "longest_first",
+  "unk_token": "<unk>"
+}
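Together with sentencepiece.bpe.model and tokenizer.json above, this configures a standard XLM-RoBERTa tokenizer. A quick sketch (placeholder id as in the README) to confirm the special tokens and length limit:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("sentence_transformers_model_id")
print(tok.cls_token, tok.sep_token, tok.pad_token)  # <s> </s> <pad>
print(tok.model_max_length)                         # 512
```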