Add SetFit model
Browse files
- 1_Pooling/config.json +9 -0
- README.md +255 -1
- config.json +24 -0
- config_sentence_transformers.json +7 -0
- config_setfit.json +4 -0
- model.safetensors +3 -0
- model_head.pkl +3 -0
- modules.json +14 -0
- sentence_bert_config.json +4 -0
- special_tokens_map.json +51 -0
- tokenizer.json +0 -0
- tokenizer_config.json +59 -0
- vocab.txt +0 -0
1_Pooling/config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"word_embedding_dimension": 768,
|
3 |
+
"pooling_mode_cls_token": false,
|
4 |
+
"pooling_mode_mean_tokens": true,
|
5 |
+
"pooling_mode_max_tokens": false,
|
6 |
+
"pooling_mode_mean_sqrt_len_tokens": false,
|
7 |
+
"pooling_mode_weightedmean_tokens": false,
|
8 |
+
"pooling_mode_lasttoken": false
|
9 |
+
}
|
README.md
CHANGED
@@ -1,3 +1,257 @@
|
|
1 |
---
|
2 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
+
library_name: setfit
|
3 |
+
tags:
|
4 |
+
- setfit
|
5 |
+
- sentence-transformers
|
6 |
+
- text-classification
|
7 |
+
- generated_from_setfit_trainer
|
8 |
+
metrics:
|
9 |
+
- accuracy
|
10 |
+
widget:
|
11 |
+
- text: interest in third generation biomass such as macroalgae has increased due
|
12 |
+
to their high biomass yield, absence of lignin in their tissues, lower competition
|
13 |
+
for land and fresh water, no fertilization requirements, and efficient co2 capture
|
14 |
+
in coastal ecosystems. however, several challenges still exist in the development
|
15 |
+
of cost effective technologies for processing large amounts of macroalgae. recently,
|
16 |
+
genetically modified micro organisms able to convert brown macroalgae carbohydrates
|
17 |
+
into bioethanol were developed, but still no attempt to scale up production has
|
18 |
+
been proposed. based on giant kelp farming and bioethanol production program carried
|
19 |
+
out in chile, we were able to test and adapt this technology as first attempt
|
20 |
+
to scale up this process using 75 fermentation of genetically modified escherichia
|
21 |
+
coli. laboratory fermentation tests results showed that although biomass growth
|
22 |
+
and yield are not greatly affected by the alginate mannitol ratio, ethanol yield
|
23 |
+
showed clear maximum around alginate mannitol ratio. in . pyrifera, much greater
|
24 |
+
proportion of alginate and lower mannitol abundance is found. in order to make
|
25 |
+
the most of the carbohydrates available for fermentation, we developed four stage
|
26 |
+
process model for scaling up, including acid leaching, depolymerization, saccharification,
|
27 |
+
and fermentation steps. using this process, we obtained .213 kg ethanol kg dry
|
28 |
+
macroalgae, equivalent to . of ethanol hectare year, reaching 64 of the maximum
|
29 |
+
theoretical ethanol yield. we propose strategies to increase this yield, including
|
30 |
+
synthetic biology pathway engineering approaches and process optimization targets.
|
31 |
+
2016 society of chemical industry and john wiley sons, ltd
|
32 |
+
- text: producing concrete that incorporates carbon dioxide into the mix is leveraged
|
33 |
+
to reduce the carbon footprint and produce more sustainable concrete. as the concrete
|
34 |
+
dries, the co2 is mineralized and permanently incorporated into the early carbonation.
|
35 |
+
experimental work has been conducted, and hundreds of specimens with varying ratios
|
36 |
+
of co2 to binder content were cast. co2 to binder ratios of were used to test
|
37 |
+
concrete in workability , mechanical properties , and durability performance .
|
38 |
+
the chemical tests were also conducted to identify the changes in hardened concrete
|
39 |
+
composition for the three mixes . all specimens were field cured and exposed to
|
40 |
+
the coastal environment of ras al khair industrial city in saudi arabia. the results
|
41 |
+
showed that the co2 to binder ratio of . improved the concrete properties, in
|
42 |
+
particular, the effect was clear with higher slump and comparable strength compared
|
43 |
+
to the standard concrete without co2. however, the co2 to binder ratio of . shows
|
44 |
+
negligible increase in the chloride permeability and the internal chloride ion
|
45 |
+
content compared to the standard concrete without co2, whereas the internal sulfate
|
46 |
+
ion content has not increased for both co2 to binder ratios in comparison with
|
47 |
+
the standard concrete without co2, which indicate no reduction in concrete durability.
|
48 |
+
2023 isec press.
|
49 |
+
- text: mangroves are ecosystems made up of trees or shrubs that develop in the intertidal
|
50 |
+
zone and provide many vital environmental services for livelihoods in coastal
|
51 |
+
areas. they are habitat for the reproduction of several marine species. they afford
|
52 |
+
protection from hurricanes, tides, sea level rise and prevent the erosion of the
|
53 |
+
coasts. just one hectare of mangrove forest can hold up to ,000 tons of carbon
|
54 |
+
dioxide, more than tropical forests and jungles. mexico is one of the countries
|
55 |
+
with the greatest abundance of mangroves in the world, with more than 700,000
|
56 |
+
ha. blue carbon can be novel mechanism for promoting communication and cooperation
|
57 |
+
between the investor, the government, the users, and beneficiaries of the environmental
|
58 |
+
services of these ecosystems, creating public private social partnerships through
|
59 |
+
mechanisms such as payment for environmental services, credits, or the voluntary
|
60 |
+
carbon market. this chapter explores the possibilities of incorporating blue carbon
|
61 |
+
in emissions markets. we explore the huge potential of mexico blue carbon to sequester
|
62 |
+
co2. then we analyse the new market instrument that allows countries to sell or
|
63 |
+
transfer mitigation results internationally the sustainable development mechanism
|
64 |
+
, established in the paris agreement. secondly, we present the progress of the
|
65 |
+
commission for environmental cooperation to standardize the methodologies to assess
|
66 |
+
their stock and determine the magnitude of the blue carbon sinks. thirdly, as
|
67 |
+
an opportunity for mexico, the collaboration with the california cap and trade
|
68 |
+
program is analysed. we conclude that blue carbon is very important mitigation
|
69 |
+
tool to be included in the compensation schemes on regional and global levels.
|
70 |
+
additionally, mangrove protection is an excellent example of the mitigation adaptation
|
71 |
+
sustainable development relationship, as well as fostering of governance by the
|
72 |
+
inclusion of the coastal communities in decision making and incomes. 2022, the
|
73 |
+
author.
|
74 |
+
- text: featured application the findings obtained from this study have implications
|
75 |
+
for global blue carbon budgeting. abstract field monitoring and incubation experiments
|
76 |
+
were conducted to evaluate the litter yield and examine the decomposition of the
|
77 |
+
litter of three representative mangrove species frequently used for mangrove re
|
78 |
+
vegetation in subtropical mudflat on the south china coast. the results show that
|
79 |
+
the litter yield of the investigated mangrove species varied significantly from
|
80 |
+
season to season. the annual litter production was in the following decreasing
|
81 |
+
order heritiera littoralis thespesia populnea kandelia obovata. initially, rapid
|
82 |
+
decomposition of easily degradable components of the litter materials resulted
|
83 |
+
in marked weight loss of the mangrove litter. there was good linear relationship
|
84 |
+
between the length of field incubation time and the litter decomposition rate
|
85 |
+
for both the branch and the leaf portion of the three investigated mangrove species.
|
86 |
+
approximately 50 or more of the added mangrove litter could be decomposed within
|
87 |
+
one year and the decomposed litter could be incorporated into the underlying soils
|
88 |
+
and consequently affect the soil carbon dynamics. an annual soil carbon increase
|
89 |
+
from .37 to .64 kg in the top cm of the soil was recorded for the investigated
|
90 |
+
mangrove species.
|
91 |
+
- text: seagrasses provide multitude of ecosystem services and serve as important
|
92 |
+
organic carbon stores. however, seagrass habitats are declining worldwide, threatened
|
93 |
+
by global climate change and regional shifts in water quality. acoustical methods
|
94 |
+
have been applied to assess changes in oxygen production of seagrass meadows since
|
95 |
+
sound propagation is sensitive to the presence of bubbles, which exist both within
|
96 |
+
the plant tissue and freely floating the water as byproducts of photosynthesis.
|
97 |
+
this work applies acoustic remote sensing techniques to characterize two different
|
98 |
+
regions of seagrass meadow densely vegetated meadow of thalassia testudinum and
|
99 |
+
sandy region sparsely populated by isolated stands of . testudinum. bayesian approach
|
100 |
+
is applied to estimate the posterior probability distributions of the unknown
|
101 |
+
model parameters. the sensitivity of sound to the void fraction of gas present
|
102 |
+
in the seagrass meadow was established by the narrow marginal probability distributions
|
103 |
+
that provided distinct estimates of the void fraction between the two sites. the
|
104 |
+
absolute values of the estimated void fractions are biased by limitations in the
|
105 |
+
forward model, which does not capture the full complexity of the seagrass environment.
|
106 |
+
nevertheless, the results demonstrate the potential use of acoustical methods
|
107 |
+
to remotely sense seagrass health and density.
|
108 |
+
pipeline_tag: text-classification
|
109 |
+
inference: false
|
110 |
+
base_model: sentence-transformers/paraphrase-mpnet-base-v2
|
111 |
---
|
112 |
+
|
113 |
+
# SetFit with sentence-transformers/paraphrase-mpnet-base-v2
|
114 |
+
|
115 |
+
This is a [SetFit](https://github.com/huggingface/setfit) model that can be used for Text Classification. This SetFit model uses [sentence-transformers/paraphrase-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2) as the Sentence Transformer embedding model. A MultiOutputClassifier instance is used for classification.
|
116 |
+
|
117 |
+
The model has been trained using an efficient few-shot learning technique that involves:
|
118 |
+
|
119 |
+
1. Fine-tuning a [Sentence Transformer](https://www.sbert.net) with contrastive learning.
|
120 |
+
2. Training a classification head with features from the fine-tuned Sentence Transformer.
|
121 |
+
|
122 |
+
## Model Details
|
123 |
+
|
124 |
+
### Model Description
|
125 |
+
- **Model Type:** SetFit
|
126 |
+
- **Sentence Transformer body:** [sentence-transformers/paraphrase-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-mpnet-base-v2)
|
127 |
+
- **Classification head:** a MultiOutputClassifier instance
|
128 |
+
- **Maximum Sequence Length:** 512 tokens
|
129 |
+
<!-- - **Number of Classes:** Unknown -->
|
130 |
+
<!-- - **Training Dataset:** [Unknown](https://huggingface.co/datasets/unknown) -->
|
131 |
+
<!-- - **Language:** Unknown -->
|
132 |
+
<!-- - **License:** Unknown -->
|
133 |
+
|
134 |
+
### Model Sources
|
135 |
+
|
136 |
+
- **Repository:** [SetFit on GitHub](https://github.com/huggingface/setfit)
|
137 |
+
- **Paper:** [Efficient Few-Shot Learning Without Prompts](https://arxiv.org/abs/2209.11055)
|
138 |
+
- **Blogpost:** [SetFit: Efficient Few-Shot Learning Without Prompts](https://huggingface.co/blog/setfit)
|
139 |
+
|
140 |
+
## Uses
|
141 |
+
|
142 |
+
### Direct Use for Inference
|
143 |
+
|
144 |
+
First install the SetFit library:
|
145 |
+
|
146 |
+
```bash
|
147 |
+
pip install setfit
|
148 |
+
```
|
149 |
+
|
150 |
+
Then you can load this model and run inference.
|
151 |
+
|
152 |
+
```python
|
153 |
+
from setfit import SetFitModel
|
154 |
+
|
155 |
+
# Download from the 🤗 Hub
|
156 |
+
model = SetFitModel.from_pretrained("ignaciosg/blueCarbon")
|
157 |
+
# Run inference
|
158 |
+
preds = model("featured application the findings obtained from this study have implications for global blue carbon budgeting. abstract field monitoring and incubation experiments were conducted to evaluate the litter yield and examine the decomposition of the litter of three representative mangrove species frequently used for mangrove re vegetation in subtropical mudflat on the south china coast. the results show that the litter yield of the investigated mangrove species varied significantly from season to season. the annual litter production was in the following decreasing order heritiera littoralis thespesia populnea kandelia obovata. initially, rapid decomposition of easily degradable components of the litter materials resulted in marked weight loss of the mangrove litter. there was good linear relationship between the length of field incubation time and the litter decomposition rate for both the branch and the leaf portion of the three investigated mangrove species. approximately 50 or more of the added mangrove litter could be decomposed within one year and the decomposed litter could be incorporated into the underlying soils and consequently affect the soil carbon dynamics. an annual soil carbon increase from .37 to .64 kg in the top cm of the soil was recorded for the investigated mangrove species.")
|
159 |
+
```
|
160 |
+
|
161 |
+
<!--
|
162 |
+
### Downstream Use
|
163 |
+
|
164 |
+
*List how someone could finetune this model on their own dataset.*
|
165 |
+
-->
|
166 |
+
|
167 |
+
<!--
|
168 |
+
### Out-of-Scope Use
|
169 |
+
|
170 |
+
*List how the model may foreseeably be misused and address what users ought not to do with the model.*
|
171 |
+
-->
|
172 |
+
|
173 |
+
<!--
|
174 |
+
## Bias, Risks and Limitations
|
175 |
+
|
176 |
+
*What are the known or foreseeable issues stemming from this model? You could also flag here known failure cases or weaknesses of the model.*
|
177 |
+
-->
|
178 |
+
|
179 |
+
<!--
|
180 |
+
### Recommendations
|
181 |
+
|
182 |
+
*What are recommendations with respect to the foreseeable issues? For example, filtering explicit content.*
|
183 |
+
-->
|
184 |
+
|
185 |
+
## Training Details
|
186 |
+
|
187 |
+
### Training Set Metrics
|
188 |
+
| Training set | Min | Mean | Max |
|
189 |
+
|:-------------|:----|:---------|:----|
|
190 |
+
| Word count | 80 | 236.0127 | 453 |
|
191 |
+
|
192 |
+
### Training Hyperparameters
|
193 |
+
- batch_size: (1, 1)
|
194 |
+
- num_epochs: (1, 1)
|
195 |
+
- max_steps: 1
|
196 |
+
- sampling_strategy: oversampling
|
197 |
+
- num_iterations: 1
|
198 |
+
- body_learning_rate: (2e-05, 1e-05)
|
199 |
+
- head_learning_rate: 0.01
|
200 |
+
- loss: CosineSimilarityLoss
|
201 |
+
- distance_metric: cosine_distance
|
202 |
+
- margin: 0.25
|
203 |
+
- end_to_end: False
|
204 |
+
- use_amp: False
|
205 |
+
- warmup_proportion: 0.1
|
206 |
+
- max_length: 750
|
207 |
+
- seed: 42
|
208 |
+
- eval_max_steps: 1
|
209 |
+
- load_best_model_at_end: False
|
210 |
+
|
211 |
+
### Training Results
|
212 |
+
| Epoch | Step | Training Loss | Validation Loss |
|
213 |
+
|:------:|:----:|:-------------:|:---------------:|
|
214 |
+
| 0.0001 | 1 | 0.2289 | - |
|
215 |
+
|
216 |
+
### Framework Versions
|
217 |
+
- Python: 3.10.12
|
218 |
+
- SetFit: 1.0.3
|
219 |
+
- Sentence Transformers: 2.3.1
|
220 |
+
- Transformers: 4.35.2
|
221 |
+
- PyTorch: 2.1.0+cu121
|
222 |
+
- Datasets: 2.17.1
|
223 |
+
- Tokenizers: 0.15.2
|
224 |
+
|
225 |
+
## Citation
|
226 |
+
|
227 |
+
### BibTeX
|
228 |
+
```bibtex
|
229 |
+
@article{https://doi.org/10.48550/arxiv.2209.11055,
|
230 |
+
doi = {10.48550/ARXIV.2209.11055},
|
231 |
+
url = {https://arxiv.org/abs/2209.11055},
|
232 |
+
author = {Tunstall, Lewis and Reimers, Nils and Jo, Unso Eun Seo and Bates, Luke and Korat, Daniel and Wasserblat, Moshe and Pereg, Oren},
|
233 |
+
keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences},
|
234 |
+
title = {Efficient Few-Shot Learning Without Prompts},
|
235 |
+
publisher = {arXiv},
|
236 |
+
year = {2022},
|
237 |
+
copyright = {Creative Commons Attribution 4.0 International}
|
238 |
+
}
|
239 |
+
```
|
240 |
+
|
241 |
+
<!--
|
242 |
+
## Glossary
|
243 |
+
|
244 |
+
*Clearly define terms in order to be accessible across audiences.*
|
245 |
+
-->
|
246 |
+
|
247 |
+
<!--
|
248 |
+
## Model Card Authors
|
249 |
+
|
250 |
+
*Lists the people who create the model card, providing recognition and accountability for the detailed work that goes into its construction.*
|
251 |
+
-->
|
252 |
+
|
253 |
+
<!--
|
254 |
+
## Model Card Contact
|
255 |
+
|
256 |
+
*Provides a way for people who have updates to the Model Card, suggestions, or questions, to contact the Model Card authors.*
|
257 |
+
-->
|
config.json
ADDED
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "sentence-transformers/paraphrase-mpnet-base-v2",
|
3 |
+
"architectures": [
|
4 |
+
"MPNetModel"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-05,
|
15 |
+
"max_position_embeddings": 514,
|
16 |
+
"model_type": "mpnet",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 1,
|
20 |
+
"relative_attention_num_buckets": 32,
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"transformers_version": "4.35.2",
|
23 |
+
"vocab_size": 30527
|
24 |
+
}
|
config_sentence_transformers.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"__version__": {
|
3 |
+
"sentence_transformers": "2.0.0",
|
4 |
+
"transformers": "4.7.0",
|
5 |
+
"pytorch": "1.9.0+cu102"
|
6 |
+
}
|
7 |
+
}
|
config_setfit.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"normalize_embeddings": false,
|
3 |
+
"labels": null
|
4 |
+
}
|
model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f839cf4fdde8eff477d7f56a42186948f5e236e0c5350b9b8685d7f810b8813
|
3 |
+
size 437967672
|
model_head.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:115e3cd3f7169d7400758edf60c67d23df246656b17016193fc2044daaefe498
|
3 |
+
size 1460265
|
modules.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"idx": 0,
|
4 |
+
"name": "0",
|
5 |
+
"path": "",
|
6 |
+
"type": "sentence_transformers.models.Transformer"
|
7 |
+
},
|
8 |
+
{
|
9 |
+
"idx": 1,
|
10 |
+
"name": "1",
|
11 |
+
"path": "1_Pooling",
|
12 |
+
"type": "sentence_transformers.models.Pooling"
|
13 |
+
}
|
14 |
+
]
|
sentence_bert_config.json
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"max_seq_length": 512,
|
3 |
+
"do_lower_case": false
|
4 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"cls_token": {
|
10 |
+
"content": "<s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": true,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"eos_token": {
|
17 |
+
"content": "</s>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
},
|
23 |
+
"mask_token": {
|
24 |
+
"content": "<mask>",
|
25 |
+
"lstrip": true,
|
26 |
+
"normalized": false,
|
27 |
+
"rstrip": false,
|
28 |
+
"single_word": false
|
29 |
+
},
|
30 |
+
"pad_token": {
|
31 |
+
"content": "<pad>",
|
32 |
+
"lstrip": false,
|
33 |
+
"normalized": false,
|
34 |
+
"rstrip": false,
|
35 |
+
"single_word": false
|
36 |
+
},
|
37 |
+
"sep_token": {
|
38 |
+
"content": "</s>",
|
39 |
+
"lstrip": false,
|
40 |
+
"normalized": true,
|
41 |
+
"rstrip": false,
|
42 |
+
"single_word": false
|
43 |
+
},
|
44 |
+
"unk_token": {
|
45 |
+
"content": "[UNK]",
|
46 |
+
"lstrip": false,
|
47 |
+
"normalized": false,
|
48 |
+
"rstrip": false,
|
49 |
+
"single_word": false
|
50 |
+
}
|
51 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "<s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<pad>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"2": {
|
20 |
+
"content": "</s>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"104": {
|
28 |
+
"content": "[UNK]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"30526": {
|
36 |
+
"content": "<mask>",
|
37 |
+
"lstrip": true,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"bos_token": "<s>",
|
45 |
+
"clean_up_tokenization_spaces": true,
|
46 |
+
"cls_token": "<s>",
|
47 |
+
"do_basic_tokenize": true,
|
48 |
+
"do_lower_case": true,
|
49 |
+
"eos_token": "</s>",
|
50 |
+
"mask_token": "<mask>",
|
51 |
+
"model_max_length": 512,
|
52 |
+
"never_split": null,
|
53 |
+
"pad_token": "<pad>",
|
54 |
+
"sep_token": "</s>",
|
55 |
+
"strip_accents": null,
|
56 |
+
"tokenize_chinese_chars": true,
|
57 |
+
"tokenizer_class": "MPNetTokenizer",
|
58 |
+
"unk_token": "[UNK]"
|
59 |
+
}
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|