new
- README.md +102 -3
- config.json +26 -0
- model.safetensors +3 -0
- special_tokens_map.json +37 -0
- tokenizer.json +0 -0
- tokenizer_config.json +64 -0
- vocab.txt +0 -0
README.md
CHANGED
@@ -1,3 +1,102 @@
- ---
- license:
-
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ pipeline_tag: token-classification
+ ---
+ # bert-chunker-0.2
+
+
+ ## Introduction
+
+ bert-chunker-0.2 is a text chunker based on BERT with a classifier head that predicts the start token of each chunk (for use in RAG, etc.). Using a sliding window, it cuts documents of any size into chunks. It was finetuned on top of [nreimers/MiniLM-L6-H384-uncased](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).
+
+ bert-chunker-0.2 is a new experimental version of bert-chunker. It is enhanced for article structures, striking a balance between semantic chunking and structural chunking: its weights are a 0.1:0.9 linear merge of a trained semantic chunker and a trained structural chunker.
+
+
+ ## Quickstart
+ Run the following:
+
+ ```python
+ import torch
+ from transformers import AutoConfig, AutoTokenizer, BertForTokenClassification
+ import math
+ model_path = "tim1900/bert-chunker-0.2"
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_path,
+     padding_side="right",
+     model_max_length=255,
+     trust_remote_code=True,
+ )
+ config = AutoConfig.from_pretrained(
+     model_path,
+     trust_remote_code=True,
+ )
+ device = 'cpu'
+ model = BertForTokenClassification.from_pretrained(model_path).to(device)
+ def chunk_text(model, text: str, tokenizer, prob_threshold=0.5) -> tuple[list[str], list[int]]:
+     # Slide a context window over the document and collect predicted chunk starts.
+     MAX_TOKENS = 255
+     tokens = tokenizer(text, return_tensors="pt", truncation=False)
+     input_ids = tokens['input_ids']
+     attention_mask = tokens['attention_mask'][:, 0:MAX_TOKENS]
+     attention_mask = attention_mask.to(model.device)
+     CLS = input_ids[:, 0].unsqueeze(0)
+     SEP = input_ids[:, -1].unsqueeze(0)
+     input_ids = input_ids[:, 1:-1]
+     model.eval()
+     split_str_poses = []
+
+     token_pos = []
+
+     windows_start = 0
+     windows_end = 0
+     logits_threshold = math.log(1 / prob_threshold - 1)  # probability threshold expressed as a logit margin
+
+     print(f'Processing {input_ids.shape[1]} tokens...')
+     while windows_end <= input_ids.shape[1]:
+         windows_end = windows_start + MAX_TOKENS - 2
+
+         ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)  # re-attach [CLS] and [SEP]
+
+         ids = ids.to(model.device)
+
+         output = model(input_ids=ids, attention_mask=torch.ones(1, ids.shape[1], device=model.device))
+         logits = output['logits'][:, 1:-1, :]
+         chunk_decision = logits[:, :, 1] > (logits[:, :, 0] - logits_threshold)
+         greater_rows_indices = torch.where(chunk_decision)[1].tolist()
+
+         # Did this window contain any chunk starts (beyond its very first token)?
+         if len(greater_rows_indices) > 0 and not (greater_rows_indices[0] == 0 and len(greater_rows_indices) == 1):
+
+             split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices]
+             token_pos += [sp + windows_start + 1 for sp in greater_rows_indices]
+             split_str_poses += split_str_pos
+
+             windows_start = greater_rows_indices[-1] + windows_start  # restart at the last detected chunk start
+
+         else:
+
+             windows_start = windows_end
+
+     substrings = [text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])]
+     token_pos = [0] + token_pos
+     return substrings, token_pos
+
+
+
+ text = '''In the heart of the bustling city, where towering skyscrapers touch the clouds and the symphony
+ of honking cars never ceases, Sarah, an aspiring novelist, found solace in the quiet corners of the ancient library
+ Surrounded by shelves that whispered stories of centuries past, she crafted her own world with words, oblivious to the rush outside Dr.Alexander Thompson, aboard the spaceship 'Pandora's Venture', was en route to the newly discovered exoplanet Zephyr-7.
+ As the lead astrobiologist of the expedition, his mission was to uncover signs of microbial life within the planet's subterranean ice caves.
+ With each passing light year, the anticipation of unraveling secrets that could alter humanity's
+ understanding of life in the universe grew ever stronger.'''
+
+ chunks, token_pos = chunk_text(model, text, tokenizer, prob_threshold=0.5)
+
+ # Print each chunk together with the token index where it starts.
+ for i, (c, t) in enumerate(zip(chunks, token_pos)):
+     print(f'-----chunk: {i}----token_idx: {t}--------')
+     print(c)
+ ```
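The `logits_threshold = math.log(1 / prob_threshold - 1)` line in the quickstart converts `prob_threshold` into a margin on the raw class logits, so no softmax is needed inside the loop: a token is marked as a chunk start exactly when its softmax probability for the "start" class exceeds `prob_threshold`. A minimal check of that equivalence (the `l0`/`l1` values below are made-up example logits, not model output):

```python
import math
import torch

prob_threshold = 0.7
logits_threshold = math.log(1 / prob_threshold - 1)

# Hypothetical logits for one token: class 0 = "not a chunk start", class 1 = "chunk start".
l0, l1 = torch.tensor(1.3), torch.tensor(1.9)

p_start = torch.sigmoid(l1 - l0)  # softmax probability of the "chunk start" class
assert (p_start > prob_threshold) == (l1 > l0 - logits_threshold)
```

Raising `prob_threshold` therefore demands a larger logit margin, which yields fewer split points and longer chunks.

The Introduction also states that the released weights are a 0.1:0.9 linear merge of a trained semantic chunker and a trained structural chunker. Those two source checkpoints are not part of this commit; the sketch below only illustrates what such a merge looks like, with hypothetical local paths standing in for them and both checkpoints assumed to share the exact same architecture:

```python
# Illustrative sketch of a 0.1:0.9 linear weight merge; "semantic_chunker" and
# "structure_chunker" are hypothetical local checkpoints, not published models.
from transformers import BertForTokenClassification

semantic = BertForTokenClassification.from_pretrained("semantic_chunker")
structure = BertForTokenClassification.from_pretrained("structure_chunker")

merged_state = {
    name: 0.1 * param + 0.9 * structure.state_dict()[name]
    for name, param in semantic.state_dict().items()
}
structure.load_state_dict(merged_state)
structure.save_pretrained("bert-chunker-merged")
```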
config.json
ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "/data/pretrained_model",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a87cabf3305e1e1a41712a73829c4a4305fb6a7df0e3487aac02afb2e636994
+ size 90276408
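As a rough cross-check (my arithmetic, not something stated in the repository): the config above describes a 6-layer encoder with hidden size 384, intermediate size 1536, a 30,522-token vocabulary and, per the quickstart code, 2 output labels. That works out to about 22.6M parameters, i.e. roughly 90 MB in float32, which is consistent with the 90,276,408-byte model.safetensors file (the small remainder is the safetensors header):

```python
# Back-of-the-envelope parameter count for the config above (float32 weights).
hidden, inter, layers, vocab, max_pos, types, labels = 384, 1536, 6, 30522, 512, 2, 2

embeddings = (vocab + max_pos + types) * hidden + 2 * hidden  # word/position/type embeddings + LayerNorm
per_layer = (
    4 * (hidden * hidden + hidden)   # Q, K, V and attention output projections
    + (hidden * inter + inter)       # feed-forward up-projection
    + (inter * hidden + hidden)      # feed-forward down-projection
    + 2 * 2 * hidden                 # two LayerNorms
)
classifier = hidden * labels + labels  # token-classification head

total = embeddings + layers * per_layer + classifier
print(total, total * 4)  # ~22.57M parameters, ~90.26 MB as float32
```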
special_tokens_map.json
ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer_config.json
ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "max_length": 128,
+   "model_max_length": 255,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
vocab.txt
ADDED
The diff for this file is too large to render.