tim1900 committed
Commit d317588 · verified · 1 Parent(s): 036c618
Files changed (7)
  1. README.md +102 -3
  2. config.json +26 -0
  3. model.safetensors +3 -0
  4. special_tokens_map.json +37 -0
  5. tokenizer.json +0 -0
  6. tokenizer_config.json +64 -0
  7. vocab.txt +0 -0
README.md CHANGED
@@ -1,3 +1,102 @@
- ---
- license: mit
- ---
+ ---
+ license: apache-2.0
+ language:
+ - en
+ - zh
+ pipeline_tag: token-classification
+ ---
+ # bert-chunker-0.2
+
+
+ ## Introduction
+
+ bert-chunker-0.2 is a text chunker based on BERT, with a classifier head that predicts the start token of each chunk (for use in RAG, etc.). Using a sliding window, it cuts documents of any size into chunks. It was finetuned on top of [nreimers/MiniLM-L6-H384-uncased](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2).
+
+ bert-chunker-0.2 is a new, experimental version of bert-chunker. It is better adapted to article structure, striking a balance between semantic chunking and structural chunking: its weights are a 0.1:0.9 linear merge of a trained semantic chunker and a trained structural chunker.
+
+
+ ## Quickstart
+ Run the following:
+
+ ```python
+ import torch
+ from transformers import AutoConfig, AutoTokenizer, BertForTokenClassification
+ import math
+ model_path = "tim1900/bert-chunker-0.2"
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_path,
+     padding_side="right",
+     model_max_length=255,
+     trust_remote_code=True,
+ )
+ config = AutoConfig.from_pretrained(
+     model_path,
+     trust_remote_code=True,
+ )
+ device = 'cpu'
+ model = BertForTokenClassification.from_pretrained(model_path).to(device)
+ def chunk_text(model, text: str, tokenizer, prob_threshold=0.5) -> tuple[list[str], list[int]]:
+     # slide a context window over the whole token sequence
+     MAX_TOKENS = 255
+     tokens = tokenizer(text, return_tensors="pt", truncation=False)
+     input_ids = tokens['input_ids']
+     attention_mask = tokens['attention_mask'][:, 0:MAX_TOKENS]
+     attention_mask = attention_mask.to(model.device)
+     CLS = input_ids[:, 0].unsqueeze(0)
+     SEP = input_ids[:, -1].unsqueeze(0)
+     input_ids = input_ids[:, 1:-1]
+     model.eval()
+     split_str_poses = []
+
+     token_pos = []
+
+     windows_start = 0
+     windows_end = 0
+     logits_threshold = math.log(1 / prob_threshold - 1)  # probability threshold -> logit-difference threshold (inverse sigmoid)
+
+     print(f'Processing {input_ids.shape[1]} tokens...')
+     while windows_end <= input_ids.shape[1]:
+         windows_end = windows_start + MAX_TOKENS - 2
+
+         ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
+
+         ids = ids.to(model.device)
+
+         output = model(input_ids=ids, attention_mask=torch.ones(1, ids.shape[1], device=model.device))
+         logits = output['logits'][:, 1:-1, :]
+         chunk_decision = (logits[:, :, 1] > (logits[:, :, 0] - logits_threshold))  # a token starts a chunk if P(start) > prob_threshold
+         greater_rows_indices = torch.where(chunk_decision)[1].tolist()
+
+         # any chunk starts found in this window?
+         if len(greater_rows_indices) > 0 and (not (greater_rows_indices[0] == 0 and len(greater_rows_indices) == 1)):
+
+             split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices]
+             token_pos += [sp + windows_start + 1 for sp in greater_rows_indices]
+             split_str_poses += split_str_pos
+
+             windows_start = greater_rows_indices[-1] + windows_start
+
+         else:
+
+             windows_start = windows_end
+
+     substrings = [text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])]
+     token_pos = [0] + token_pos
+     return substrings, token_pos
+
+
+
+ text = '''In the heart of the bustling city, where towering skyscrapers touch the clouds and the symphony
+ of honking cars never ceases, Sarah, an aspiring novelist, found solace in the quiet corners of the ancient library
+ Surrounded by shelves that whispered stories of centuries past, she crafted her own world with words, oblivious to the rush outside Dr.Alexander Thompson, aboard the spaceship 'Pandora's Venture', was en route to the newly discovered exoplanet Zephyr-7.
+ As the lead astrobiologist of the expedition, his mission was to uncover signs of microbial life within the planet's subterranean ice caves.
+ With each passing light year, the anticipation of unraveling secrets that could alter humanity's
+ understanding of life in the universe grew ever stronger.'''
+
+ chunks, token_pos = chunk_text(model, text, tokenizer, prob_threshold=0.5)
+
+ # print chunks
+ for i, (c, t) in enumerate(zip(chunks, token_pos)):
+     print(f'-----chunk: {i}----token_idx: {t}--------')
+     print(c)
+ ```
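The 0.1:0.9 linear weight merge mentioned in the README introduction is not shipped as a script in this commit. As a rough illustration only, the sketch below shows what such a merge could look like as a weighted average of matching state-dict tensors; the checkpoint names `semantic_chunker` and `structure_chunker` are hypothetical, not files from this repository.

```python
# Hypothetical sketch of a 0.1:0.9 linear weight merge between two
# same-architecture BERT token-classification checkpoints. This is NOT the
# script used to produce bert-chunker-0.2; paths and details are assumptions.
from transformers import BertForTokenClassification

semantic = BertForTokenClassification.from_pretrained("semantic_chunker")    # hypothetical checkpoint
structure = BertForTokenClassification.from_pretrained("structure_chunker")  # hypothetical checkpoint

sd_sem, sd_str = semantic.state_dict(), structure.state_dict()
merged_state = {}
for name, t_sem in sd_sem.items():
    t_str = sd_str[name]
    if t_sem.is_floating_point():
        # 0.1 * semantic chunker + 0.9 * structure chunker
        merged_state[name] = 0.1 * t_sem + 0.9 * t_str
    else:
        # integer buffers (if any) are copied rather than interpolated
        merged_state[name] = t_str

merged = BertForTokenClassification.from_pretrained("structure_chunker")  # reuse the shared architecture
merged.load_state_dict(merged_state)
merged.save_pretrained("merged_chunker")
```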
config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "/data/pretrained_model",
+   "architectures": [
+     "BertForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 384,
+   "initializer_range": 0.02,
+   "intermediate_size": 1536,
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 6,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.46.3",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
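For orientation, the config above describes a compact MiniLM-sized BERT encoder (6 layers, hidden size 384, 12 attention heads, 512 position embeddings) that `BertForTokenClassification` tops with a token-classification head; the Quickstart code reads two logits per token (chunk start vs. not). A quick sanity check, assuming the published repo id `tim1900/bert-chunker-0.2`:

```python
from transformers import AutoConfig

# Load the config shipped in this commit and print the key architecture fields.
config = AutoConfig.from_pretrained("tim1900/bert-chunker-0.2")
print(config.model_type)               # bert
print(config.num_hidden_layers)        # 6
print(config.hidden_size)              # 384
print(config.max_position_embeddings)  # 512 (the Quickstart uses a 255-token sliding window)
```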
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a87cabf3305e1e1a41712a73829c4a4305fb6a7df0e3487aac02afb2e636994
+ size 90276408
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "max_length": 128,
+   "model_max_length": 255,
+   "never_split": null,
+   "pad_to_multiple_of": null,
+   "pad_token": "[PAD]",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "[SEP]",
+   "stride": 0,
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "[UNK]"
+ }
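Note that `model_max_length` is set to 255 in the tokenizer config above, matching the `MAX_TOKENS = 255` sliding window in the README's `chunk_text` example. A minimal check, assuming the published repo id `tim1900/bert-chunker-0.2`:

```python
from transformers import AutoTokenizer

# The tokenizer picks up its length limit and special tokens from
# tokenizer_config.json and special_tokens_map.json added in this commit.
tokenizer = AutoTokenizer.from_pretrained("tim1900/bert-chunker-0.2")
print(tokenizer.model_max_length)                # 255
print(tokenizer.cls_token, tokenizer.sep_token)  # [CLS] [SEP]
```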
vocab.txt ADDED
The diff for this file is too large to render. See raw diff