---
pipeline_tag: feature-extraction
tags:
- bge-m3
- onnx
---

Based on `aapot/bge-m3-onnx` and `philipchung/bge-m3-onnx`.

## Deploy with tritonserver

- Folder structure

```
.
├── model_repository
│   └── bge-m3
│       ├── 1
│       │   ├── model.onnx
│       │   └── model.onnx.data
│       └── config.pbtxt
```
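
  One way to populate that layout is to pull the ONNX files from the Hub with `huggingface_hub`; a minimal sketch, where `repo_id` is a placeholder for this repository's actual id:

  ```python
  # Sketch: fetch the ONNX weights into the Triton layout shown above.
  # NOTE: repo_id is a placeholder; substitute this repository's actual id.
  from huggingface_hub import hf_hub_download

  for filename in ("model.onnx", "model.onnx.data"):
      hf_hub_download(
          repo_id="<this-repo-id>",
          filename=filename,
          local_dir="model_repository/bge-m3/1",
      )
  ```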

- `config.pbtxt` file

```
name: "bge-m3"
backend: "onnxruntime"
max_batch_size: 4

input [
  {
    name: "input_ids"
    data_type: TYPE_INT64
    dims: [ -1 ]
  },
  {
    name: "attention_mask"
    data_type: TYPE_INT64
    dims: [ -1 ]
  }
]

output [
  {
    name: "dense_vecs"
    data_type: TYPE_FP32
    dims: [ 1024 ]
  },
  {
    name: "sparse_vecs"
    data_type: TYPE_FP32
    dims: [ -1, 1 ]
  },
  {
    name: "colbert_vecs"
    data_type: TYPE_FP32
    dims: [ -1, 1024 ]
  }
]
```
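
  If the server will see concurrent requests, Triton's dynamic batcher can merge them up to `max_batch_size`. An optional addition to `config.pbtxt`; the delay value here is illustrative, and note that by default only requests whose padded input shapes match are batched together:

  ```
  dynamic_batching {
    max_queue_delay_microseconds: 100
  }
  ```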

- Run with the tritonserver docker image

```bash
docker run --gpus all --rm \
  -p 8000:8000 -p 8001:8001 -p 8002:8002 \
  -v ./model_repository:/models \
  nvcr.io/nvidia/tritonserver:24.12-py3 \
  tritonserver --model-repository=/models
```
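
  Once the container is up, the standard KServe v2 HTTP endpoints on port 8000 can confirm the model loaded, e.g. with `curl`:

  ```bash
  # Returns 200 once the server is ready to serve requests
  curl -fsS localhost:8000/v2/health/ready && echo "server ready"
  # Dumps the parsed model configuration for bge-m3
  curl -fsS localhost:8000/v2/models/bge-m3/config
  ```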

- Infer with `tritonclient`

```python
from typing import List

from datasets import load_dataset
from transformers import AutoTokenizer
from tritonclient.http import InferenceServerClient, InferInput

BS = 4
TOKENIZER_NAME = "BAAI/bge-m3"
TRITON_MODEL_NAME = "bge-m3"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
data: List[str] = [x["text"] for x in load_dataset("BeIR/scidocs", "corpus")["corpus"]]
batch = data[:BS]

client = InferenceServerClient("localhost:8000")

# Tokenize the whole batch at once so both inputs share the same padded shape.
tokenized = tokenizer(batch, padding=True, truncation=True, return_tensors="np")
input_ids, attention_mask = tokenized.input_ids, tokenized.attention_mask

inputs = [
    InferInput("input_ids", list(input_ids.shape), "INT64"),
    InferInput("attention_mask", list(attention_mask.shape), "INT64"),
]
inputs[0].set_data_from_numpy(input_ids)
inputs[1].set_data_from_numpy(attention_mask)

results = client.infer(TRITON_MODEL_NAME, inputs)

dense_vecs = results.as_numpy("dense_vecs")                # [batch, 1024]
sparse_vecs = results.as_numpy("sparse_vecs").squeeze(-1)  # [batch, seq_len]
colbert_vecs = results.as_numpy("colbert_vecs")            # [batch, seq_len, 1024]

output = {
    "dense_vecs": dense_vecs.tolist(),
    "sparse_vecs": sparse_vecs.tolist(),
    "colbert_vecs": colbert_vecs.tolist(),
}
print(output)
```
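
  The raw `sparse_vecs` are per-token activations aligned with the padded `input_ids`. To get lexical weights in the `{token: weight}` form FlagEmbedding returns, one possible sketch (assuming, as FlagEmbedding does, that the maximum weight is kept for repeated tokens):

  ```python
  # Sketch: map per-token sparse activations back to readable tokens.
  # Assumes sparse_vecs rows are aligned with the padded input_ids above.
  from collections import defaultdict

  def to_lexical_weights(ids, mask, weights, tokenizer):
      out = defaultdict(float)
      for tok_id, m, w in zip(ids, mask, weights):
          if m == 0 or w <= 0:
              continue  # skip padding and non-activated tokens
          tok = tokenizer.convert_ids_to_tokens(int(tok_id))
          if tok in tokenizer.all_special_tokens:
              continue  # drop CLS/SEP/etc.
          out[tok] = max(out[tok], float(w))  # keep max weight per repeated token
      return dict(out)

  lexical = [
      to_lexical_weights(input_ids[i], attention_mask[i], sparse_vecs[i], tokenizer)
      for i in range(len(batch))
  ]
  print(lexical[0])
  ```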