jpohhhh committed
Commit 59199c3 · 1 Parent(s): 4016518

Update handler.py

Files changed (1)
  1. handler.py +8 -9
handler.py CHANGED
@@ -1,5 +1,7 @@
 from typing import Dict, List, Any
 from transformers import AutoTokenizer, AutoModel
+from optimum.onnxruntime.modeling_ort import ORTModelForCustomTasks
+
 import torch
 
 # Mean Pooling - Take attention mask into account for correct averaging
@@ -11,10 +13,10 @@ def mean_pooling(model_output, attention_mask):
 class EndpointHandler():
     def __init__(self, path=""):
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
-        self.model = AutoModel.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
-        self.model.to(self.device)
-        print("model will run on ", self.device)
+        self.model = ORTModelForCustomTasks.from_pretrained("optimum/sbert-all-MiniLM-L6-with-pooler")
+        self.tokenizer = AutoTokenizer.from_pretrained("optimum/sbert-all-MiniLM-L6-with-pooler")
+        # self.model.to(self.device)
+        # print("model will run on ", self.device)
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """
@@ -25,11 +27,8 @@ class EndpointHandler():
         A :obj:`list` | `dict`: will be serialized and returned
         """
         sentences = data.pop("inputs", data)
-        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-        encoded_input = {key: value.to(self.device) for key, value in encoded_input.items()}
-        # Compute token embeddings
-        with torch.no_grad():
-            model_output = self.model(**encoded_input)
+        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
+        model_output = self.model(**encoded_input)
 
         # Perform pooling. In this case, mean pooling.
         sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
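The diff context cuts off the body of mean_pooling; only its signature appears in the second hunk header. For reference, a minimal sketch of the usual sentence-transformers mean-pooling recipe (an assumption; the exact lines in this repo fall outside the diff context):

    import torch

    # Average the token embeddings, weighting by the attention mask so that
    # padding tokens do not contribute to the sentence embedding.
    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # token-level embeddings (last hidden state)
        mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)

And a quick local smoke test for the updated handler. This is a sketch, not part of the commit: it assumes optimum[onnxruntime] is installed, the file above is saved as handler.py, and the tail of __call__ (truncated in the diff) returns the pooled embeddings.

    from handler import EndpointHandler

    # Inference Endpoints pass payloads of the shape {"inputs": ...}.
    handler = EndpointHandler()
    embeddings = handler({"inputs": ["I love burritos!", "ONNX Runtime keeps CPU inference fast."]})
    print(len(embeddings))  # expect one embedding per input sentence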