from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModel
from optimum.pipelines import pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction
from pathlib import Path
import time
import os
import torch


def mean_pooling(model_output):
    """Average token embeddings into a single sentence embedding.

    Args:
        model_output: Nested sequence whose first element is the token
            embedding matrix — a list of per-token vectors, each a list of
            floats. (Presumably pipeline output of shape
            [1, tokens, dim] — TODO confirm against the extractor.)

    Returns:
        A list of floats of length ``dim``: the element-wise mean over all
        token vectors. Returns ``[]`` when there are no tokens.
    """
    token_embeddings = model_output[0]
    num_tokens = len(token_embeddings)
    # zip(*rows) transposes [tokens, dim] -> dim columns; summing each
    # column and dividing once replaces the original nested index loops.
    return [col_sum / num_tokens for col_sum in map(sum, zip(*token_embeddings))]


class EndpointHandler():
    """Inference endpoint handler that embeds sentences via an ONNX model.

    Loads the msmarco-MiniLM-L-6-v3 tokenizer and its ONNX export, and
    serves mean-pooled sentence embeddings through a HuggingFace
    feature-extraction pipeline.
    """

    def __init__(self, path=""):
        # `path` is part of the handler contract but unused here: the
        # tokenizer and model are always fetched from the Hub by name.
        task = "feature-extraction"
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
        model_regular = ORTModelForFeatureExtraction.from_pretrained("jpohhhh/msmarco-MiniLM-L-6-v3_onnx", from_transformers=False)
        self.onnx_extractor = pipeline(task, model=model_regular, tokenizer=self.tokenizer)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj: `str` | `PIL.Image` | `np.array`)
            kwargs
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        # Fall back to treating the whole payload as the sentence list when
        # no "inputs" key is present (matches the original behavior).
        sentences = data.pop("inputs", data)
        sentence_embeddings = []
        for sentence in sentences:
            # no_grad is kept defensively; ONNX Runtime inference does not
            # build a torch autograd graph anyway.
            with torch.no_grad():
                model_output = self.onnx_extractor(sentence)
            sentence_embeddings.append(mean_pooling(model_output))
        return sentence_embeddings