jpohhhh
/

msmarco-MiniLM-L-6-v3_onnx

sentence-embeddings

endpoints-template

Inference Endpoints

Model card Files Files and versions Community

msmarco-MiniLM-L-6-v3_onnx / handler.py

jpohhhh's picture

Try GPT4 suggestions

8af4da5 over 1 year ago

2.21 kB

	from typing import Dict, List, Any
	from transformers import AutoTokenizer, AutoModel
	from optimum.pipelines import pipeline
	from optimum.onnxruntime import ORTModelForFeatureExtraction
	from pathlib import Path
	from multiprocessing import Pool
	import time

	import os
	import torch

	def mean_pooling2(model_output):
	    """Mean-pool token embeddings over the token axis using torch.

	    Args:
	        model_output: Indexable object whose first element holds the token
	            embeddings, either as a ``torch.Tensor`` or as nested sequences
	            of floats. The second-to-last axis is treated as the token axis
	            and the last axis as the feature axis, i.e. ``(tokens, features)``
	            or ``(batch, tokens, features)``.

	    Returns:
	        torch.Tensor: ``model_output[0]`` averaged over its token axis, so
	        the result has shape ``(features,)`` or ``(batch, features)``.
	    """
	    embeddings = model_output[0]
	    if not isinstance(embeddings, torch.Tensor):
	        # torch.mean() only accepts tensors; nested Python lists have to be
	        # converted first. Existing tensors are left untouched (no dtype or
	        # device change).
	        embeddings = torch.as_tensor(embeddings, dtype=torch.get_default_dtype())
	    # dim=-2 is the token axis for both layouts: for batched 3-D input it is
	    # the same axis as dim=1, and for 2-D input it averages over tokens (one
	    # value per feature), matching what mean_pooling() computes.
	    return torch.mean(embeddings, dim=-2)


	def mean_pooling(model_output):
	    """Average the token vectors in ``model_output[0]`` feature by feature.

	    Pure-Python mean pooling: no tensor library is needed, and the result is
	    a plain list.

	    Args:
	        model_output: Indexable object whose first element is a sequence of
	            token vectors (each a sequence of numbers of equal length).

	    Returns:
	        list: One mean per feature position, i.e. a list as long as a single
	        token vector. An empty list is returned when there are no token
	        vectors to average.
	    """
	    token_vectors = model_output[0]
	    token_count = len(token_vectors)
	    # zip(*rows) walks the vectors column by column, so each `column` holds
	    # the values of one feature across all tokens. With zero rows it yields
	    # nothing, which makes the empty case return [] without dividing by zero.
	    return [sum(column) / token_count for column in zip(*token_vectors)]


	class EndpointHandler():
	    """Inference handler that turns sentences into mean-pooled embeddings.

	    The handler builds a ``feature-extraction`` pipeline around an ONNX model
	    and embeds each incoming sentence on a small pool of worker threads.
	    """

	    def __init__(self, path=""):
	        """Load the tokenizer and ONNX model and start the worker pool.

	        Args:
	            path: Accepted but not used; the tokenizer and model are loaded
	                from the hard-coded hub identifiers below.
	        """
	        # Imported locally so this class does not depend on a change to the
	        # module-level import block.
	        from multiprocessing.pool import ThreadPool

	        task = "feature-extraction"
	        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
	        model_regular = ORTModelForFeatureExtraction.from_pretrained("jpohhhh/msmarco-MiniLM-L-6-v3_onnx", from_transformers=False)
	        self.onnx_extractor = pipeline(task, model=model_regular, tokenizer=self.tokenizer)
	        # A process pool cannot run a bound method of this object: dispatching
	        # it requires pickling `self`, and `self` holds the pool itself, which
	        # refuses to be pickled. Threads share the already-loaded pipeline and
	        # need no pickling at all.
	        # NOTE(review): assumes the pipeline/tokenizer tolerate concurrent
	        # calls from several threads - confirm under load.
	        self.pool = ThreadPool(4)

	    def process_sentence(self, sentence):  # Factored out for parallelization
	        """Embed one sentence and mean-pool its token vectors.

	        Args:
	            sentence: Text passed straight to the feature-extraction pipeline.

	        Returns:
	            list: The mean-pooled embedding as a plain list of floats.
	        """
	        with torch.no_grad():
	            model_output = self.onnx_extractor(sentence)
	        # The pipeline's default output is nested Python lists rather than
	        # tensors, so use the list-based pooling helper; its plain-list result
	        # can also be serialized in the response as-is.
	        return mean_pooling(model_output)

	    def __call__(self, data: Dict[str, Any]) -> List[List[float]]:
	        """Compute one embedding per input sentence.

	        Args:
	            data: Request payload. The ``"inputs"`` entry is removed from it
	                and used as the sentence(s) to embed: either a single string
	                or an iterable of strings. If the key is absent, ``data``
	                itself is used as the iterable.

	        Returns:
	            A list with one embedding per sentence, in input order.
	        """
	        sentences = data.pop("inputs", data)
	        if isinstance(sentences, str):
	            # A bare string would otherwise be iterated character by
	            # character; treat it as a single sentence.
	            sentences = [sentences]
	        # Compute embeddings in parallel; map() preserves input order.
	        sentence_embeddings = self.pool.map(self.process_sentence, sentences)
	        return sentence_embeddings