File size: 3,033 Bytes

4016518
 
96355e1
a61e58e
dbdba21
75ae405
dbdba21
df00d4a
4016518
 
1e0a1be
 
 
 
 
 
 
 
 
 
 
 
 
 
4016518
 
 
 
 
 
 
 
df00d4a
 
b5cf395
 
 
96355e1
b20a254
f9063f4
 
b5cf395
cb578d2
59199c3
 
4016518
 
 
 
 
 
 
 
 
65555af
9ee4b8b
65555af
9ee4b8b
65555af
dccbfd3
65555af
75ae405
65555af
dccbfd3
 
bb069b4
65555af
 
dccbfd3
09aaf23
65555af
75ae405
a5e5b1d
65555af
dccbfd3

from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModel
from optimum.pipelines import pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction
from pathlib import Path
import time

import os
import torch

def max_pooling(model_output):
    """Return the per-feature maximum over the tokens of the first batch item.

    Args:
        model_output: Token embeddings laid out as (batch, tokens, features).
            May be a ``torch.Tensor`` or anything ``torch.as_tensor`` can
            convert (nested lists, NumPy arrays).

    Returns:
        A plain Python list of length ``features`` holding, for each feature,
        the largest value seen across all tokens of batch item 0.

    Raises:
        ValueError: If the input is not 3-dimensional.
    """
    embeddings = torch.as_tensor(model_output)
    if embeddings.dim() != 3:
        raise ValueError(
            "max_pooling expects a (batch, tokens, features) input, "
            f"got {embeddings.dim()} dimension(s)"
        )

    # Only the first batch item is pooled, as before.
    first_item = embeddings[0]
    token_count, feature_count = first_item.shape

    # A reduction over an empty axis is undefined; keep the historical
    # all-zero result for a sequence with no tokens.
    if token_count == 0:
        return [0.0] * feature_count

    # Reducing over the token axis (instead of starting every running maximum
    # at 0) gives the true maximum even when a column is entirely negative.
    return first_item.max(dim=0).values.tolist()
    
#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average token embeddings, weighting each token by its attention mask.

    Args:
        model_output: Indexable whose element 0 is the token-embedding tensor.
        attention_mask: Tensor with one entry per token; after a trailing axis
            is added it is expanded to the embedding tensor's size.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total over
        dimension 1 (clamped to at least 1e-9 to avoid division by zero).
    """
    embeddings = model_output[0]

    # Stretch the mask across the feature axis so it can weight every value.
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()

    weighted_sum = torch.sum(embeddings * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / mask_total
    
class EndpointHandler:
    """Inference handler that turns sentences into max-pooled embeddings.

    An ONNX feature-extraction pipeline is built once at construction time and
    reused for every request.
    """

    def __init__(self, path=""):
        """Load the tokenizer and ONNX model and build the extraction pipeline.

        Args:
            path: Directory handed to the handler at start-up. It is only used
                for the diagnostic listing below.
                NOTE(review): the tokenizer and model are loaded from fixed
                Hub identifiers, not from ``path`` - confirm this is intended.
        """
        print("HELLO THIS IS THE CWD:", os.getcwd())
        print("HELLO THIS IS THE PATH ARG:", path)
        # os.listdir("") raises FileNotFoundError, so the default argument
        # must not reach it; the listing is purely diagnostic.
        if path and os.path.isdir(path):
            for file in os.listdir(path):
                print(file)

        task = "feature-extraction"
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
        model_regular = ORTModelForFeatureExtraction.from_pretrained("jpohhhh/msmarco-MiniLM-L-6-v3_onnx", from_transformers=False)

        self.onnx_extractor = pipeline(task, model=model_regular, tokenizer=self.tokenizer)

    def __call__(self, data: Dict[str, Any]) -> List[List[float]]:
        """Embed every sentence in the request.

        Args:
            data: Request payload. The value under ``"inputs"`` is removed
                from the dict and used as the sentences; a single string is
                treated as one sentence. When the key is absent, ``data``
                itself is iterated.

        Returns:
            One embedding (a list of floats) per sentence, in input order.
        """
        sentences = data.pop("inputs", data)
        # A bare string would otherwise be iterated character by character.
        if isinstance(sentences, str):
            sentences = [sentences]

        sentence_embeddings = []
        for sentence in sentences:
            # Compute token embeddings.
            with torch.no_grad():
                model_output = self.onnx_extractor(sentence)

            # The pipeline may hand back nested lists rather than a tensor;
            # normalise so pooling always sees an object with a shape.
            token_embeddings = torch.as_tensor(model_output)

            # Max pooling over tokens; plain floats keep the response
            # serializable.
            pooled = max_pooling(token_embeddings)
            sentence_embeddings.append([float(value) for value in pooled])
        return sentence_embeddings