from typing import Dict, List, Any
from transformers import AutoTokenizer, AutoModel
from optimum.pipelines import pipeline
from optimum.onnxruntime import ORTModelForFeatureExtraction
from pathlib import Path
import time

import os
import torch

def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            token embeddings (assumed to be shaped batch x tokens x hidden
            -- TODO confirm against the caller).
        attention_mask: Mask tensor with one entry per token; positions with
            a zero mask contribute nothing to the average.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1,
        divided by the per-row mask total.
    """
    hidden_states = model_output[0]

    # Broadcast the mask across the trailing embedding dimension so it can
    # be multiplied element-wise with the embeddings.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_total = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    mask_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total
    
class EndpointHandler():
    """Callable request handler backed by an ONNX feature-extraction pipeline.

    On construction the tokenizer and the ONNX model are loaded by their hub
    identifiers and combined into a "feature-extraction" pipeline stored on
    ``self.onnx_extractor``. Calling the instance runs that pipeline once
    per input sentence.
    """

    def __init__(self, path=""):
        """Load the tokenizer and model and build the extraction pipeline.

        Args:
            path: Directory passed in by the hosting runtime. It is only used
                for startup diagnostics; the tokenizer and model below are
                loaded by hub identifier, not from this directory.
        """
        # Startup diagnostics: show where the handler runs and what it was given.
        print("HELLO THIS IS THE CWD:", os.getcwd())
        print("HELLO THIS IS THE PATH ARG:", path)
        # The listing is purely informational, so skip it when `path` is empty
        # (the default) or not a directory instead of letting os.listdir raise
        # and abort start-up.
        if path and os.path.isdir(path):
            for file in os.listdir(path):
                print(file)

        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/msmarco-MiniLM-L-6-v3')
        model_regular = ORTModelForFeatureExtraction.from_pretrained("jpohhhh/msmarco-MiniLM-L-6-v3_onnx", from_transformers=False)

        self.onnx_extractor = pipeline("feature-extraction", model=model_regular, tokenizer=self.tokenizer)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Run the extraction pipeline on every sentence of a request.

        Args:
            data: Request payload. ``data["inputs"]`` is either a single
                sentence (``str``) or an iterable of sentences. When the key
                is absent the payload itself is iterated, as before. The
                payload is not modified.

        Returns:
            A list with one entry per sentence, each entry being whatever
            ``self.onnx_extractor`` returns for that sentence.
            NOTE(review): no pooling is applied here, so entries are
            presumably token-level features rather than one vector per
            sentence -- confirm against consumers. The module-level
            ``mean_pooling`` helper is not used by this handler.
        """
        # Read without popping so the caller's payload is left untouched.
        sentences = data.get("inputs", data)

        # A bare string would otherwise be iterated character by character,
        # producing one embedding per character instead of one per sentence.
        if isinstance(sentences, str):
            sentences = [sentences]

        # Inference only: gradients are never needed.
        with torch.no_grad():
            return [self.onnx_extractor(sentence) for sentence in sentences]