import os
import re

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer


def save_embeddings(embeddings, file_name):
    """Build a FAISS index from the embeddings and persist both to disk."""
    embeddings = embeddings.cpu().numpy()  # Convert the PyTorch tensor to a numpy array
    dimension = embeddings.shape[1]
    # Save the raw embeddings as a .npy file alongside the index.
    np.save(f"{file_name}_embeddings.npy", embeddings)
    # Normalize in place so L2 distance on the flat index behaves like cosine similarity.
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    faiss.write_index(index, file_name)
    return index


def normalize_embeddings(embeddings):
    embeddings = embeddings.cpu().numpy()  # Convert the PyTorch tensor to a numpy array
    faiss.normalize_L2(embeddings)
    return embeddings


def load_model(model_name):
    # No fine-tuning happens here; this only loads a pretrained SentenceTransformer.
    return SentenceTransformer(model_name)


def get_embeddings(model, texts):
    return model.encode(texts, convert_to_tensor=True)


def load_data(file_path):
    return pd.read_csv(file_path)


def clean_text(text):
    """Lowercase, strip punctuation and digits, and trim surrounding whitespace."""
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\d+", "", text)
    return text.strip()


def preprocess_data(data):
    data["utterance"] = data["utterance"].apply(clean_text)
    return data


# Load and preprocess data
data_file_path = r"C:\Users\serban.tica\Documents\Intent_detection\data\Pager_Intents_Recent.csv"
data = load_data(data_file_path)
data = preprocess_data(data)

# Models to evaluate; commented-out entries are earlier candidates kept for reference.
models = {
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
    "all-distilroberta-v1": "sentence-transformers/all-distilroberta-v1",
    "XLM-R": "xlm-roberta-base",
    "Romanian BERT": "dumitrescustefan/bert-base-romanian-cased-v1",
    "bert-base-romanian-uncased-v1": "dumitrescustefan/bert-base-romanian-uncased-v1",
    # "bert-base-nli-mean-tokens": "sentence-transformers/bert-base-nli-mean-tokens",
    # "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
    # "all-mpnet-base-v2": "sentence-transformers/all-mpnet-base-v2",
    # "mBERT": "bert-base-multilingual-cased",
    # "xlm-r-distilroberta-base-paraphrase-v1"
}

# Ensure the output directory exists before writing index files into it.
os.makedirs("embeddings", exist_ok=True)

# Generate and save embeddings for each model
texts = data["utterance"].tolist()
for model_name, model_path in models.items():
    print(f"Processing model: {model_name}")
    model = load_model(model_path)
    embeddings = get_embeddings(model, texts)
    save_embeddings(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")
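
# Usage sketch (not part of the original pipeline): how one of the saved indexes
# could be queried afterwards. The index path, model choice, query text, and k
# below are assumptions for illustration only.
def search_similar(query, model, index, texts, k=5):
    # Encode and normalize the query the same way the corpus was at build time,
    # so L2 distances on the index correspond to cosine distances.
    query_emb = model.encode([query], convert_to_tensor=True).cpu().numpy()
    faiss.normalize_L2(query_emb)
    distances, neighbor_ids = index.search(query_emb, k)
    return [(texts[i], float(d)) for i, d in zip(neighbor_ids[0], distances[0])]


# Example (hypothetical paths/values):
# index = faiss.read_index("embeddings/multilingual-e5-small_vector_db.index")
# model = load_model("intfloat/multilingual-e5-small")
# print(search_similar("cum imi resetez parola", model, index, texts, k=3))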