from sentence_transformers import SentenceTransformer
from A_Preprocess import load_pdf_data
from E_Model_utils import batch_process_transformes_embeddings, get_embeddings, get_transformes_embeddings
from E_Faiss_utils import save_faiss_embeddings_index
from transformers import AutoTokenizer, AutoModel
import numpy as np
import os
import sys
from pathlib import Path

# Make the project root importable so the project-level helper modules resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

BASE_DIR = Path(__file__).resolve().parent.parent

# Load and preprocess data
file_name = 'InvoiceDetailsExplanation.csv'
data_file_path = BASE_DIR / "data" / file_name
data = load_pdf_data(str(data_file_path))
sentences = data['utterance'].tolist()

# Load the tokenizer and model (the model from the local HF cache only, no download).
# Note: E5 models are trained with "query: " / "passage: " prefixes; prepending
# them to the sentences may improve embedding quality.
tokenizer = AutoTokenizer.from_pretrained("intfloat/multilingual-e5-small")
model = AutoModel.from_pretrained("intfloat/multilingual-e5-small", local_files_only=True)
model_name = 'multilingual-e5-small'

# Randomly sample only 100 sentences for faster testing:
# import random
# random.seed(42)
# random.shuffle(sentences)
# sentences = sentences[:100]

# ** Uncomment one of the following pairs to use a sentence-transformers model instead **
#model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
#model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
#model = SentenceTransformer('AlexHung29629/sgpt-llama3.2-1b-stage1')
#model_name = 'llama3.2-1b'
#model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
#model_name = 'all-MiniLM-L6-v2'

# ** Uncomment the following lines to compute and save embeddings with a sentence-transformers model **
#embeddings = get_embeddings(model, sentences)
#save_faiss_embeddings_index(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")
#print(f'Embeddings shape: {embeddings.shape}.')
#print(embeddings[:10])

# ** Uncomment the following lines to use the Romanian BERT model and tokenizer instead **
#tokenizer = AutoTokenizer.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
#model = AutoModel.from_pretrained("dumitrescustefan/bert-base-romanian-cased-v1")
#model_name = 'bert-base-romanian-cased-v1'

# Embedding all sentences in a single pass requires a lot of memory:
#embeddings = get_transformes_embeddings(sentences, model, tokenizer)

# Batch processing keeps memory usage low.
embeddings = batch_process_transformes_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128)

# Use the file stem so the index name does not embed the '.csv' extension.
save_faiss_embeddings_index(embeddings, file_name=f"{Path(file_name).stem}_{model_name}_vector_db.index")
print(f'Embeddings shape: {embeddings.shape}.')
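
# ------------------------------------------------------------------
# Reference sketch (never called): a minimal, hedged version of what
# batch_process_transformes_embeddings from E_Model_utils is assumed to
# do, based on how it is used above: tokenize in batches, run the
# encoder, mean-pool the last hidden state over non-padding tokens, and
# stack the results. The "_reference_" name is hypothetical; the real
# helper may pool or batch differently.
def _reference_batch_embeddings(sentences, model, tokenizer, batch_size=16, max_length=128):
    import torch

    all_embeddings = []
    model.eval()
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            batch = sentences[start:start + batch_size]
            encoded = tokenizer(batch, padding=True, truncation=True,
                                max_length=max_length, return_tensors="pt")
            output = model(**encoded)
            # Mean pooling: average token embeddings, masking out padding.
            mask = encoded["attention_mask"].unsqueeze(-1).float()
            summed = (output.last_hidden_state * mask).sum(dim=1)
            counts = mask.sum(dim=1).clamp(min=1e-9)
            all_embeddings.append((summed / counts).cpu().numpy())
    return np.concatenate(all_embeddings, axis=0)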
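
# ------------------------------------------------------------------
# Reference sketch (never called): a minimal, hedged version of what
# save_faiss_embeddings_index from E_Faiss_utils is assumed to do:
# build a flat L2 index over float32 vectors and write it to disk.
# The real helper may differ (e.g. an inner-product index over
# normalized vectors for cosine similarity).
def _reference_save_faiss_index(embeddings, file_name):
    import faiss

    vectors = np.asarray(embeddings, dtype="float32")
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, file_name)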