import os
import re

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

def save_embeddings(embeddings, file_name):
    """Persist embeddings as a .npy file and as a FAISS index written to file_name."""
    embeddings = embeddings.cpu().numpy()  # convert PyTorch tensor to a numpy array
    dimension = embeddings.shape[1]

    # Save the raw (unnormalized) embeddings next to the index file
    np.save(f"{file_name}_embeddings.npy", embeddings)

    # Build a flat L2 index over L2-normalized vectors (equivalent to cosine similarity)
    index = faiss.IndexFlatL2(dimension)
    faiss.normalize_L2(embeddings)
    index.add(embeddings)
    faiss.write_index(index, file_name)

    return index

def normalize_embeddings(embeddings):
    embeddings = embeddings.cpu().numpy()  # Convert PyTorch tensor to numpy array
    faiss.normalize_L2(embeddings)
    return embeddings

def train_model(model_name):
    """Load a pretrained SentenceTransformer model (no training is performed here)."""
    model = SentenceTransformer(model_name)
    return model

def get_embeddings(model, texts):
    embeddings = model.encode(texts, convert_to_tensor=True)
    return embeddings


def load_data(file_path):
    data = pd.read_csv(file_path)
    return data

def clean_text(text):
    """Lowercase the text, strip punctuation and digits, and trim surrounding whitespace."""
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.strip()
    return text

def preprocess_data(data):
    data['utterance'] = data['utterance'].apply(clean_text)
    return data


# Load and preprocess data
data_file_path = r"C:\Users\serban.tica\Documents\Intent_detection\data\Pager_Intents_Recent.csv"
data = load_data(data_file_path)
data = preprocess_data(data)

# Models to evaluate
models = {
    "multilingual-e5-small": "intfloat/multilingual-e5-small",
}

# "all-mpnet-base-v2":"sentence-transformers/all-mpnet-base-v2",
# "bert-base-nli":"sentence-transformers/bert-base-nli-mean-tokens",
# "all-MiniLM-L6-v2": "sentence-transformers/all-MiniLM-L6-v2",
# "all-distilroberta-v1":"sentence-transformers/all-distilroberta-v1"
# "bert-base-romanian-cased-v1": "sentence-transformers/bert-base-romanian-cased-v1",
# "bert-base-romanian-uncased-v1": "sentence-transformers/dumitrescustefan/bert-base-romanian-uncased-v1",
#"mBERT": "bert-base-multilingual-cased", "XLM-R": "xlm-roberta-base", "Romanian BERT": "dumitrescustefan/bert-base-romanian-cased-v1", "dumitrescustefan/bert-base-romanian-uncased-v1": "dumitrescustefan/bert-base-romanian-uncased-v1"
# Generate and save embeddings for each model, "xlm-r-distilroberta-base-paraphrase-v1"

os.makedirs("embeddings", exist_ok=True)  # make sure the output directory exists

for model_name, model_path in models.items():
    print(f"Processing model: {model_name}")
    model = train_model(model_path)
    texts = data['utterance'].tolist()
    embeddings = get_embeddings(model, texts)
    save_embeddings(embeddings, file_name=f"embeddings/{model_name}_vector_db.index")
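
# A minimal sketch of how a saved index could be queried afterwards. This is not part
# of the original pipeline; the index path, query string, and k=5 below are illustrative.
# The query has to be cleaned, embedded, and L2-normalized the same way as the indexed
# utterances before searching.
#
# index = faiss.read_index("embeddings/multilingual-e5-small_vector_db.index")
# query = clean_text("exemplu de interogare")                       # hypothetical query
# query_emb = normalize_embeddings(get_embeddings(model, [query]))  # shape (1, dimension)
# distances, ids = index.search(query_emb, 5)                       # top-5 nearest utterances
# print(data['utterance'].iloc[ids[0]].tolist())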