File size: 5,463 Bytes
f3d0f1e
 
 
 
 
 
 
 
85df319
 
 
 
 
 
 
 
 
f3d0f1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3d0f1e
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f3d0f1e
 
85df319
 
f3d0f1e
 
 
 
 
 
 
 
 
85df319
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_community.llms import HuggingFaceHub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from faiss import IndexFlatL2
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.embeddings import SentenceTransformerEmbeddings
import functools




import pandas as pd

import os
#from dotenv import load_dotenv

#Load environmental variables from .env-file
#load_dotenv()


# Load documents to create a vectorstore later
def load_documents(df):
    """Turn the speeches in *df* into overlapping text chunks.

    The dataframe's 'speech_content' column becomes each document's page
    content (remaining columns are carried along as metadata by the
    loader), and each document is then split into chunks of at most 1024
    characters with a 32-character overlap.
    """
    # To Do: Create one initial vector store loading all the documents with this function
    loaded = DataFrameLoader(data_frame=df, page_content_column='speech_content').load()
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=32,
        length_function=len,
        is_separator_regex=False,
    )
    return chunker.split_documents(documents=loaded)


#@functools.lru_cache()
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store.

    Parameters
    ----------
    inputs : list of str
        Selections such as "20. Legislaturperiode", where the leading
        number identifies a stored index named "<number>_legislature".
        If the first entry is the keyword "All", the pre-built
        comprehensive index "speeches_1949_09_12" is loaded and returned
        immediately instead of merging individual indices.
    embeddings : Embeddings
        Embedding model used both to size the empty index (via a dummy
        query) and to load the stored indices from disk.

    Returns
    -------
    FAISS
        A FAISS vector store combining the requested indices.

    Notes
    -----
    - Index files are read from the default folder "./src/FAISS".
    - The dimensionality of the empty index is derived from the embedding
      model by embedding a dummy query, so it matches whatever model the
      caller supplies.
    - `allow_dangerous_deserialization=True` is required by
      `FAISS.load_local` to unpickle the stored docstore; only load index
      files from a trusted source.
    """
    # Default folder path where the FAISS index files are stored.
    folder_path = "./src/FAISS"

    # Shortcut: one comprehensive index covers everything.
    if inputs[0] == "All":
        return FAISS.load_local(
            folder_path=folder_path,
            index_name="speeches_1949_09_12",
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )

    # Initialize an empty db; dimensionality is taken from the embedding
    # model rather than hard-coded.
    dimensions: int = len(embeddings.embed_query("dummy"))
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Retrieve inputs: "20. Legislaturperiode", "19. Legislaturperiode", ...
    # (renamed from `input` to avoid shadowing the builtin)
    for selection in inputs:
        # Retrieve selected index and merge it into the accumulated store.
        index_name = f'{selection.split(".")[0]}_legislature'
        local_db = FAISS.load_local(
            folder_path=folder_path,
            index_name=index_name,
            embeddings=embeddings,
            allow_dangerous_deserialization=True,
        )
        db.merge_from(local_db)
    return db



# Apply RAG by providing the context and the question to the LLM using the predefined template
def RAG(llm, prompt, db, question):
    """Answer *question* via retrieval-augmented generation.

    Builds a stuff-documents chain from *llm* and the predefined *prompt*,
    wires it to the retriever of vector store *db*, and invokes the
    resulting retrieval chain on the question.
    """
    doc_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    rag_chain = create_retrieval_chain(db.as_retriever(), doc_chain)
    return rag_chain.invoke({"input": question})

#########
# Dynamically loading vector_db
##########

def get_similar_vectorstore(start_date, end_date, party, base_path='src\FAISS'):

    # Get all file names
    vector_stores = [store for store in os.listdir(base_path) if store.split(".")[1] == "faiss"]

    df = pd.DataFrame(culumns=["file_name", "start_date", "end_date", "date_diff"])
    # Extract metadata of file from its name
    for file_name in vector_stores:
        file_name = file_name.split(".")[0]
        file_elements = file_name.split("_")
        file_start_date, file_end_date, file_party = file_elements[1], file_elements[2], file_elements[3]

        if file_party == party and file_start_date <= start_date:
            None