from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

from langchain_community.docstore.in_memory import InMemoryDocstore
from faiss import IndexFlatL2

#import functools

import pandas as pd
import os

# For local runs, load environment variables from a .env file
# from dotenv import load_dotenv
# load_dotenv()

# Define module-level resources: the embedding model and the combined
# vector store covering all speeches
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
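
# Usage sketch (assumes the index above loaded successfully; the query
# string is only an illustration):
# hits = db_all.similarity_search("Wiedervereinigung", k=3)
# for doc in hits:
#     print(doc.metadata, doc.page_content[:80])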

def load_documents(df):
    """

    Load documents from a DataFrame and split them into smaller chunks for vector storage.



    Parameters:

    ----------

    df : pandas.DataFrame

        A DataFrame containing the documents to be processed, with a column named 'speech_content' that holds the text content.



    Returns:

    -------

    list

        A list of split document chunks ready for further processing or vectorization.

    """
    
    # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load
    loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
    # Load the data from the DataFrame into a suitable format for processing
    data = loader.load()
    
    # Initialize a RecursiveCharacterTextSplitter to split the text into chunks
    splitter = RecursiveCharacterTextSplitter(
            chunk_size=1024,
            chunk_overlap=32,
            length_function=len,
            is_separator_regex=False,
        )
    
    # Split the loaded data into smaller chunks using the splitter
    documents = splitter.split_documents(documents=data)
    
    return documents
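
# Usage sketch for load_documents (the sample DataFrame below is hypothetical;
# any DataFrame with a 'speech_content' column works):
# df = pd.DataFrame({"speech_content": ["Erste Beispielrede ...", "Zweite Beispielrede ..."]})
# chunks = load_documents(df)
# print(len(chunks), chunks[0].page_content[:80])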


#@functools.lru_cache()
def get_vectorstore(inputs, embeddings):
    """

    Combine multiple FAISS vector stores into a single vector store based on the specified inputs.



    Parameters

    ----------

    inputs : list of str

        A list of strings specifying which vector stores to combine. Each string represents a specific 

        index or a special keyword "All". If "All" is the first entry in the list, 

        it directly return the pre-defined vectorstore for all speeches

        

    embeddings : Embeddings

        An instance of embeddings that will be used to load the vector stores. The specific type and

        structure of `embeddings` depend on the implementation of the `get_vectorstore` function.



    Returns

    -------

    FAISS

        A FAISS vector store that combines the specified indices into a single vector store.

    

    """

    # Default folder path
    folder_path = "./src/FAISS"

    if inputs[0] == "All" or inputs[0] is None:
        return db_all

    # Initialize an empty FAISS store with the embedding dimensionality of the model
    dimensions = len(embeddings.embed_query("dummy"))

    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False,
    )

    # Iterate over inputs such as "20. Legislaturperiode", "19. Legislaturperiode", ...
    for selection in inputs:
        # Skip "All" if the user selected it alongside specific legislatures
        if selection == "All":
            continue
        # Derive the index name from the selection and merge its vector store
        index = selection.split(".")[0]
        index_name = f'{index}_legislature'
        local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
                                    embeddings=embeddings, allow_dangerous_deserialization=True)
        db.merge_from(local_db)
        print(f'Successfully merged {index_name}')
    return db
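
# Usage sketch for get_vectorstore (assumes per-legislature indices such as
# "20_legislature" exist in ./src/FAISS, as implied by the naming scheme above):
# db = get_vectorstore(["20. Legislaturperiode", "19. Legislaturperiode"], embeddings)
# retriever = db.as_retriever()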


def RAG(llm, prompt, db, question):
    """

    Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the 

    language model using a predefined template.



    Parameters:

    ----------

    llm : LanguageModel

        An instance of the language model to be used for generating responses.

        

    prompt : str

        A predefined template or prompt that structures how the context and question are presented to the language model.

        

    db : VectorStore

        A vector store instance that supports retrieval of relevant documents based on the input question.

        

    question : str

        The question or query to be answered by the language model.



    Returns:

    -------

    str

        The response generated by the language model, based on the retrieved context and provided question.

    """
    # Create a document chain using the provided language model and prompt template
    document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Convert the vector store into a retriever
    retriever = db.as_retriever()
    # Create a retrieval chain that integrates the retriever with the document chain
    retrieval_chain = create_retrieval_chain(retriever, document_chain)
    # Invoke the retrieval chain with the input question to get the final response
    response = retrieval_chain.invoke({"input": question})
    
    return response
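
# Usage sketch for RAG (the model and prompt below are assumptions, not part of
# this module: any LangChain chat model works, and the prompt template must
# expose the "context" and "input" variables expected by
# create_stuff_documents_chain and create_retrieval_chain):
# from langchain_openai import ChatOpenAI
# from langchain_core.prompts import ChatPromptTemplate
#
# llm = ChatOpenAI(model="gpt-4o-mini")
# prompt = ChatPromptTemplate.from_template(
#     "Beantworte die Frage nur auf Basis des Kontexts:\n\n{context}\n\nFrage: {input}"
# )
# db = get_vectorstore(["All"], embeddings)
# response = RAG(llm, prompt, db, "Was wurde zur Energiepolitik gesagt?")
# print(response["answer"])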