# NOTE(review): the original file header was scraped Hugging Face Spaces page
# residue (runtime status, file size, commit hashes, line-number gutter) that
# made the module unparseable; converted to a comment.
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.docstore.in_memory import InMemoryDocstore
from faiss import IndexFlatL2
#import functools
import pandas as pd
import os
# For local run load environmental variables from .env-file
# from dotenv import load_dotenv
# load_dotenv()
# Define important variables
# Sentence-embedding model shared by every vector store in this module; it
# must match the model that was used when the FAISS indexes on disk were built.
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
# Pre-built FAISS store loaded once at import time; served directly when the
# user selects "All" legislatures (presumably all speeches since 1949-09-12 —
# confirm against the index build script).
# NOTE(review): allow_dangerous_deserialization=True unpickles the index file;
# safe only while ./src/FAISS contains our own artifacts — never user uploads.
db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
                          embeddings=embeddings, allow_dangerous_deserialization=True)
def load_documents(df):
    """
    Turn a DataFrame of speeches into chunked LangChain documents.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'speech_content' column holding the raw speech text.

    Returns
    -------
    list
        Document chunks (1024 characters max, 32-character overlap) ready
        for embedding into a vector store.
    """
    # Chunking configuration: plain character counts, no regex separators.
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=32,
        length_function=len,
        is_separator_regex=False,
    )
    # Wrap each row's 'speech_content' as a LangChain document...
    raw_docs = DataFrameLoader(data_frame=df, page_content_column='speech_content').load()
    # ...then cut the documents into retrieval-sized chunks.
    return chunker.split_documents(documents=raw_docs)
#@functools.lru_cache()
def get_vectorstore(inputs, embeddings):
    """
    Combine multiple FAISS vector stores into a single vector store based on
    the specified inputs.

    Parameters
    ----------
    inputs : list of str
        Selections such as "20. Legislaturperiode"; the leading number picks
        the on-disk index. The keyword "All" (or None) as the first entry —
        or an empty selection — returns the pre-built store for all speeches.
    embeddings : Embeddings
        Embedding instance used both to size the empty index and to load the
        per-legislature stores; must match the model the indexes were built with.

    Returns
    -------
    FAISS
        A vector store merging all selected legislature indices.
    """
    # Default folder path holding the per-legislature index files.
    folder_path = "./src/FAISS"
    # Empty selection, "All", or None all fall back to the pre-built store.
    # (The emptiness guard prevents an IndexError on inputs[0].)
    if not inputs or inputs[0] == "All" or inputs[0] is None:
        return db_all
    # Probe the embedding model once to size the flat L2 index correctly.
    dimensions = len(embeddings.embed_query("dummy"))
    # Start from an empty store and merge each selected legislature into it.
    db = FAISS(
        embedding_function=embeddings,
        index=IndexFlatL2(dimensions),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )
    # Retrieve inputs: 20. Legislaturperiode, 19. Legislaturperiode, ...
    for selection in inputs:  # renamed from `input` to avoid shadowing the builtin
        # Ignore "All" if the user selected it alongside specific legislatures.
        if selection == "All":
            continue
        # "20. Legislaturperiode" -> index file "20_legislature"
        index_name = f'{selection.split(".")[0]}_legislature'
        local_db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
                                    embeddings=embeddings, allow_dangerous_deserialization=True)
        db.merge_from(local_db)
    print('Successfully merged inputs')
    return db
def RAG(llm, prompt, db, question):
    """
    Answer a question with Retrieval-Augmented Generation: fetch relevant
    context from the vector store, stuff it into the prompt, and query the
    language model.

    Parameters
    ----------
    llm : LanguageModel
        Language model instance used to generate the answer.
    prompt : str
        Prompt template structuring how context and question reach the model.
    db : VectorStore
        Vector store supporting similarity retrieval for the question.
    question : str
        The user's query.

    Returns
    -------
    str
        The model's response, grounded in the retrieved context.
    """
    # Expose the vector store through LangChain's retriever interface.
    retriever = db.as_retriever()
    # Stuff-chain packs the retrieved documents into the prompt for the LLM.
    answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
    # Glue retrieval and generation into one pipeline.
    rag_chain = create_retrieval_chain(retriever, answer_chain)
    # Run the pipeline on the user's question and hand back the result.
    response = rag_chain.invoke({"input": question})
    return response