Spaces:

TomData
/

PoliticsToYou

Sleeping

App Files Files Community

TomData committed on Jun 3, 2024

Commit

b5a209d

1 Parent(s): c98215f

load db_all before inference

Browse files

Files changed (2) hide show

Home.py +13 -2
src/vectordatabase.py +59 -9

Home.py CHANGED Viewed

@@ -58,8 +58,9 @@ with gr.Blocks() as App:
                 with gr.Row() as additional_input:
                     n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
                     party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
-                    start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
-                    end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
             search_btn = gr.Button('Search')
@@ -106,6 +107,16 @@ with gr.Blocks() as App:
                 inputs=[results_df, keyword_box, ftype_dropdown],
                 outputs=[file],
             )
 if __name__ == "__main__":

                 with gr.Row() as additional_input:
                     n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
                     party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
+                    # ToDo: Add date or legislature filter as input
+                    #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
+                    #end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
             search_btn = gr.Button('Search')
                 inputs=[results_df, keyword_box, ftype_dropdown],
                 outputs=[file],
             )
+    with gr.Tab("About"):
+        gr.Markdown(text="""**Motivation:**
+                    The idea of this project is a combination of my curiosity in LLM application and my affection for speech data, that I developed during my bachelor thesis on measuring populism in text data.
+                    I would like to allow people to discover interesting discussions, opinions and positions that were communicated in the german parliament thoughout the years.
+                    **Development status:**
+                    Chatbot: Users can interact with the chatbot asking questions about anything that can be answered by speeches. Furthermore they can select any legislature as a basis for the chatbot's reply.
+                    Keyword
+                    """)
 if __name__ == "__main__":

src/vectordatabase.py CHANGED Viewed

@@ -18,19 +18,42 @@ import os
 # from dotenv import load_dotenv
 # load_dotenv()
-# Load documents to create a vectorstore later
 def load_documents(df):
-    loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
     data = loader.load()
     splitter = RecursiveCharacterTextSplitter(
             chunk_size=1024,
             chunk_overlap=32,
             length_function=len,
             is_separator_regex=False,
         )
     documents = splitter.split_documents(documents=data)
     return documents
@@ -69,10 +92,10 @@ def get_vectorstore(inputs, embeddings):
     folder_path = "./src/FAISS"
     if inputs[0] == "All":
-        index_name = "speeches_1949_09_12"
-        db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-                                            embeddings=embeddings, allow_dangerous_deserialization=True)
-        return db
     # Initialize empty db
@@ -99,15 +122,42 @@ def get_vectorstore(inputs, embeddings):
-# Apply RAG by providing the context and the question to the LLM using the predefined template
-def RAG(llm, prompt, db, question):
     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
     retriever = db.as_retriever()
     retrieval_chain = create_retrieval_chain(retriever, document_chain)
     response = retrieval_chain.invoke({"input": question})
     return response
 #########
 # Dynamically loading vector_db
 ##########

 # from dotenv import load_dotenv
 # load_dotenv()
+# Global variables
+embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
+db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
+                                            embeddings=embeddings, allow_dangerous_deserialization=True)
 def load_documents(df):
+    """
+    Load documents from a DataFrame and split them into smaller chunks for vector storage.
+
+    The text is split with a RecursiveCharacterTextSplitter configured for chunks of at most
+    1024 characters (length measured with ``len``) and an overlap of 32 characters.
+    Parameters:
+    ----------
+    df : pandas.DataFrame
+        A DataFrame containing the documents to be processed, with a column named 'speech_content' that holds the text content.
+    Returns:
+    -------
+    list
+        The chunks returned by ``splitter.split_documents``, ready for further processing or vectorization.
+    """
+    # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load
+    loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
+    # Load the data from the DataFrame into a suitable format for processing
     data = loader.load()
+    # Initialize a RecursiveCharacterTextSplitter to split the text into chunks
     splitter = RecursiveCharacterTextSplitter(
             chunk_size=1024,
             chunk_overlap=32,
             length_function=len,
             is_separator_regex=False,
         )
+    # Split the loaded data into smaller chunks using the splitter
     documents = splitter.split_documents(documents=data)
     return documents
     folder_path = "./src/FAISS"
     if inputs[0] == "All":
+        # index_name = "speeches_1949_09_12"
+        # db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
+        #                                     embeddings=embeddings, allow_dangerous_deserialization=True)
+        return db_all
     # Initialize empty db
+def RAG(llm, prompt, db, question):
+    """
+    Apply Retrieval-Augmented Generation (RAG): retrieve context from the vector store and pass
+    it, together with the question, to the language model using a predefined template.
+    Parameters:
+    ----------
+    llm : LanguageModel
+        An instance of the language model to be used for generating responses.
+    prompt : prompt template
+        A predefined template that structures how the context and question are presented to the language model.
+        NOTE(review): it is handed to create_stuff_documents_chain, which per the LangChain docs
+        expects a prompt template object rather than a plain str - confirm against the caller.
+    db : VectorStore
+        A vector store instance that supports retrieval of relevant documents based on the input question.
+    question : str
+        The question or query to be answered; it is passed to the chain under the "input" key.
+    Returns:
+    -------
+    dict
+        The raw result of ``retrieval_chain.invoke``.
+        NOTE(review): per the LangChain create_retrieval_chain docs this is a dict (with at least
+        "context" and "answer" keys), not a plain string - confirm how callers read it.
+    """
+    # Create a document chain using the provided language model and prompt template
     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
+    # Convert the vector store into a retriever (no arguments, so the retriever defaults apply)
     retriever = db.as_retriever()
+    # Create a retrieval chain that integrates the retriever with the document chain
     retrieval_chain = create_retrieval_chain(retriever, document_chain)
+    # Invoke the retrieval chain with the input question to get the final response
     response = retrieval_chain.invoke({"input": question})
     return response
 #########
 # Dynamically loading vector_db
 ##########