TomData committed
Commit b5a209d · 1 Parent(s): c98215f

load db_all before inference

Files changed (2):
  1. Home.py +13 -2
  2. src/vectordatabase.py +59 -9
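
This commit hoists the FAISS index for the "All" selection out of the request path: the index is now loaded once at module import, and get_vectorstore returns the cached db_all instead of reloading it from disk on every query. A minimal sketch of that load-once pattern, assuming langchain_community import paths since the diff does not show this file's import block:

    # Sketch only: the import locations are an assumption; the identifiers match the diff.
    from langchain_community.embeddings import HuggingFaceEmbeddings
    from langchain_community.vectorstores import FAISS

    # Module level: runs once at import time, before any inference request.
    embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
    db_all = FAISS.load_local(
        folder_path="./src/FAISS",
        index_name="speeches_1949_09_12",
        embeddings=embeddings,
        allow_dangerous_deserialization=True,  # the FAISS docstore is pickled; loading requires this opt-in
    )

    def get_vectorstore(inputs, embeddings):
        if inputs[0] == "All":
            return db_all  # reuse the cached index instead of reloading from disk
        ...  # other branches build per-selection stores, unchanged by this commit

The trade-off is a slower startup and the full index held in memory for the process lifetime, in exchange for removing a disk load from every "All" query.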
Home.py CHANGED
@@ -58,8 +58,9 @@ with gr.Blocks() as App:
         with gr.Row() as additional_input:
             n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
             party_dopdown = gr.Dropdown(value='All', choices=['All','CDU/CSU','SPD','FDP','Grüne','not found','DIE LINKE.','PDS','KPD'], label='Party') # change choices to all possible options
-            start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
-            end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
+            # ToDo: Add date or legislature filter as input
+            #start_date = Calendar(value="1949-01-01", type="datetime", label="Select start date", info="Click the calendar icon to bring up the calendar.", interactive=True)
+            #end_date = Calendar(value=datetime.today().strftime('%Y-%m-%d'), type="datetime", label="Select end date", info="Click the calendar icon to bring up the calendar.", interactive=True)
 
         search_btn = gr.Button('Search')
 
@@ -106,6 +107,16 @@ with gr.Blocks() as App:
             inputs=[results_df, keyword_box, ftype_dropdown],
             outputs=[file],
         )
+
+    with gr.Tab("About"):
+        gr.Markdown(text="""**Motivation:**
+        The idea of this project is a combination of my curiosity about LLM applications and my affection for speech data, which I developed during my bachelor thesis on measuring populism in text data.
+        I would like to allow people to discover interesting discussions, opinions and positions that were communicated in the German parliament throughout the years.
+        **Development status:**
+        Chatbot: Users can interact with the chatbot, asking questions about anything that can be answered by speeches. Furthermore, they can select any legislature as a basis for the chatbot's reply.
+        Keyword
+
+        """)
 
 
 if __name__ == "__main__":
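
The first hunk leaves the date filter as a ToDo. One plausible shape for it, once the Calendar inputs return, is a post-retrieval filter on the speech dates; in this sketch the helper filter_by_date, the 'date' column, and the results layout are all hypothetical, since nothing in this commit defines them:

    import pandas as pd

    def filter_by_date(results: pd.DataFrame, start: str, end: str) -> pd.DataFrame:
        """Hypothetical helper: keep only speeches dated within [start, end]."""
        dates = pd.to_datetime(results["date"])  # assumes the results carry a 'date' column
        return results[(dates >= pd.Timestamp(start)) & (dates <= pd.Timestamp(end))]

A legislature filter, the other option the ToDo names, would instead map onto the per-legislature vector stores that get_vectorstore already selects between.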
src/vectordatabase.py CHANGED
@@ -18,19 +18,42 @@ import os
 # from dotenv import load_dotenv
 # load_dotenv()
 
+# Global variables
+embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
+db_all = FAISS.load_local(folder_path="./src/FAISS", index_name="speeches_1949_09_12",
+                          embeddings=embeddings, allow_dangerous_deserialization=True)
 
-# Load documents to create a vectorstore later
 def load_documents(df):
+    """
+    Load documents from a DataFrame and split them into smaller chunks for vector storage.
+
+    Parameters:
+    ----------
+    df : pandas.DataFrame
+        A DataFrame containing the documents to be processed, with a column named 'speech_content' that holds the text content.
+
+    Returns:
+    -------
+    list
+        A list of split document chunks ready for further processing or vectorization.
+    """
 
-    loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
+    # Initialize a DataFrameLoader with the given DataFrame and specify the column containing the content to load
+    loader = DataFrameLoader(data_frame=df, page_content_column='speech_content')
+    # Load the data from the DataFrame into a suitable format for processing
     data = loader.load()
+
+    # Initialize a RecursiveCharacterTextSplitter to split the text into chunks
     splitter = RecursiveCharacterTextSplitter(
         chunk_size=1024,
         chunk_overlap=32,
         length_function=len,
         is_separator_regex=False,
     )
+
+    # Split the loaded data into smaller chunks using the splitter
     documents = splitter.split_documents(documents=data)
+
     return documents
 
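A quick way to sanity-check load_documents is a toy DataFrame; this sketch assumes only pandas plus the loader and splitter already imported in this module:

    import pandas as pd

    # The loader reads text from 'speech_content'; every other column
    # ('speaker' here) is carried along as document metadata.
    df = pd.DataFrame({
        "speaker": ["A", "B"],
        "speech_content": ["Sehr geehrte Damen und Herren ... " * 60, "Kurze Rede."],
    })

    docs = load_documents(df)
    # With chunk_size=1024 and chunk_overlap=32, the long speech is split into
    # overlapping ~1KB chunks while the short one stays a single document.
    print(len(docs), docs[0].metadata)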
 
@@ -69,10 +92,10 @@ def get_vectorstore(inputs, embeddings):
     folder_path = "./src/FAISS"
 
     if inputs[0] == "All":
-        index_name = "speeches_1949_09_12"
-        db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
-                              embeddings=embeddings, allow_dangerous_deserialization=True)
-        return db
+        # index_name = "speeches_1949_09_12"
+        # db = FAISS.load_local(folder_path=folder_path, index_name=index_name,
+        #                       embeddings=embeddings, allow_dangerous_deserialization=True)
+        return db_all
 
 
     # Initialize empty db
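
Returning the module-level db_all makes the "All" branch effectively free after startup, at the cost of loading the index even in sessions that never use it. A lazy variant would defer that cost to the first request; a sketch of the alternative, not what this commit does:

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def load_db_all():
        # Loaded on the first call, then cached for the process lifetime.
        return FAISS.load_local(folder_path="./src/FAISS",
                                index_name="speeches_1949_09_12",
                                embeddings=embeddings,
                                allow_dangerous_deserialization=True)

Eager loading is a reasonable default here, since the app's dropdowns default to 'All' and most sessions will hit this branch.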
@@ -99,15 +122,42 @@ def get_vectorstore(inputs, embeddings):
 
 
 
-# Apply RAG by providing the context and the question to the LLM using the predefined template
-def RAG(llm, prompt, db, question):
+def RAG(llm, prompt, db, question):
+    """
+    Apply Retrieval-Augmented Generation (RAG) by providing the context and the question to the
+    language model using a predefined template.
+
+    Parameters:
+    ----------
+    llm : LanguageModel
+        An instance of the language model to be used for generating responses.
+
+    prompt : str
+        A predefined template or prompt that structures how the context and question are presented to the language model.
+
+    db : VectorStore
+        A vector store instance that supports retrieval of relevant documents based on the input question.
+
+    question : str
+        The question or query to be answered by the language model.
+
+    Returns:
+    -------
+    str
+        The response generated by the language model, based on the retrieved context and provided question.
+    """
+    # Create a document chain using the provided language model and prompt template
     document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
+    # Convert the vector store into a retriever
     retriever = db.as_retriever()
+    # Create a retrieval chain that integrates the retriever with the document chain
     retrieval_chain = create_retrieval_chain(retriever, document_chain)
-
+    # Invoke the retrieval chain with the input question to get the final response
     response = retrieval_chain.invoke({"input": question})
+
     return response
 
+
 #########
 # Dynamically loading vector_db
 ##########
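
A minimal invocation of the documented RAG helper, assuming a LangChain chat model and a prompt template containing the {context} and {input} placeholders that create_stuff_documents_chain and create_retrieval_chain expect; the model choice and template wording below are placeholders, not taken from this commit:

    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI  # any LangChain chat model fits the llm slot

    llm = ChatOpenAI(model="gpt-4o-mini")
    prompt = ChatPromptTemplate.from_template(
        "Answer using only these speech excerpts:\n{context}\n\nQuestion: {input}"
    )

    response = RAG(llm=llm, prompt=prompt, db=db_all, question="Was wurde zur Energiepolitik gesagt?")
    # Note: retrieval_chain.invoke returns a dict with 'input', 'context', and 'answer'
    # keys, not a plain string as the docstring's Returns section suggests.
    print(response["answer"])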
 