Spaces:
Sleeping
Sleeping
test party filter
Browse files- Home.py +2 -2
- src/chatbot.py +29 -20
Home.py
CHANGED
@@ -39,10 +39,10 @@ with gr.Blocks() as App:
|
|
39 |
file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
|
40 |
|
41 |
#Keyword Search on click
|
42 |
-
def search(keyword, n, party): #ToDo: Include party
|
43 |
return {
|
44 |
output_col: gr.Column(visible=True),
|
45 |
-
results_df: keyword_search(query=keyword, n=n),
|
46 |
}
|
47 |
|
48 |
search_btn.click(
|
|
|
39 |
file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
|
40 |
|
41 |
#Keyword Search on click
|
def search(keyword, n, party): #ToDo: Include party and timedate
    """Gradio click handler: run a keyword search and reveal the results column.

    Returns a component-update mapping: makes ``output_col`` visible and
    fills ``results_df`` with the matching speeches.
    """
    hits = keyword_search(query=keyword, n=n, party_filter=party)
    return {
        output_col: gr.Column(visible=True),
        results_df: hits,
    }
|
47 |
|
48 |
search_btn.click(
|
src/chatbot.py
CHANGED
@@ -1,12 +1,10 @@
|
|
1 |
-
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
2 |
from langchain_core.prompts import ChatPromptTemplate
|
3 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
4 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
5 |
|
6 |
from src.vectordatabase import RAG, get_vectorstore
|
7 |
import pandas as pd
|
8 |
-
import
|
9 |
-
#from dotenv import load_dotenv, find_dotenv
|
10 |
|
11 |
#Load environmental variables from .env-file
|
12 |
#load_dotenv(find_dotenv())
|
@@ -63,22 +61,33 @@ def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
|
|
63 |
return response
|
64 |
|
65 |
# Retrieve speech contents based on keywords
|
66 |
-
def keyword_search(query,n=10, db=db, embeddings=embeddings):
|
67 |
query_embedding = embeddings.embed_query(query)
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
return df_res
|
|
|
|
|
1 |
from langchain_core.prompts import ChatPromptTemplate
|
2 |
from langchain_community.llms.huggingface_hub import HuggingFaceHub
|
3 |
from langchain_community.embeddings import HuggingFaceEmbeddings
|
4 |
|
5 |
from src.vectordatabase import RAG, get_vectorstore
|
6 |
import pandas as pd
|
7 |
+
from dotenv import load_dotenv, find_dotenv
|
|
|
8 |
|
9 |
#Load environmental variables from .env-file
|
10 |
#load_dotenv(find_dotenv())
|
|
|
61 |
return response
|
62 |
|
63 |
# Retrieve speech contents based on keywords
|
def keyword_search(query, n=10, db=db, embeddings=embeddings, method='ss', party_filter=''):
    """Retrieve the n most relevant speech contents for a keyword query.

    Parameters
    ----------
    query : str
        Keyword(s) to embed and search for.
    n : int
        Number of results to return.
    db :
        Vector store searched by embedding vector (module-level default).
    embeddings :
        Embedding model used to embed the query (module-level default).
    method : str
        'mmr' for maximal-marginal-relevance search (adds a 'Relevance'
        column); any other value runs a plain similarity search.
    party_filter : str
        Party name to restrict results to; '' (default) disables filtering.

    Returns
    -------
    pandas.DataFrame
        Columns 'Speech Content', 'Date', 'Party' (plus 'Relevance'
        when method='mmr').
    """
    query_embedding = embeddings.embed_query(query)
    # Only constrain by party when one was actually selected: passing
    # {"party": ""} to the store would match no document at all.
    metadata_filter = {"party": party_filter} if party_filter else None

    rows = []  # collect plain dicts; one DataFrame build avoids quadratic pd.concat
    if method == 'mmr':
        results = db.max_marginal_relevance_search_with_score_by_vector(
            query_embedding, k=n, fetch_k=n + 10, filter=metadata_filter)
        for document, score in results:
            rows.append({'Speech Content': document.page_content,
                         'Date': document.metadata["date"],
                         'Party': document.metadata["party"],
                         # Relevance based on relevance search
                         'Relevance': round(score, ndigits=2)})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party', 'Relevance'])
        df_res.sort_values('Relevance', inplace=True, ascending=True)
    else:
        results = db.similarity_search_by_vector(
            query_embedding, k=n, filter=metadata_filter)
        for document in results:
            rows.append({'Speech Content': document.page_content,
                         'Date': document.metadata["date"],
                         'Party': document.metadata["party"]})
        df_res = pd.DataFrame(rows, columns=['Speech Content', 'Date', 'Party'])
    return df_res
|