TomData committed on
Commit
33014c1
·
1 Parent(s): b714046

New kws layout

Files changed (3)
  1. Home.py +60 -6
  2. requirements.txt +3 -1
  3. src/chatbot.py +4 -9
Home.py CHANGED
@@ -1,13 +1,10 @@
 import gradio as gr
 from src.chatbot import chatbot, keyword_search
 
-# Adjust size of each block is not yet working
-output = [gr.Dataframe(line_breaks=True)]
-input = gr.Textbox()
 
 with gr.Blocks() as App:
     with gr.Tab("ChatBot"):
-        # Apply RAG using chatbut function from local file ChatBot.py
+        # Apply RAG using the chatbot function from the local file ChatBot.py
         gr.ChatInterface(chatbot,
                          title="PoliticsToYou",
                          description="This chatbot uses the information from speeches of the German parliament (since 2021) \
@@ -15,10 +12,67 @@ with gr.Blocks() as App:
                          examples=["Wie steht die CDU zur Cannabislegalisierung?", "Was waren die wichtigsten Themen in der aktuellen Legislaturperiode?"], # change to meaningful examples
                          cache_examples=False, # True increases the loading time
                          )
+
     with gr.Tab("KeyWordSearch"):
-        gr.Interface(fn=keyword_search, inputs=input, outputs=output, max_batch_size=10)
-
 
+        with gr.Blocks() as Block:
+            # Keyword input
+            keyword_box = gr.Textbox(label='keyword')
+
+            # Additional input (hidden by default)
+            with gr.Accordion('Detailed filters', open=False):
+                # Row orientation
+                with gr.Row() as additional_input:
+                    n_slider = gr.Slider(label="Number of Results", minimum=1, maximum=100, step=1, value=10)
+                    party_dopdown = gr.Dropdown(choices=['CDU/CSU', 'SPD', 'FDP', 'Grüne', 'not found', 'DIE LINKE.', 'PDS', 'KPD'], label='Party')
+
+            search_btn = gr.Button('Search')
+
+            with gr.Column(visible=False) as output_col:
+                results_df = gr.Dataframe(label='Results', interactive=False)
+
+            # Download results from the keyword search
+            with gr.Accordion('Would you like to download your results?', open=False) as download_row:
+                with gr.Row():
+                    ftype_dropdown = gr.Dropdown(choices=["csv", "excel", "json"], label="Format")
+                    export_btn = gr.Button('Export')
+                file = gr.File(file_types=[".xlsx", ".csv", ".json"], visible=False)
+
+            # Keyword search on click
+            def search(keyword, n, party): # ToDo: include the party filter
+                return {
+                    output_col: gr.Column(visible=True),
+                    results_df: keyword_search(query=keyword, n=n),
+                }
+
+            search_btn.click(
+                fn=search,
+                inputs=[keyword_box, n_slider, party_dopdown],
+                outputs=[output_col, results_df],
+            )
+
+            # Export the results to a downloadable file
+            def export(df, keyword, ftype=None):
+                if ftype == "csv":
+                    file = f'{keyword}.csv'
+                    df.to_csv(file, index=False)
+                    return gr.File(value=file, visible=True)
+                elif ftype == "json":
+                    file = f'{keyword}.json'
+                    df.to_json(file, index=True)
+                    return gr.File(value=file, visible=True)
+                else:
+                    file = f'{keyword}.xlsx'
+                    df.to_excel(file, index=True)
+                    return gr.File(value=file, visible=True)
+
+            export_btn.click(
+                fn=export,
+                inputs=[results_df, keyword_box, ftype_dropdown],
+                outputs=[file],
+            )
+
+
 if __name__ == "__main__":
     App.launch(share=False) # share=True is not supported on HF Spaces

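The new KeyWordSearch tab relies on Gradio's pattern of returning a dict keyed by output components from an event handler, so a single click can both reveal the hidden results column and fill the dataframe. A minimal, self-contained sketch of that pattern, with illustrative component and function names and dummy data standing in for keyword_search:

import gradio as gr
import pandas as pd

def run_search(query):
    # Stand-in for keyword_search(); any DataFrame works for the demo.
    df = pd.DataFrame({"Speech Content": [f"... {query} ..."], "Relevance": [0.9]})
    # A dict keyed by components lets one handler update several outputs:
    # the hidden column becomes visible and the dataframe receives the rows.
    return {out_col: gr.Column(visible=True), out_df: df}

with gr.Blocks() as demo:
    query_box = gr.Textbox(label="keyword")
    go_btn = gr.Button("Search")
    with gr.Column(visible=False) as out_col:
        out_df = gr.Dataframe(label="Results")
    go_btn.click(fn=run_search, inputs=[query_box], outputs=[out_col, out_df])

if __name__ == "__main__":
    demo.launch()
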
requirements.txt CHANGED
@@ -2,6 +2,8 @@ pandas==2.1.3
 langchain==0.1.15
 transformers==4.35.2
 gradio==4.26.0
+gradio-calendar
 sentence-transformers==2.6.1
 python-dotenv
-faiss-cpu
+faiss-cpu
+openpyxl
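openpyxl is presumably pulled in because the new Excel export path calls DataFrame.to_excel, which needs an .xlsx writer engine; gradio-calendar is added here but not referenced anywhere else in this diff. A quick sketch of the dependency at work (file name and data are made up for illustration):

import pandas as pd

# pandas delegates .xlsx writing to an engine such as openpyxl.
df = pd.DataFrame({"Speech Content": ["..."], "Relevance": [0.87]})
df.to_excel("results.xlsx", index=True, engine="openpyxl")
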
src/chatbot.py CHANGED
@@ -12,7 +12,6 @@ import os
 #load_dotenv(find_dotenv())
 
 
-
 embeddings = HuggingFaceEmbeddings(model_name="paraphrase-multilingual-MiniLM-L12-v2")
 llm = HuggingFaceHub(
     # Try different model here
@@ -64,26 +63,22 @@ def chatbot(message, history, db=db, llm=llm, prompt=prompt2):
     return response
 
 # Retrieve speech contents based on keywords
-def keyword_search(query, db=db, embeddings=embeddings):
+def keyword_search(query, n=10, db=db, embeddings=embeddings):
     query_embedding = embeddings.embed_query(query)
-    results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding)
+    results = db.max_marginal_relevance_search_with_score_by_vector(query_embedding, k=n)
     # Format vector store query results into dataframe
     #print(results[0][0].metadata.keys())
 
     df_res = pd.DataFrame(columns=['Speech Content', 'Date', 'Party', 'Relevance']) # Add Date/Party/Politician
-    i = 0
     for doc in results:
         speech_content = doc[0].page_content
         speech_date = doc[0].metadata["date"]
         party = doc[0].metadata["party"]
-        score = doc[1] # Relevance based on relevance search
+        score = round(doc[1], ndigits=2) # Relevance based on relevance search
         df_res = pd.concat([df_res, pd.DataFrame({'Speech Content': [speech_content],
                                                   'Date': [speech_date],
                                                   'Party': [party],
                                                   'Relevance': [score]})], ignore_index=True)
-        i = i + 1
-        if i > 2:
-            break
-
+
     df_res.sort_values('Relevance', inplace=True, ascending=False)
     return df_res
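
With the new signature, the slider value from the UI flows through as the number of MMR hits (k=n), replacing the old hard-coded stop after three results. A hedged usage sketch, assuming the FAISS index, embeddings, and Hugging Face credentials that src/chatbot.py sets up at import time are available in your environment:

from src.chatbot import keyword_search

# n caps the number of results returned by the MMR search;
# the DataFrame comes back sorted by the 'Relevance' column.
results = keyword_search("Cannabislegalisierung", n=5)
print(results[["Date", "Party", "Relevance"]])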