intoxication committed on
Commit
df118cf
·
1 Parent(s): 209df17

Update utils/haystack.py

Browse files
Files changed (1) hide show
  1. utils/haystack.py +66 -16
utils/haystack.py CHANGED
@@ -1,22 +1,72 @@
1
  import streamlit as st
 
 
 
 
 
 
 
 
2
 
3
- from haystack import Pipeline
4
- from haystack.schema import Answer
5
- #Use this file to set up your Haystack pipeline and querying
6
 
 
 
7
 
8
- # cached to make index and models load only at start
9
- @st.cache_resource(show_spinner=False)
10
- def start_haystack():
11
- #Use this function to construct a pipeline
12
- pipe = Pipeline()
13
- return pipe
 
 
 
 
 
14
 
15
- pipe = start_haystack()
 
 
 
 
 
16
 
17
- @st.cache_data(show_spinner=True)
18
- def query(question):
19
- print("Received question")
20
- params = {}
21
- # results = pipe.run(question, params=params)
22
- return [Answer(answer="results", context="Call pipe.run(question, params=params) and return results in /utils/haystack.py query()")]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import logging
3
+ import pandas as pd
4
+ from haystack.utils import print_answers
5
+ from haystack.pipelines import Pipeline
6
+ from haystack.document_stores import ElasticsearchDocumentStore
7
+ from haystack.nodes import EmbeddingRetriever
8
+ from haystack.nodes.other.docs2answers import Docs2Answers
9
+ from haystack.utils import launch_es, fetch_archive_from_http
10
 
11
+ # Initialize logging
12
+ logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
13
+ logging.getLogger("haystack").setLevel(logging.INFO)
14
 
15
+ # Launch Elasticsearch
16
+ launch_es()
17
 
18
+ # Initialize the Haystack pipeline and document store
19
+ document_store = ElasticsearchDocumentStore(
20
+ host="localhost",
21
+ username="",
22
+ password="",
23
+ index="document",
24
+ embedding_field="question_emb",
25
+ embedding_dim=384,
26
+ excluded_meta_data=["question_emb"],
27
+ similarity="cosine",
28
+ )
29
 
30
+ retriever = EmbeddingRetriever(
31
+ document_store=document_store,
32
+ embedding_model="sentence-transformers/all-MiniLM-L6-v2",
33
+ use_gpu=True,
34
+ scale_score=False,
35
+ )
36
 
37
+ doc_to_answers = Docs2Answers()
38
+
39
+ doc_dir = "data/basic_faq_pipeline"
40
+ s3_url = "https://core-engineering.s3.eu-central-1.amazonaws.com/public/scripts/small_faq_covid.csv1.zip"
41
+ fetch_archive_from_http(url=s3_url, output_dir=doc_dir)
42
+
43
+ df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv")
44
+
45
+ # Minimal cleaning
46
+ df.fillna(value="", inplace=True)
47
+ df["question"] = df["question"].apply(lambda x: x.strip())
48
+
49
+ # Get embeddings for our questions from the FAQs
50
+ questions = list(df["question"].values)
51
+ df["question_emb"] = retriever.embed_queries(queries=questions).tolist()
52
+ df = df.rename(columns={"question": "content"})
53
+
54
+ # Convert Dataframe to list of dicts and index them in our DocumentStore
55
+ docs_to_index = df.to_dict(orient="records")
56
+ document_store.write_documents(docs_to_index)
57
+
58
+ # Initialize a Pipeline (this time without a reader) and ask questions
59
+ pipeline = Pipeline()
60
+ pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
61
+ pipeline.add_node(component=doc_to_answers, name="Docs2Answers", inputs=["Retriever"])
62
+
63
+ # Create the Streamlit app
64
+ st.title("FAQ Search")
65
+ question = st.text_input("Ask a question:")
66
+
67
+ if question:
68
+ params = {"Retriever": {"top_k": 10}} # Modify parameters as needed
69
+ prediction = pipeline.run(query=question, params=params)
70
+
71
+ st.subheader("Answers:")
72
+ print_answers(prediction, details="medium")