Spaces:

mdredze1
/

tobacco-watcher-chat-with-citations

Sleeping

App Files Community

vtiyyal1 committed on Dec 17, 2024

Commit

05e0faf

verified ·

1 Parent(s): 229d228

Upload 3 files

Browse files

fixed rerank to open ai

Files changed (2) hide show

full_chain.py +4 -3
rerank.py +77 -72

full_chain.py CHANGED Viewed

@@ -2,13 +2,14 @@ import os
 import pandas as pd
 from get_keywords import get_keywords
 from get_articles import save_solr_articles_full
-from rerank import langchain_rerank_answer, langchain_with_sources, crossencoder_rerank_answer, \
-    crossencoder_rerank_sentencewise, crossencoder_rerank_sentencewise_articles, no_rerank
 #from feed_to_llm import feed_articles_to_gpt_with_links
 from feed_to_llm_v2 import feed_articles_to_gpt_with_links
 def get_response(question, rerank_type="crossencoder", llm_type="chat"):
-    csv_path = save_solr_articles_full(question, keyword_type="rake", num_articles=10)
     reranked_out = crossencoder_rerank_answer(csv_path, question)
     return feed_articles_to_gpt_with_links(reranked_out, question)

 import pandas as pd
 from get_keywords import get_keywords
 from get_articles import save_solr_articles_full
+# from rerank import langchain_rerank_answer, langchain_with_sources, crossencoder_rerank_answer, \
+#     crossencoder_rerank_sentencewise, crossencoder_rerank_sentencewise_articles, no_rerank
 #from feed_to_llm import feed_articles_to_gpt_with_links
+from rerank import crossencoder_rerank_answer
 from feed_to_llm_v2 import feed_articles_to_gpt_with_links
 def get_response(question, rerank_type="crossencoder", llm_type="chat"):
+    csv_path = save_solr_articles_full(question, keyword_type="rake", num_articles=15)
     reranked_out = crossencoder_rerank_answer(csv_path, question)
     return feed_articles_to_gpt_with_links(reranked_out, question)

rerank.py CHANGED Viewed

@@ -1,11 +1,16 @@
 # reranks the top articles from a given csv file
-from langchain_openai import ChatOpenAI
-from langchain.chains import RetrievalQA
-from langchain_community.document_loaders.csv_loader import CSVLoader
-from langchain_community.vectorstores import DocArrayInMemorySearch
 from sentence_transformers import CrossEncoder
 import pandas as pd
 import time
 """
 This function rerank top articles (15 -> 4) from a given csv, then sends to LLM
@@ -24,73 +29,73 @@ Update: Use langchain_RAG instead.
 """
-def langchain_rerank_answer(csv_path, question, source='url', top_n=4):
-    llm = ChatOpenAI(temperature=0.0)
-    loader = CSVLoader(csv_path, source_column="url")
-    index = VectorstoreIndexCreator(
-        vectorstore_cls=DocArrayInMemorySearch,
-    ).from_loaders([loader])
-    # prompt_template = """You are an a chatbot that answers tobacco related questions with source. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
-    # {context}
-    # Question: {question}"""
-    # PROMPT = PromptTemplate(
-    # template=prompt_template, input_variables=["context", "question"]
-    # )
-    # chain_type_kwargs = {"prompt": PROMPT}
-    qa = RetrievalQA.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=index.vectorstore.as_retriever(),
-        verbose=False,
-        return_source_documents=True,
-        # chain_type_kwargs=chain_type_kwargs,
-        # chain_type_kwargs = {
-        #     "document_separator": "<<<<>>>>>"
-        # },
-    )
-    answer = qa({"query": question})
-    sources = answer['source_documents']
-    sources_out = [source.metadata['source'] for source in sources]
-    return answer['result'], sources_out
-"""
-    Langchain with sources.
-    This function is deprecated. Use langchain_RAG instead.
-"""
-def langchain_with_sources(csv_path, question, top_n=4):
-    llm = ChatOpenAI(temperature=0.0)
-    loader = CSVLoader(csv_path, source_column="uuid")
-    index = VectorstoreIndexCreator(
-        vectorstore_cls=DocArrayInMemorySearch,
-    ).from_loaders([loader])
-    qa = RetrievalQAWithSourcesChain.from_chain_type(
-        llm=llm,
-        chain_type="stuff",
-        retriever=index.vectorstore.as_retriever(),
-    )
-    output = qa({"question": question}, return_only_outputs=True)
-    return output['answer'], output['sources']
-"""
-    Reranks the top articles using crossencoder.
-    Uses cross-encoder/ms-marco-MiniLM-L-6-v2 for embedding / reranking.
-    Input:
-        csv_path: str
-        question: str
-        top_n: int
-    Output:
-        out_values: list of [content, uuid, title]
-"""
 # returns list of top n similar articles using crossencoder
@@ -187,7 +192,7 @@ def crossencoder_rerank_sentencewise_sentence_chunks(csv_path, question, top_n=1
     new_uuids = []
     new_titles = []
     new_domains = []
     for idx in range(len(contents)):
         sents = sent_tokenize(contents[idx])
         sents_merged = []

 # reranks the top articles from a given csv file
+# from langchain_openai import ChatOpenAI
+# from langchain.chains import RetrievalQA
+# from langchain_community.document_loaders.csv_loader import CSVLoader
+# from langchain_community.vectorstores import DocArrayInMemorySearch
 from sentence_transformers import CrossEncoder
 import pandas as pd
 import time
+import nltk
+nltk.download('stopwords')
+nltk.download('punkt')
+from nltk.tokenize import sent_tokenize
 """
 This function rerank top articles (15 -> 4) from a given csv, then sends to LLM
 """
+# def langchain_rerank_answer(csv_path, question, source='url', top_n=4):
+#     llm = ChatOpenAI(temperature=0.0)
+#     loader = CSVLoader(csv_path, source_column="url")
+#     index = VectorstoreIndexCreator(
+#         vectorstore_cls=DocArrayInMemorySearch,
+#     ).from_loaders([loader])
+#     # prompt_template = """You are an a chatbot that answers tobacco related questions with source. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.
+#     # {context}
+#     # Question: {question}"""
+#     # PROMPT = PromptTemplate(
+#     # template=prompt_template, input_variables=["context", "question"]
+#     # )
+#     # chain_type_kwargs = {"prompt": PROMPT}
+#     qa = RetrievalQA.from_chain_type(
+#         llm=llm,
+#         chain_type="stuff",
+#         retriever=index.vectorstore.as_retriever(),
+#         verbose=False,
+#         return_source_documents=True,
+#         # chain_type_kwargs=chain_type_kwargs,
+#         # chain_type_kwargs = {
+#         #     "document_separator": "<<<<>>>>>"
+#         # },
+#     )
+#     answer = qa({"query": question})
+#     sources = answer['source_documents']
+#     sources_out = [source.metadata['source'] for source in sources]
+#     return answer['result'], sources_out
+# """
+#     Langchain with sources.
+#     This function is deprecated. Use langchain_RAG instead.
+# """
+# def langchain_with_sources(csv_path, question, top_n=4):
+#     llm = ChatOpenAI(temperature=0.0)
+#     loader = CSVLoader(csv_path, source_column="uuid")
+#     index = VectorstoreIndexCreator(
+#         vectorstore_cls=DocArrayInMemorySearch,
+#     ).from_loaders([loader])
+#     qa = RetrievalQAWithSourcesChain.from_chain_type(
+#         llm=llm,
+#         chain_type="stuff",
+#         retriever=index.vectorstore.as_retriever(),
+#     )
+#     output = qa({"question": question}, return_only_outputs=True)
+#     return output['answer'], output['sources']
+# """
+#     Reranks the top articles using crossencoder.
+#     Uses cross-encoder/ms-marco-MiniLM-L-6-v2 for embedding / reranking.
+#     Input:
+#         csv_path: str
+#         question: str
+#         top_n: int
+#     Output:
+#         out_values: list of [content, uuid, title]
+# """
 # returns list of top n similar articles using crossencoder
     new_uuids = []
     new_titles = []
     new_domains = []
     for idx in range(len(contents)):
         sents = sent_tokenize(contents[idx])
         sents_merged = []