Spaces:
Running
Running
import os | |
import pickle | |
import time | |
import gradio as gr | |
from langchain import OpenAI | |
from langchain.chains import RetrievalQAWithSourcesChain | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.document_loaders import UnstructuredURLLoader | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import FAISS | |
from dotenv import load_dotenv | |
load_dotenv()  # load environment variables from .env (notably the OpenAI API key)
# Define the main function to process URLs and handle queries | |
def process_and_query(url1, url2, url3, query):
    """Answer a question using the content of up to three article URLs.

    The pages are downloaded, split into chunks, embedded into an in-memory
    FAISS index, and then queried through a retrieval QA-with-sources chain.

    Args:
        url1: First article URL; may be blank.
        url2: Second article URL; may be blank.
        url3: Third article URL; may be blank.
        query: The question to ask about the articles.

    Returns:
        A ``(answer, sources_list)`` tuple: the answer text and a list of the
        non-empty source lines reported by the chain. When the input is
        unusable (no URL, no question, or nothing could be loaded), ``answer``
        is a short explanatory message and ``sources_list`` is empty.
    """
    # The UI allows "up to three" URLs, so unused boxes arrive blank; drop
    # them instead of handing empty strings to the loader.
    urls = [u.strip() for u in (url1, url2, url3) if u and u.strip()]
    if not urls:
        return "Please enter at least one article URL.", []

    question = (query or "").strip()
    if not question:
        # Bail out before spending any download / embedding / LLM calls.
        return "Please enter a question.", []

    # Load data
    loader = UnstructuredURLLoader(urls=urls)
    data = loader.load()

    # Split data
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
    )
    docs = text_splitter.split_documents(data)
    if not docs:
        # An index cannot be built from zero documents; report it instead of
        # letting the vector store raise.
        return "No readable content could be loaded from the given URLs.", []

    # Create embeddings and build the FAISS index.
    # The index is queried directly from memory rather than being
    # round-tripped through a pickle file at a fixed path: within a single
    # call the round trip adds nothing, and a shared file would let
    # concurrent requests read each other's index.
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(docs, embeddings)

    # Process the query
    llm = OpenAI(temperature=0.9, max_tokens=500)
    chain = RetrievalQAWithSourcesChain.from_llm(
        llm=llm, retriever=vectorstore.as_retriever()
    )
    result = chain({"question": question}, return_only_outputs=True)
    answer = result["answer"]

    # Extract and format sources, skipping blank lines
    sources = result.get("sources", "")
    sources_list = (
        [line.strip() for line in sources.split("\n") if line.strip()]
        if sources
        else []
    )
    return answer, sources_list
# Build the Gradio UI: three URL boxes and a question box as inputs,
# an answer box and a sources box as outputs.
url1_input, url2_input, url3_input = [
    gr.Textbox(label=f"URL {slot}") for slot in (1, 2, 3)
]
query_input = gr.Textbox(label="Question")
output_text = gr.Textbox(label="Answer")
output_sources = gr.Textbox(label="Sources")

# NOTE(review): the trailing "π" in the title looks like a mis-encoded
# emoji — confirm the intended character before changing it.
interface = gr.Interface(
    title="RockyBot: News Research Tool π",
    description=(
        "Enter up to three news article URLs and ask a question. "
        "The bot will process the articles and provide an answer "
        "along with the sources."
    ),
    fn=process_and_query,
    inputs=[url1_input, url2_input, url3_input, query_input],
    outputs=[output_text, output_sources],
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    interface.launch()