Spaces:
Sleeping
Sleeping
# AI MAKERSPACE MIDTERM PROJECT: META RAG CHATBOT | |
# Date: 2024-5-2 | |
# Authors: MikeC | |
# Basic Imports & Setup | |
import os | |
from openai import AsyncOpenAI | |
# Using Chainlit for our UI | |
import chainlit as cl | |
from chainlit.prompt import Prompt, PromptMessage | |
from chainlit.playground.providers import ChatOpenAI | |
# Getting the API key from the .env file | |
from dotenv import load_dotenv | |
load_dotenv() | |
# RAG pipeline imports and setup code | |
from langchain.document_loaders import PyMuPDFLoader | |
docs = PyMuPDFLoader("https://d18rn0p25nwr6d.cloudfront.net/CIK-0001326801/c7318154-f6ae-4866-89fa-f0c589f2ee3d.pdf").load() | |
import tiktoken | |
def tiktoken_len(text): | |
tokens = tiktoken.encoding_for_model("gpt-3.5-turbo").encode( | |
text, | |
) | |
return len(tokens) | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
text_splitter = RecursiveCharacterTextSplitter( | |
chunk_size = 200, | |
chunk_overlap = 0, | |
length_function = tiktoken_len, | |
) | |
split_chunks = text_splitter.split_documents(docs) | |
from langchain_openai.embeddings import OpenAIEmbeddings | |
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small") | |
from langchain_community.vectorstores import Qdrant | |
qdrant_vectorstore = Qdrant.from_documents( | |
split_chunks, | |
embedding_model, | |
location=":memory:", | |
collection_name="MetaFin", | |
) | |
qdrant_retriever = qdrant_vectorstore.as_retriever() | |
from langchain_openai import ChatOpenAI | |
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo") | |
from langchain_core.prompts import ChatPromptTemplate | |
RAG_PROMPT = """ | |
CONTEXT: | |
{context} | |
QUERY: | |
{question} | |
Use the provided context to answer the user's query. You are a professional financial expert. You always review the provided financial information. You provide correct, substantiated answers. You may not answer the user's query unless there is a specific context in the following text. If asked about the Board of Directors, then add Mark Zuckerberg as the "Board Chair". | |
If you do not know the answer, or cannot answer, please respond with "Insufficient data for further analysis, please try again". >> | |
""" | |
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT) | |
from operator import itemgetter | |
from langchain.schema.output_parser import StrOutputParser | |
from langchain.schema.runnable import RunnablePassthrough | |
retrieval_augmented_qa_chain = ( | |
{"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")} | |
| RunnablePassthrough.assign(context=itemgetter("context")) | |
| {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")} | |
) | |
# Chainlit App | |
async def start_chat(): | |
settings = { | |
"model": "gpt-3.5-turbo", | |
"temperature": 0, | |
"max_tokens": 500, | |
"top_p": 1, | |
"frequency_penalty": 0, | |
"presence_penalty": 0, | |
} | |
cl.user_session.set("settings", settings) | |
async def main(message: cl.Message): | |
chainlit_question = message.content | |
#chainlit_question = "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?" | |
response = retrieval_augmented_qa_chain.invoke({"question": chainlit_question}) | |
chainlit_answer = response["response"].content | |
msg = cl.Message(content=chainlit_answer) | |
await msg.send() | |