In [1]:
from langchain.document_loaders import PyMuPDFLoader, PyPDFLoader

In [3]:
import os
import getpass

# Only prompt when the key is not already present in the environment, so the
# cell can be re-run (or executed by a non-interactive runner that already
# exports the key) without blocking on input. getpass keeps the key out of
# the notebook source and the recorded output.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")

In [4]:
from langchain_openai import ChatOpenAI

# Chat model that generates the final answer in the RAG chain defined further down.
openai_chat_model = ChatOpenAI(model="gpt-3.5-turbo") #gpt-4o

In [5]:
from langchain_core.prompts import ChatPromptTemplate

In [6]:
# Local path (relative to the notebook's working directory) of the PDF to index.
data_path = "data/airbnb_midterm.pdf"
# Presumably the public download location of the same document - TODO confirm.
data_url = "https://airbnb2020ipo.q4web.com/files/doc_financials/2024/q1/fdb60f7d-e616-43dc-86ef-e33d3a9bdd05.pdf"

In [7]:
# Load the PDF into LangChain documents. Prefer the local copy; when it is
# missing (e.g. on a fresh checkout) fall back to data_url, which the loader
# accepts as a web path and downloads itself.
pdf_source = data_path if os.path.isfile(data_path) else data_url
docs = PyMuPDFLoader(pdf_source).load()

In [8]:
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter

def tiktoken_len(text, model_name="gpt-4o"):
    """Return the number of tokens ``text`` occupies under a model's tokenizer.

    Args:
        text: The string to tokenize.
        model_name: Name of the model whose tiktoken encoding is used.
            Defaults to "gpt-4o" (the value that used to be hard-coded), so
            the function can still be passed directly as a one-argument
            ``length_function``. Pass another model name to count tokens
            with that model's tokenizer instead.

    Returns:
        The token count as an int.
    """
    encoding = tiktoken.encoding_for_model(model_name)
    return len(encoding.encode(text))

In [9]:
# chunk_size and chunk_overlap are measured with tiktoken_len (tokens),
# not with the default character count.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=tiktoken_len,
    chunk_size=500,
    chunk_overlap=10,
)

# Break the loaded documents into chunks for embedding.
split_chunks = text_splitter.split_documents(docs)

In [10]:
# Sanity check: the largest chunk, measured with the same length function the
# splitter used. Falls back to 0 when there are no chunks.
max_chunk_length = max(
    (tiktoken_len(piece.page_content) for piece in split_chunks),
    default=0,
)

print(max_chunk_length)

495


In [11]:
from langchain_openai.embeddings import OpenAIEmbeddings

# Embedding model handed to the vector store below to embed the chunks.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

In [12]:
from utils.custom_retriver import CustomQDrant, CustomVectorStoreRetriever



In [13]:


# Build the vector store from the chunks and the embedding model.
# NOTE(review): location=":memory:" presumably selects an in-process,
# non-persistent Qdrant instance, and score_threshold is presumably a minimum
# similarity cut-off handled by the project's CustomQDrant - confirm both in
# utils.custom_retriver.
qdrant_vectorstore = CustomQDrant.from_documents(
    split_chunks,
    embedding_model,
    collection_name="air bnb data",
    location=":memory:",
    score_threshold=0.3,
)

# Retriever view over the store; this is what the RAG chain queries.
qdrant_retriever = qdrant_vectorstore.as_retriever()

In [14]:
# Smoke-test the retriever: show only the first result for a sample query.
# NOTE(review): the recorded output is a tuple starting with a Document, so the
# custom retriever appears to return tuples rather than bare Documents - confirm
# in utils.custom_retriver.
qdrant_retriever.invoke("Where did air bnb started")[0]

(Document(page_content='Table of Contents\nPART I - FINANCIAL INFORMATION\nItem 1. Condensed Consolidated Financial Statements\nAirbnb, Inc.\nCondensed Consolidated Balance Sheets\n(in millions, except par value)\n(unaudited)\nDecember 31,\n2023\nMarch 31,\n2024\nAssets\nCurrent assets:\nCash and cash equivalents\n$\n6,874\xa0 $\n7,829\xa0\nShort-term investments (including assets reported at fair value of $2,507 and $2,524, respectively)\n3,197\xa0\n3,264\xa0\nFunds receivable and amounts held on behalf of customers\n5,869\xa0\n8,737\xa0\nPrepaids and other current assets (including customer receivables of $249 and $212 and allowances of $44 and $37, respectively)\n569\xa0\n563\xa0\nTotal current assets\n16,509\xa0\n20,393\xa0\nDeferred tax assets, net\n2,881\xa0\n2,886\xa0\nGoodwill and intangible assets, net\n792\xa0\n786\xa0\nOther assets, noncurrent\n463\xa0\n472\xa0\nTotal assets\n$\n20,645\xa0 $\n24,537\xa0\nLiabilities and Stockholders’ Equity\nCurrent liabilities:\nAccrued exp

In [15]:
# Prompt template with two placeholders, {context} and {question}. The
# instructions after the placeholders tell the model to answer from the supplied
# context only and to say "I don't know!" otherwise.
RAG_PROMPT = """
CONTEXT:
{context}

QUERY:
{question}

Answer questions only based on provided context and not your previous knowledge. 
In your answer never mention phrases like Based on provided context, From the context etc.
If you don't know the answer say I don't know!
"""

# Wrap the raw template so it can be piped into the chat model in the chain.
rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)

In [16]:
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

retrieval_augmented_qa_chain = (
 # INVOKE CHAIN WITH: {"question" : "<>"}
 # Step 1 - build the prompt inputs:
 # "context" : the value of the "question" key piped into qdrant_retriever
 # "question" : the value of the "question" key, passed along unchanged
 {"context": itemgetter("question") | qdrant_retriever, "question": itemgetter("question")}
 # Step 2 - re-assigns "context" to its own value, so the data is effectively unchanged.
 # Do not delete this step without replacing it: it is what makes the two dict literals
 # compose as a chain. With nothing but dicts on both sides, `{...} | {...}` would be an
 # ordinary Python dict merge (3.9+) rather than a runnable pipeline.
 | RunnablePassthrough.assign(context=itemgetter("context"))
 # Step 3 - produce the final output dict:
 # "response" : "context" and "question" format the prompt, which is piped into the LLM;
 # the resulting message is stored under "response"
 # "context" : the retrieved context from the previous step, returned alongside the answer
 | {"response": rag_prompt | openai_chat_model, "context": itemgetter("context")}
)

In [17]:
# Run the full chain once. The result is a dict with "response" (the model's
# message) and "context" (what the retriever returned).
response = retrieval_augmented_qa_chain.invoke({"question" : "What is Airbnb's 'Description of Business'?"})

In [33]:
chunks = []

async for chunk in retrieval_augmented_qa_chain.astream({"question" : "What is Airbnb's 'Description of Business'?"}):
 chunks.append(chunk)
 if "context" not in chunk.keys():
 print(chunk["response"].content, end="|", flush=True)

|Air|bnb|,| Inc|.| was| incorporated| in| Delaware| in| June| |200|8| and| is| headquartered| in| San| Francisco|,| California|.| The| company| operates| a| global| platform| for| unique| stays| and| experiences|.| Its| marketplace| model| connects| hosts| and| guests| online| or| through| mobile| devices| to| book| spaces| and| experiences| around| the| world|.||

In [32]:
# Content of the second streamed chunk (an empty string in the recorded output).
# NOTE(review): this cell's execution count (32) is lower than that of the
# streaming cell above it (33), so the recorded output came from an earlier
# `chunks` list - re-run in order to confirm.
chunks[1]["response"].content

''

In [18]:
# Display the whole chain result: the model's message plus the retrieved context.
response

{'response': AIMessage(content="Airbnb, Inc. operates a global platform for unique stays and experiences. The Company's marketplace model connects hosts and guests online or through mobile devices to book spaces and experiences around the world.", response_metadata={'token_usage': {'completion_tokens': 38, 'prompt_tokens': 3067, 'total_tokens': 3105}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-4b846bb5-6910-4379-a3e3-e6728cca4788-0', usage_metadata={'input_tokens': 3067, 'output_tokens': 38, 'total_tokens': 3105}),
 'context': [(Document(page_content='Table of Contents\nAirbnb, Inc.\nNotes to Condensed Consolidated Financial Statements (unaudited)\nNote 1. Description of Business\nAirbnb,\xa0Inc. (the “Company” or “Airbnb”) was incorporated in Delaware in June\xa02008 and is headquartered in San Francisco, California. The Company\xa0operates\xa0a global platform for\nunique stays and experiences. The Company’s marketpl

In [18]:
# Query the vector store directly (bypassing the retriever) for the same sample
# question; the recorded output is a list of tuples that each start with a Document.
qdrant_vectorstore.similarity_search_with_score("Where did air bnb started")

[(Document(page_content='Table of Contents\nPART I - FINANCIAL INFORMATION\nItem 1. Condensed Consolidated Financial Statements\nAirbnb, Inc.\nCondensed Consolidated Balance Sheets\n(in millions, except par value)\n(unaudited)\nDecember 31,\n2023\nMarch 31,\n2024\nAssets\nCurrent assets:\nCash and cash equivalents\n$\n6,874\xa0 $\n7,829\xa0\nShort-term investments (including assets reported at fair value of $2,507 and $2,524, respectively)\n3,197\xa0\n3,264\xa0\nFunds receivable and amounts held on behalf of customers\n5,869\xa0\n8,737\xa0\nPrepaids and other current assets (including customer receivables of $249 and $212 and allowances of $44 and $37, respectively)\n569\xa0\n563\xa0\nTotal current assets\n16,509\xa0\n20,393\xa0\nDeferred tax assets, net\n2,881\xa0\n2,886\xa0\nGoodwill and intangible assets, net\n792\xa0\n786\xa0\nOther assets, noncurrent\n463\xa0\n472\xa0\nTotal assets\n$\n20,645\xa0 $\n24,537\xa0\nLiabilities and Stockholders’ Equity\nCurrent liabilities:\nAccrued ex

In [19]:
# Same sample question through the retriever, this time showing the full result
# list for comparison with the direct vector-store query.
qdrant_retriever.invoke("Where did air bnb started")

[(Document(page_content='Table of Contents\nPART I - FINANCIAL INFORMATION\nItem 1. Condensed Consolidated Financial Statements\nAirbnb, Inc.\nCondensed Consolidated Balance Sheets\n(in millions, except par value)\n(unaudited)\nDecember 31,\n2023\nMarch 31,\n2024\nAssets\nCurrent assets:\nCash and cash equivalents\n$\n6,874\xa0 $\n7,829\xa0\nShort-term investments (including assets reported at fair value of $2,507 and $2,524, respectively)\n3,197\xa0\n3,264\xa0\nFunds receivable and amounts held on behalf of customers\n5,869\xa0\n8,737\xa0\nPrepaids and other current assets (including customer receivables of $249 and $212 and allowances of $44 and $37, respectively)\n569\xa0\n563\xa0\nTotal current assets\n16,509\xa0\n20,393\xa0\nDeferred tax assets, net\n2,881\xa0\n2,886\xa0\nGoodwill and intangible assets, net\n792\xa0\n786\xa0\nOther assets, noncurrent\n463\xa0\n472\xa0\nTotal assets\n$\n20,645\xa0 $\n24,537\xa0\nLiabilities and Stockholders’ Equity\nCurrent liabilities:\nAccrued ex

In [20]:
# Ask a numeric lookup question. This rebinds `response`, replacing the result
# of the previous invocation.
response = retrieval_augmented_qa_chain.invoke({"question" : "What was the total value of 'Cash and cash equivalents' as of December 31, 2023?"})

In [30]:
# Text of the second retrieved item. The extra [0] takes the first element of
# that item, i.e. each context entry is treated as a sequence whose first
# element is the Document.
# NOTE(review): executed out of order (count 30), so `response` may not be the
# result of the cell directly above - confirm after a clean re-run.
response["context"][1][0].page_content

'The effect of exchange rate changes on cash, cash equivalents, and restricted cash on our consolidated statements of cash flows relates to certain of our assets, principally cash\nbalances held on behalf of customers, that are denominated in currencies other than the functional currency of certain of our subsidiaries. For the three months ended March 31,\n2024, we recorded a decrease of $111 million in cash, cash equivalents, and restricted cash, primarily due to the strengthening of the U.S. dollar. The impact of exchange rate\nchanges on cash balances can serve as a natural hedge for the effect of exchange rates on our liabilities to our hosts and guests.\nWe assess our liquidity in terms of our ability to generate cash to fund our short- and long-term cash requirements. As such, we believe that the cash flows generated from operating\nactivities will meet our anticipated cash requirements in the short-term. In addition to normal working capital requirements, we anticipate that our 

In [26]:
# Ask about a single table cell in the filing; rebinds `response` again.
response = retrieval_augmented_qa_chain.invoke({"question" : "What is the 'maximum number of shares to be sold under the 10b5-1 Trading plan' by Brian Chesky?"})

In [27]:
# Just the answer text of the model's message.
response["response"].content

'1,146,000'

In [28]:
# Retrieved context that was returned together with the answer.
# NOTE(review): the recorded output here is a list of bare Document objects,
# while other recorded outputs in this notebook show tuples starting with a
# Document - the retriever's return shape seems to have changed between runs;
# confirm and re-run the notebook top to bottom.
response["context"]

[Document(page_content='Table of Contents\nItem 5. Other Information\nDirector and Officer 10b5-1 Trading Plans (“10b5-1 Plans”)\nThe following table sets forth the material terms of 10b5-1 Plans intended to satisfy the affirmative defense conditions of Rule 10b5–1(c) that were adopted, terminated, or modified\nby our directors and officers during the three months ended March\xa031, 2024:\nName and Title of Director or Officer\nAction\n\xa0Date\nExpiration Date\nMaximum Number of\nShares to be Sold\nUnder the Plan\nDavid Bernstein, Chief Accounting Officer\nAdopt\n2/22/2024\n1/27/2025\n41,000\nBrian Chesky, Chief Executive Officer and Director\nAdopt\n2/28/2024\n11/11/2024\n1,146,000\nJoseph Gebbia, Director\nAdopt\n2/29/2024\n10/31/2024\n1,322,523\nThere were no “non-Rule 10b5-1 trading arrangements,” as defined in Item 408(c) of Regulation S-K, adopted, terminated, or modified by our directors or officers during the three\nmonths ended March\xa031, 2024.\nItem 6. Exhibits\nThe docume