{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8acae3ed-2953-45a3-aba9-0327b6ae3679",
   "metadata": {},
   "source": [
    "### ChromaDB method - create vectorstore based on Chroma\n",
    "\n",
    "Parse the raw documents into nodes, embed them with the local fine-tuned embedding model, and persist the vectors in the `quickstart` Chroma collection under `../models/chroma_db`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.core import ServiceContext\n",
    "from llama_index.core import Document\n",
    "\n",
    "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
    "from llama_index.core import Settings\n",
    "\n",
    "import nest_asyncio\n",
    "nest_asyncio.apply()\n",
    "\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load the raw documents that make up the knowledge base\n",
    "documents = SimpleDirectoryReader(input_files=[\n",
    "    \"../raw_documents/overview_background.txt\",\n",
    "    \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
    "    \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
    "    \"../raw_documents/qna.txt\"\n",
    "    ]).load_data()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialize client, setting path to save data\n",
    "db = chromadb.PersistentClient(path=\"../models/chroma_db\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create collection (or open it if an earlier run already created it)\n",
    "chroma_collection = db.get_or_create_collection(\"quickstart\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign chroma as the vector_store to the context\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
   "metadata": {},
   "outputs": [],
   "source": [
    "# no LLM is configured for the indexing pass; only the embedding model is set\n",
    "Settings.llm = None\n",
    "Settings.chunk_size = 1024\n",
    "Settings.embed_model = \"local:../models/fine-tuned-embeddings\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# split the loaded documents into nodes with the configured node parser\n",
    "nodes = Settings.node_parser.get_nodes_from_documents(documents)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
   "metadata": {},
   "outputs": [],
   "source": [
    "storage_context.docstore.add_documents(nodes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start the clock, build the index and report the elapsed time in ONE cell,\n",
    "# so the figure excludes the pause between manually executed cells and the\n",
    "# query-engine setup that follows.\n",
    "start_time = time.time()\n",
    "\n",
    "if chroma_collection.count() == 0:\n",
    "    vector_index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
    "else:\n",
    "    # The persistent collection already holds vectors from an earlier run.\n",
    "    # Indexing again would append a second copy of every node, so reuse the\n",
    "    # stored vectors. To force a rebuild, first call\n",
    "    # db.delete_collection(\"quickstart\") and re-run from the collection cell.\n",
    "    print(\"Collection 'quickstart' already holds vectors - reusing it instead of re-indexing.\")\n",
    "    vector_index = VectorStoreIndex.from_vector_store(vector_store=vector_store)\n",
    "\n",
    "indexing_cost = time.time() - start_time\n",
    "indexing_cost = indexing_cost / 60\n",
    "print(f\"Indexing time: {indexing_cost:.1f} mins\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "082a0d7e-b025-4db1-be2a-7a0b7bc453b9",
   "metadata": {},
   "outputs": [],
   "source": [
    "vector_query_engine = vector_index.as_query_engine()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# smoke-test retrieval against the freshly built index\n",
    "response = vector_query_engine.query(\"what is the healthcare philosophy in singapore\")\n",
    "response"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7fc01f6-4738-415b-a96b-afd6cf8d789a",
   "metadata": {},
   "source": [
    "### ChromaDB method - load vectorstore based on Chroma\n",
    "\n",
    "Re-open the persisted `quickstart` collection, wrap it in an index, and query it through a chat engine and a streaming query engine. This section repeats its own imports so that it can be run on its own."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import chromadb\n",
    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
    "from llama_index.vector_stores.chroma.base import ChromaVectorStore\n",
    "from llama_index.core import StorageContext\n",
    "from llama_index.core import ServiceContext\n",
    "from llama_index.core import Document\n",
    "from llama_index.core import Settings\n",
    "\n",
    "from llama_index.embeddings.huggingface.base import HuggingFaceEmbedding\n",
    "from llama_index.llms.openai import OpenAI\n",
    "from llama_index.core.memory import ChatMemoryBuffer\n",
    "\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
   "metadata": {},
   "outputs": [],
   "source": [
    "# same embedding model that the create section sets before indexing\n",
    "fine_tuned_path = \"local:../models/fine-tuned-embeddings\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "llm = OpenAI(model=\"gpt-3.5-turbo-1106\", temperature=0.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0583e9b0-d977-488c-8331-46dfa749924c",
   "metadata": {},
   "outputs": [],
   "source": [
    "Settings.llm = llm\n",
    "Settings.embed_model = fine_tuned_path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "db = chromadb.PersistentClient(path=\"../models/chroma_db\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1b385644-b46e-4d13-88fa-9f4af39db405",
   "metadata": {},
   "outputs": [],
   "source": [
    "chroma_collection = db.get_or_create_collection(\"quickstart\")\n",
    "\n",
    "# get_or_create_collection creates an empty collection when the path or name\n",
    "# does not match an existing one, so fail loudly here instead of answering\n",
    "# questions from an empty index later on.\n",
    "assert chroma_collection.count() > 0, (\n",
    "    \"Chroma collection 'quickstart' is empty - run the create section first\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# assign chroma as the vector_store to the context\n",
    "vector_store = ChromaVectorStore(chroma_collection=chroma_collection)\n",
    "storage_context = StorageContext.from_defaults(vector_store=vector_store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create your index on top of the vectors already stored in Chroma\n",
    "index = VectorStoreIndex.from_vector_store(\n",
    "    vector_store=vector_store,\n",
    "    storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1a506940-c2b4-4d14-ad93-fd451331c582",
   "metadata": {},
   "outputs": [],
   "source": [
    "system_content = (\"You are a helpful study assistant. \"\n",
    "    \"You do not respond as 'User' or pretend to be 'User'. \"\n",
    "    \"You only respond once as 'Assistant'.\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f592848-8536-4b4d-b34a-adc32d043432",
   "metadata": {},
   "outputs": [],
   "source": [
    "# conversation history buffer used by the chat engine below\n",
    "memory = ChatMemoryBuffer.from_defaults(token_limit=15000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
   "metadata": {},
   "outputs": [],
   "source": [
    "chat_engine = index.as_chat_engine(\n",
    "    chat_mode=\"context\",\n",
    "    memory=memory,\n",
    "    system_prompt=system_content\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = chat_engine.chat(\"what is the healthcare philosophy in singapore\")\n",
    "print(res.response)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dad72f9f-7f86-407d-93be-f5724cb30d5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A query engine answers each question independently and keeps no chat\n",
    "# history, so the chat memory buffer is not passed here (it belongs to\n",
    "# chat_engine above).\n",
    "# NOTE(review): confirm that as_query_engine actually honours system_prompt.\n",
    "hi_engine = index.as_query_engine(\n",
    "    system_prompt=system_content,\n",
    "    similarity_top_k=3,\n",
    "    streaming=True\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7bb7c21a-7461-40c1-87a7-4a1f92f70153",
   "metadata": {},
   "outputs": [],
   "source": [
    "res = hi_engine.query(\"What is llama2?\")\n",
    "# streaming=True returns a streaming response: print tokens as they arrive\n",
    "# instead of waiting for the complete answer.\n",
    "res.print_response_stream()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}