lingyit1108 committed on
Commit
70e2d85
1 Parent(s): 5ea4259

resolved pdf reader issue

Browse files
config/model_config_advanced.yml CHANGED
@@ -11,7 +11,7 @@ embeddings:
11
  fine_tuned_embedding_model: 'local:models/fine-tuned-embeddings-advanced'
12
 
13
  vector_store:
14
- persisted_path: './models/chroma_db_advanced'
15
 
16
  questionaire_data:
17
  db_path: './database/mock_qna.sqlite'
 
11
  fine_tuned_embedding_model: 'local:models/fine-tuned-embeddings-advanced'
12
 
13
  vector_store:
14
+ persisted_path: './models/chroma_db_advanced_corrected'
15
 
16
  questionaire_data:
17
  db_path: './database/mock_qna.sqlite'
database/mock_qna.sqlite CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23973f07315b6b1f3aaa1d3b90263a9e60518724f8a077cd6fdb44d809db6da4
3
  size 40960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7f3f8c146d46df19f3dd8a4846ccbf63f88e6dd914b67a0c5c689eba21a558d
3
  size 40960
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c7deba9397301d1ea3f7f5edcb06162bc4797984100c456b128303c58b95c79
3
+ size 31844000
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1fdac383bac7a3236c814029cee7525e8018d396f9cd0d15b97a22a3af9090d8
3
+ size 100
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/index_metadata.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:175eec7edc141af44f996bba0295ddb71d3fd54c39b1352539ede6753f00e834
3
+ size 1100226
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68f24a06f63a85c1b082283b0be703475bf9023e9f4f7e8b3bd4bca276af3b8f
3
+ size 76000
models/chroma_db_advanced_corrected/7ffa7a70-bc3b-4a97-858f-3c8223492b3e/link_lists.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:224e61d8e99cd63b57c6e9204f3136aba4a43f9bb15d7dcd5eb181c9378829f8
3
+ size 167188
models/chroma_db_advanced_corrected/chroma.sqlite3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf347334a74a91a9d24d298ad1b12fa043579fe0b915cf7bc558f3d2acafca5c
3
+ size 299061248
notebooks/002_persisted-embedding-model-advanced.ipynb CHANGED
@@ -10,7 +10,7 @@
10
  },
11
  {
12
  "cell_type": "code",
13
- "execution_count": 1,
14
  "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
15
  "metadata": {},
16
  "outputs": [],
@@ -33,24 +33,16 @@
33
  "import nest_asyncio\n",
34
  "nest_asyncio.apply()\n",
35
  "\n",
36
- "import time"
 
37
  ]
38
  },
39
  {
40
  "cell_type": "code",
41
- "execution_count": 2,
42
  "id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
43
  "metadata": {},
44
- "outputs": [
45
- {
46
- "name": "stderr",
47
- "output_type": "stream",
48
- "text": [
49
- "199it [00:00, 8821.71it/s]\n",
50
- "200it [00:00, 12584.17it/s]\n"
51
- ]
52
- }
53
- ],
54
  "source": [
55
  "split_content(filepath=\"../raw_documents/answers.txt\", \n",
56
  " separator=\"\\n\\n\", \n",
@@ -63,7 +55,7 @@
63
  },
64
  {
65
  "cell_type": "code",
66
- "execution_count": 5,
67
  "id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
68
  "metadata": {},
69
  "outputs": [],
@@ -84,41 +76,64 @@
84
  {
85
  "cell_type": "code",
86
  "execution_count": null,
87
- "id": "e876a26b-822d-44d6-a3dd-ccdcc04933cf",
88
  "metadata": {},
89
  "outputs": [],
90
  "source": []
91
  },
92
  {
93
  "cell_type": "code",
94
- "execution_count": 7,
95
  "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
96
  "metadata": {},
97
  "outputs": [],
98
  "source": [
99
  "# load some documents\n",
100
- "documents = SimpleDirectoryReader(input_files=[\n",
101
- " \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
102
- " \"../raw_documents/conversation_examples.txt\",\n",
103
- " \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
104
- " ] + answers_temp_files + qna_temp_files ).load_data()\n",
105
- "document = Document(text=\"\\n\\n\".join([doc.text for doc in documents]))"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  ]
107
  },
108
  {
109
  "cell_type": "code",
110
- "execution_count": 8,
 
 
 
 
 
 
 
 
111
  "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
112
  "metadata": {},
113
  "outputs": [],
114
  "source": [
115
  "# initialize client, setting path to save data\n",
116
- "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced\")"
117
  ]
118
  },
119
  {
120
  "cell_type": "code",
121
- "execution_count": 9,
122
  "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
123
  "metadata": {},
124
  "outputs": [],
@@ -129,7 +144,7 @@
129
  },
130
  {
131
  "cell_type": "code",
132
- "execution_count": 10,
133
  "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
134
  "metadata": {},
135
  "outputs": [],
@@ -148,18 +163,10 @@
148
  },
149
  {
150
  "cell_type": "code",
151
- "execution_count": 11,
152
  "id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
153
  "metadata": {},
154
- "outputs": [
155
- {
156
- "name": "stdout",
157
- "output_type": "stream",
158
- "text": [
159
- "LLM is explicitly disabled. Using MockLLM.\n"
160
- ]
161
- }
162
- ],
163
  "source": [
164
  "Settings.llm = None\n",
165
  "Settings.chunk_size = 1024\n",
@@ -169,31 +176,20 @@
169
  },
170
  {
171
  "cell_type": "code",
172
- "execution_count": 12,
173
  "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
174
  "metadata": {},
175
  "outputs": [],
176
  "source": [
177
- "nodes = Settings.node_parser.get_nodes_from_documents(documents)"
178
  ]
179
  },
180
  {
181
  "cell_type": "code",
182
- "execution_count": 13,
183
  "id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
184
  "metadata": {},
185
- "outputs": [
186
- {
187
- "data": {
188
- "text/plain": [
189
- "6814"
190
- ]
191
- },
192
- "execution_count": 13,
193
- "metadata": {},
194
- "output_type": "execute_result"
195
- }
196
- ],
197
  "source": [
198
  "len(nodes)"
199
  ]
@@ -208,7 +204,7 @@
208
  },
209
  {
210
  "cell_type": "code",
211
- "execution_count": 14,
212
  "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
213
  "metadata": {},
214
  "outputs": [],
@@ -218,7 +214,7 @@
218
  },
219
  {
220
  "cell_type": "code",
221
- "execution_count": 15,
222
  "id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
223
  "metadata": {},
224
  "outputs": [],
@@ -236,7 +232,7 @@
236
  },
237
  {
238
  "cell_type": "code",
239
- "execution_count": 16,
240
  "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
241
  "metadata": {},
242
  "outputs": [],
@@ -246,7 +242,7 @@
246
  },
247
  {
248
  "cell_type": "code",
249
- "execution_count": 17,
250
  "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
251
  "metadata": {},
252
  "outputs": [],
@@ -256,18 +252,10 @@
256
  },
257
  {
258
  "cell_type": "code",
259
- "execution_count": 18,
260
  "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
261
  "metadata": {},
262
- "outputs": [
263
- {
264
- "name": "stdout",
265
- "output_type": "stream",
266
- "text": [
267
- "Indexing time: 2.3 mins\n"
268
- ]
269
- }
270
- ],
271
  "source": [
272
  "indexing_cost = time.time() - start_time\n",
273
  "indexing_cost = indexing_cost / 60\n",
@@ -276,7 +264,7 @@
276
  },
277
  {
278
  "cell_type": "code",
279
- "execution_count": 19,
280
  "id": "f16cca33-71fb-437d-a033-671b9fd44054",
281
  "metadata": {},
282
  "outputs": [],
@@ -286,28 +274,28 @@
286
  },
287
  {
288
  "cell_type": "code",
289
- "execution_count": 20,
290
  "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
291
  "metadata": {
292
  "scrolled": true
293
  },
294
- "outputs": [
295
- {
296
- "data": {
297
- "text/plain": [
298
- "Response(response='Context information is below.\\n---------------------\\nfile_path: ../raw_documents/answers_temp/answers_050.txt\\n\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".\\n\\nfile_path: ../raw_documents/qna_temp/qna_050.txt\\n\\nC1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".\\n---------------------\\nGiven the context information and not prior knowledge, answer the query.\\nQuery: Healthcare System in Singapore consists of?\\nAnswer: ', source_nodes=[NodeWithScore(node=TextNode(id_='536fef67-6a3f-4054-a94a-cc9143599510', embedding=None, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='2b0f7dad-c532-4abd-8c42-f53383a4fc76', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='5b1d1dc729a663e4ccfacc0f18adf0f6644a2a7d2991490fd962d1550c83f2ff'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='6d93c092-b4cc-4b5b-b379-080d777d3908', 
node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/answers_temp/answers_044.txt', 'file_name': 'answers_044.txt', 'file_type': 'text/plain', 'file_size': 164, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='caeb59043b8daa56ed472941882947570abff951f64aa0498672aba5921fac1d'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='859a9958-6f5d-4581-95d0-39edfc950ef5', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='8416454b2fbad3e6122c5151d2b3d1eadf0afde3514ba09374c71e96baf712bc')}, text='Question: The fundamental principle of Singapore healthcare financing is ____________.\\nAnswer: The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=130, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4159636550867191), NodeWithScore(node=TextNode(id_='472000ae-a0aa-4464-a200-72fe67a3fbde', embedding=None, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='506fb715-d3b0-4ca7-b7ca-011a1e1a1f0d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='7461ffa12ff6729003131976b82995b7254ab10f8dc7d79c65988ec9e3b7b631'), <NodeRelationship.PREVIOUS: '2'>: 
RelatedNodeInfo(node_id='d8232b90-d641-4966-b98f-4ca0821db773', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': '../raw_documents/qna_temp/qna_044.txt', 'file_name': 'qna_044.txt', 'file_type': 'text/plain', 'file_size': 383, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, hash='cbeb00c29c6130548466697a862fee43ab2be92d84158cc0b69c2f5c7bbe68b1'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='e772e623-cf91-41cd-a516-50acb894eb54', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='a7583b0fd46f98d0118c712632277d81f417b779f8bcc100ab2558dae6317cde')}, text='C1/5\\nQuestion: The fundamental principle of Singapore healthcare financing is ____________.\\nA. The 3’s M. That is Medisave, Medishield, Medifund.\\nB. Means Testing and Casemix.\\nC. Individual Savings.\\nD. Tax based subsidies and government subvention.\\nAnswer: C. The answer is \"Individual Savings\".', start_char_idx=0, end_char_idx=295, text_template='{metadata_str}\\n\\n{content}', metadata_template='{key}: {value}', metadata_seperator='\\n'), score=0.4126648577998099)], metadata={'536fef67-6a3f-4054-a94a-cc9143599510': {'file_path': '../raw_documents/answers_temp/answers_050.txt', 'file_name': 'answers_050.txt', 'file_type': 'text/plain', 'file_size': 130, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}, '472000ae-a0aa-4464-a200-72fe67a3fbde': {'file_path': '../raw_documents/qna_temp/qna_050.txt', 'file_name': 'qna_050.txt', 'file_type': 'text/plain', 'file_size': 297, 'creation_date': '2024-02-24', 'last_modified_date': '2024-02-24', 'last_accessed_date': '2024-02-24'}})"
299
- ]
300
- },
301
- "execution_count": 20,
302
- "metadata": {},
303
- "output_type": "execute_result"
304
- }
305
- ],
306
  "source": [
307
  "response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
308
  "response"
309
  ]
310
  },
 
 
 
 
 
 
 
 
 
 
 
311
  {
312
  "cell_type": "code",
313
  "execution_count": null,
@@ -318,7 +306,7 @@
318
  },
319
  {
320
  "cell_type": "code",
321
- "execution_count": 21,
322
  "id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
323
  "metadata": {},
324
  "outputs": [],
@@ -329,7 +317,7 @@
329
  },
330
  {
331
  "cell_type": "code",
332
- "execution_count": 22,
333
  "id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
334
  "metadata": {},
335
  "outputs": [],
@@ -364,7 +352,7 @@
364
  },
365
  {
366
  "cell_type": "code",
367
- "execution_count": 1,
368
  "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
369
  "metadata": {},
370
  "outputs": [],
@@ -381,7 +369,14 @@
381
  "from llama_index.llms.openai import OpenAI\n",
382
  "from llama_index.core.memory import ChatMemoryBuffer\n",
383
  "\n",
384
- "import time"
 
 
 
 
 
 
 
385
  ]
386
  },
387
  {
@@ -394,7 +389,7 @@
394
  },
395
  {
396
  "cell_type": "code",
397
- "execution_count": 2,
398
  "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
399
  "metadata": {},
400
  "outputs": [],
@@ -404,7 +399,7 @@
404
  },
405
  {
406
  "cell_type": "code",
407
- "execution_count": 3,
408
  "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
409
  "metadata": {},
410
  "outputs": [],
@@ -414,7 +409,7 @@
414
  },
415
  {
416
  "cell_type": "code",
417
- "execution_count": 4,
418
  "id": "0583e9b0-d977-488c-8331-46dfa749924c",
419
  "metadata": {},
420
  "outputs": [],
@@ -433,17 +428,17 @@
433
  },
434
  {
435
  "cell_type": "code",
436
- "execution_count": 5,
437
  "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
438
  "metadata": {},
439
  "outputs": [],
440
  "source": [
441
- "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced\")"
442
  ]
443
  },
444
  {
445
  "cell_type": "code",
446
- "execution_count": 6,
447
  "id": "1b385644-b46e-4d13-88fa-9f4af39db405",
448
  "metadata": {},
449
  "outputs": [],
@@ -453,7 +448,7 @@
453
  },
454
  {
455
  "cell_type": "code",
456
- "execution_count": 7,
457
  "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
458
  "metadata": {},
459
  "outputs": [],
@@ -465,7 +460,7 @@
465
  },
466
  {
467
  "cell_type": "code",
468
- "execution_count": 8,
469
  "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
470
  "metadata": {},
471
  "outputs": [],
@@ -487,20 +482,7 @@
487
  },
488
  {
489
  "cell_type": "code",
490
- "execution_count": 9,
491
- "id": "1a506940-c2b4-4d14-ad93-fd451331c582",
492
- "metadata": {},
493
- "outputs": [],
494
- "source": [
495
- "system_content = (\"You are a helpful study assistant. \"\n",
496
- " \"You do not respond as 'User' or pretend to be 'User'. \"\n",
497
- " \"You only respond once as 'Assistant'.\"\n",
498
- ")"
499
- ]
500
- },
501
- {
502
- "cell_type": "code",
503
- "execution_count": 10,
504
  "id": "3f592848-8536-4b4d-b34a-adc32d043432",
505
  "metadata": {},
506
  "outputs": [],
@@ -510,7 +492,7 @@
510
  },
511
  {
512
  "cell_type": "code",
513
- "execution_count": 11,
514
  "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
515
  "metadata": {},
516
  "outputs": [],
@@ -524,7 +506,7 @@
524
  },
525
  {
526
  "cell_type": "code",
527
- "execution_count": 12,
528
  "id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
529
  "metadata": {},
530
  "outputs": [],
@@ -532,7 +514,7 @@
532
  "hi_engine = index.as_query_engine(\n",
533
  " memory=memory,\n",
534
  " system_prompt=system_content,\n",
535
- " similarity_top_k=10,\n",
536
  " streaming=True\n",
537
  ")"
538
  ]
@@ -547,7 +529,7 @@
547
  },
548
  {
549
  "cell_type": "code",
550
- "execution_count": 24,
551
  "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
552
  "metadata": {},
553
  "outputs": [],
@@ -563,21 +545,14 @@
563
  },
564
  {
565
  "cell_type": "code",
566
- "execution_count": 26,
567
  "id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
568
  "metadata": {},
569
- "outputs": [
570
- {
571
- "name": "stdout",
572
- "output_type": "stream",
573
- "text": [
574
- "D. To provide for the care of employees\n"
575
- ]
576
- }
577
- ],
578
  "source": [
579
- "res = hi_engine.query(prompt)\n",
580
- "print(res)"
 
581
  ]
582
  },
583
  {
@@ -591,39 +566,34 @@
591
  {
592
  "cell_type": "code",
593
  "execution_count": null,
594
- "id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
595
  "metadata": {},
596
  "outputs": [],
597
  "source": []
598
  },
599
  {
600
  "cell_type": "code",
601
- "execution_count": 14,
602
- "id": "78abaf95-e52d-445c-9d8e-bc51efb20f06",
603
- "metadata": {},
604
- "outputs": [
605
- {
606
- "name": "stderr",
607
- "output_type": "stream",
608
- "text": [
609
- "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
610
- "To disable this warning, you can either:\n",
611
- "\t- Avoid using `tokenizers` before the fork if possible\n",
612
- "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
613
- ]
614
- },
615
- {
616
- "name": "stdout",
617
- "output_type": "stream",
618
- "text": [
619
- "The correct answer is \"Deductibles apply for all treatments\".\n"
620
- ]
621
- }
622
- ],
623
- "source": [
624
- "res = chat_engine.chat(prompt)\n",
625
- "print(res.response)"
626
- ]
627
  },
628
  {
629
  "cell_type": "code",
 
10
  },
11
  {
12
  "cell_type": "code",
13
+ "execution_count": null,
14
  "id": "7de9c591-5a77-4bbe-80f1-4897e15f0b97",
15
  "metadata": {},
16
  "outputs": [],
 
33
  "import nest_asyncio\n",
34
  "nest_asyncio.apply()\n",
35
  "\n",
36
+ "import time\n",
37
+ "import PyPDF2"
38
  ]
39
  },
40
  {
41
  "cell_type": "code",
42
+ "execution_count": null,
43
  "id": "978152ce-4d87-44b5-b521-dbaff60b32b0",
44
  "metadata": {},
45
+ "outputs": [],
 
 
 
 
 
 
 
 
 
46
  "source": [
47
  "split_content(filepath=\"../raw_documents/answers.txt\", \n",
48
  " separator=\"\\n\\n\", \n",
 
55
  },
56
  {
57
  "cell_type": "code",
58
+ "execution_count": null,
59
  "id": "d925371b-8777-4f5b-a7f2-ec3f228ef266",
60
  "metadata": {},
61
  "outputs": [],
 
76
  {
77
  "cell_type": "code",
78
  "execution_count": null,
79
+ "id": "a83b4fd8-5075-4c52-820c-a3ac7ee7f0c8",
80
  "metadata": {},
81
  "outputs": [],
82
  "source": []
83
  },
84
  {
85
  "cell_type": "code",
86
+ "execution_count": null,
87
  "id": "3e65dff6-77b6-4be8-8857-5cecf3a035bb",
88
  "metadata": {},
89
  "outputs": [],
90
  "source": [
91
  "# load some documents\n",
92
+ "if False:\n",
93
+ " documents = SimpleDirectoryReader(input_files=[\n",
94
+ " \"../raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
95
+ " \"../raw_documents/conversation_examples.txt\",\n",
96
+ " \"../raw_documents/HI_Knowledge_Base.pdf\",\n",
97
+ " ] + answers_temp_files + qna_temp_files ).load_data()\n",
98
+ "else:\n",
99
+ " reader_summary = PyPDF2.PdfReader(\"../raw_documents/HI Chapter Summary Version 1.3.pdf\")\n",
100
+ " documents_summary = [ p.extract_text() for p in reader_summary.pages ]\n",
101
+ "\n",
102
+ " reader_base = PyPDF2.PdfReader(\"../raw_documents/HI_Knowledge_Base.pdf\")\n",
103
+ " documents_base = [ p.extract_text() for p in reader_base.pages ]\n",
104
+ " \n",
105
+ " documents_txt = SimpleDirectoryReader(input_files=[\n",
106
+ " \"../raw_documents/conversation_examples.txt\",\n",
107
+ " \"../raw_documents/qna.txt\",\n",
108
+ " \"../raw_documents/answers.txt\"\n",
109
+ " ] ).load_data()\n",
110
+ " documents_txt = [doc.text for doc in documents_txt]\n",
111
+ "\n",
112
+ "document = Document(text=\"\\n\\n\".join(documents_summary + documents_base + documents_txt))"
113
  ]
114
  },
115
  {
116
  "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "e485f801-1829-4b50-b6b2-52803203853b",
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": []
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
  "id": "bd86b3f5-1dfc-4257-bd9c-86d34f02398d",
127
  "metadata": {},
128
  "outputs": [],
129
  "source": [
130
  "# initialize client, setting path to save data\n",
131
+ "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
132
  ]
133
  },
134
  {
135
  "cell_type": "code",
136
+ "execution_count": null,
137
  "id": "f568ce7b-bcbf-455c-acf1-6c2cae129fed",
138
  "metadata": {},
139
  "outputs": [],
 
144
  },
145
  {
146
  "cell_type": "code",
147
+ "execution_count": null,
148
  "id": "ed0b018e-1982-46b2-b1b4-04f5c0ce8672",
149
  "metadata": {},
150
  "outputs": [],
 
163
  },
164
  {
165
  "cell_type": "code",
166
+ "execution_count": null,
167
  "id": "0946b6ce-96ab-44de-ad75-e424a8429f67",
168
  "metadata": {},
169
+ "outputs": [],
 
 
 
 
 
 
 
 
170
  "source": [
171
  "Settings.llm = None\n",
172
  "Settings.chunk_size = 1024\n",
 
176
  },
177
  {
178
  "cell_type": "code",
179
+ "execution_count": null,
180
  "id": "b8c73a2c-1129-406a-8046-085afcaf9cbb",
181
  "metadata": {},
182
  "outputs": [],
183
  "source": [
184
+ "nodes = Settings.node_parser.get_nodes_from_documents([document])"
185
  ]
186
  },
187
  {
188
  "cell_type": "code",
189
+ "execution_count": null,
190
  "id": "75f1c76f-d3e5-4b69-818c-98865adb1457",
191
  "metadata": {},
192
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
193
  "source": [
194
  "len(nodes)"
195
  ]
 
204
  },
205
  {
206
  "cell_type": "code",
207
+ "execution_count": null,
208
  "id": "dab4c6f3-ef67-4d90-b3d5-e290c5d1b6f4",
209
  "metadata": {},
210
  "outputs": [],
 
214
  },
215
  {
216
  "cell_type": "code",
217
+ "execution_count": null,
218
  "id": "6a764113-ad7e-4674-aa57-ebbf405902a8",
219
  "metadata": {},
220
  "outputs": [],
 
232
  },
233
  {
234
  "cell_type": "code",
235
+ "execution_count": null,
236
  "id": "e492ed4a-23a3-47d6-8b50-51fb48b3aa05",
237
  "metadata": {},
238
  "outputs": [],
 
242
  },
243
  {
244
  "cell_type": "code",
245
+ "execution_count": null,
246
  "id": "cbd11b89-9b83-4f08-bb30-160f750f2ffb",
247
  "metadata": {},
248
  "outputs": [],
 
252
  },
253
  {
254
  "cell_type": "code",
255
+ "execution_count": null,
256
  "id": "d3bd848d-9985-4a3d-bdc4-ec340cc69ef3",
257
  "metadata": {},
258
+ "outputs": [],
 
 
 
 
 
 
 
 
259
  "source": [
260
  "indexing_cost = time.time() - start_time\n",
261
  "indexing_cost = indexing_cost / 60\n",
 
264
  },
265
  {
266
  "cell_type": "code",
267
+ "execution_count": null,
268
  "id": "f16cca33-71fb-437d-a033-671b9fd44054",
269
  "metadata": {},
270
  "outputs": [],
 
274
  },
275
  {
276
  "cell_type": "code",
277
+ "execution_count": null,
278
  "id": "3290e870-41d7-49c4-9c4f-cb16bd1f469e",
279
  "metadata": {
280
  "scrolled": true
281
  },
282
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
283
  "source": [
284
  "response = vector_query_engine.query(\"Healthcare System in Singapore consists of?\")\n",
285
  "response"
286
  ]
287
  },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": null,
291
+ "id": "d83e2938-61fa-4d02-920d-0ae88a437abc",
292
+ "metadata": {},
293
+ "outputs": [],
294
+ "source": [
295
+ "response = vector_query_engine.query(\"what is integrated shield plan\")\n",
296
+ "response"
297
+ ]
298
+ },
299
  {
300
  "cell_type": "code",
301
  "execution_count": null,
 
306
  },
307
  {
308
  "cell_type": "code",
309
+ "execution_count": null,
310
  "id": "1bb75b04-6a62-43a4-8728-d2e52e49f1c0",
311
  "metadata": {},
312
  "outputs": [],
 
317
  },
318
  {
319
  "cell_type": "code",
320
+ "execution_count": null,
321
  "id": "0ed920fb-6456-49ac-8b63-08bd86b5b39c",
322
  "metadata": {},
323
  "outputs": [],
 
352
  },
353
  {
354
  "cell_type": "code",
355
+ "execution_count": null,
356
  "id": "c1a42c35-5f57-423c-8fb7-7d18b3b466b5",
357
  "metadata": {},
358
  "outputs": [],
 
369
  "from llama_index.llms.openai import OpenAI\n",
370
  "from llama_index.core.memory import ChatMemoryBuffer\n",
371
  "\n",
372
+ "import time\n",
373
+ "\n",
374
+ "from prompt_engineering import (\n",
375
+ " system_content, \n",
376
+ " textbook_content, \n",
377
+ " winnie_the_pooh_prompt, \n",
378
+ " introduction_line\n",
379
+ ")"
380
  ]
381
  },
382
  {
 
389
  },
390
  {
391
  "cell_type": "code",
392
+ "execution_count": null,
393
  "id": "d38dc953-b923-4128-86a1-c8c6f69af0ed",
394
  "metadata": {},
395
  "outputs": [],
 
399
  },
400
  {
401
  "cell_type": "code",
402
+ "execution_count": null,
403
  "id": "4c83c613-2cfc-4871-9d07-c82f77a3bd5e",
404
  "metadata": {},
405
  "outputs": [],
 
409
  },
410
  {
411
  "cell_type": "code",
412
+ "execution_count": null,
413
  "id": "0583e9b0-d977-488c-8331-46dfa749924c",
414
  "metadata": {},
415
  "outputs": [],
 
428
  },
429
  {
430
  "cell_type": "code",
431
+ "execution_count": null,
432
  "id": "2159a2b6-494b-41b9-ac54-dd342bfb74ba",
433
  "metadata": {},
434
  "outputs": [],
435
  "source": [
436
+ "db = chromadb.PersistentClient(path=\"../models/chroma_db_advanced_corrected\")"
437
  ]
438
  },
439
  {
440
  "cell_type": "code",
441
+ "execution_count": null,
442
  "id": "1b385644-b46e-4d13-88fa-9f4af39db405",
443
  "metadata": {},
444
  "outputs": [],
 
448
  },
449
  {
450
  "cell_type": "code",
451
+ "execution_count": null,
452
  "id": "93cb53d1-6b8c-4b2d-a839-53501c0d54b2",
453
  "metadata": {},
454
  "outputs": [],
 
460
  },
461
  {
462
  "cell_type": "code",
463
+ "execution_count": null,
464
  "id": "c40d59e1-6d42-41f0-8c9b-70aa026093ae",
465
  "metadata": {},
466
  "outputs": [],
 
482
  },
483
  {
484
  "cell_type": "code",
485
+ "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  "id": "3f592848-8536-4b4d-b34a-adc32d043432",
487
  "metadata": {},
488
  "outputs": [],
 
492
  },
493
  {
494
  "cell_type": "code",
495
+ "execution_count": null,
496
  "id": "6c7df81a-fd2f-42bf-b09c-46d7750f7252",
497
  "metadata": {},
498
  "outputs": [],
 
506
  },
507
  {
508
  "cell_type": "code",
509
+ "execution_count": null,
510
  "id": "c3106dff-dd6f-47a9-9454-1e61775e7539",
511
  "metadata": {},
512
  "outputs": [],
 
514
  "hi_engine = index.as_query_engine(\n",
515
  " memory=memory,\n",
516
  " system_prompt=system_content,\n",
517
+ " similarity_top_k=20,\n",
518
  " streaming=True\n",
519
  ")"
520
  ]
 
529
  },
530
  {
531
  "cell_type": "code",
532
+ "execution_count": null,
533
  "id": "434f0caf-8b1f-40c6-b9ec-b039cd1ca612",
534
  "metadata": {},
535
  "outputs": [],
 
545
  },
546
  {
547
  "cell_type": "code",
548
+ "execution_count": null,
549
  "id": "a1c83dff-50d1-47b1-b7e9-4fc5cd08e1e8",
550
  "metadata": {},
551
+ "outputs": [],
 
 
 
 
 
 
 
 
552
  "source": [
553
+ "response = hi_engine.query(prompt)\n",
554
+ "for res in response.response_gen:\n",
555
+ " print(res, end=\"\")"
556
  ]
557
  },
558
  {
 
566
  {
567
  "cell_type": "code",
568
  "execution_count": null,
569
+ "id": "91821a22-c1c4-46a6-90f0-c00651afb0f6",
570
+ "metadata": {},
571
+ "outputs": [],
572
+ "source": [
573
+ "# query_string = \"tell me more about integrated shield plans\"\n",
574
+ "# query_string = \"how to use CPF\"\n",
575
+ "query_string = \"what is MediSave\"\n",
576
+ "\n",
577
+ "response = hi_engine.query(query_string)\n",
578
+ "for res in response.response_gen:\n",
579
+ " print(res, end=\"\")"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": null,
585
+ "id": "07969feb-2667-4d7d-a769-953082138988",
586
  "metadata": {},
587
  "outputs": [],
588
  "source": []
589
  },
590
  {
591
  "cell_type": "code",
592
+ "execution_count": null,
593
+ "id": "ec53dfcf-d4c0-4d10-a24e-be2004a83656",
594
+ "metadata": {},
595
+ "outputs": [],
596
+ "source": []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
  },
598
  {
599
  "cell_type": "code",
notebooks/007_test_hi_content_engine.ipynb CHANGED
@@ -34,6 +34,12 @@
34
  "\n",
35
  "from vision_api import get_transcribed_text\n",
36
  "from qna_prompting import get_qna_question_tool, evaluate_qna_answer_tool\n",
 
 
 
 
 
 
37
  "\n",
38
  "import nest_asyncio\n",
39
  "nest_asyncio.apply()"
@@ -106,18 +112,20 @@
106
  "\n",
107
  " index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
108
  " \n",
109
- " memory = ChatMemoryBuffer.from_defaults(token_limit=15000)\n",
110
  " hi_content_engine = index.as_query_engine(\n",
111
  " memory=memory,\n",
112
  " system_prompt=system_content,\n",
113
- " similarity_top_k=20,\n",
 
114
  " streaming=True\n",
115
  " )\n",
116
  " hi_textbook_query_description = \"\"\"\n",
117
- " Use this tool to extract content from textbook `Health Insurance 7th Edition`,\n",
 
118
  " that has 15 chapters in total. When user wants to learn more about a \n",
119
  " particular chapter, this tool will help to assist user to get better\n",
120
- " understanding of the content of the textbook.\n",
121
  " \"\"\"\n",
122
  " \n",
123
  " hi_query_tool = QueryEngineTool.from_defaults(\n",
@@ -195,32 +203,10 @@
195
  "input_files = [\"./raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
196
  " \"./raw_documents/qna.txt\"]\n",
197
  "embedding_model = \"BAAI/bge-small-en-v1.5\"\n",
198
- "persisted_vector_db = \"../models/chroma_db\"\n",
199
- "fine_tuned_path = \"local:../models/fine-tuned-embeddings\"\n",
200
- "system_content = (\n",
201
- " \"You are a helpful study assistant. \"\n",
202
- " \"You do not respond as 'User' or pretend to be 'User'. \"\n",
203
- " \"You only respond once as 'Assistant'.\"\n",
204
- ")\n",
205
- "textbook_content = (\n",
206
- " \"The content of the textbook `Health Insurance 7th Edition` are as follows,\"\n",
207
- " \"- Chapter 1: Overview Of Healthcare Environment In Singapore\"\n",
208
- " \"- Chapter 2: Medical Expense Insurance\"\n",
209
- " \"- Chapter 3: Group Medical Expense Insurance\"\n",
210
- " \"- Chapter 4: Disability Income Insurance\"\n",
211
- " \"- Chapter 5: Long-Term Care Insurance \"\n",
212
- " \"- Chapter 6: Critical Illness Insurance\"\n",
213
- " \"- Chapter 7: Other Types Of Health Insurance\"\n",
214
- " \"- Chapter 8: Managed Healthcare\"\n",
215
- " \"- Chapter 9: Part I Healthcare Financing\"\n",
216
- " \"- Chapter 9: Part II Healthcare Financing\"\n",
217
- " \"- Chapter 10: Common Policy Provisions\"\n",
218
- " \"- Chapter 11: Health Insurance Pricing\"\n",
219
- " \"- Chapter 12: Health Insurance Underwriting\"\n",
220
- " \"- Chapter 13: Notice No: MAS 120 Disclosure And Advisory Process - Requirements For Accident And Health Insurance Products\"\n",
221
- " \"- Chapter 14: Financial Needs Analysis\"\n",
222
- " \"- Chapter 15: Case Studies\"\n",
223
- ")"
224
  ]
225
  },
226
  {
@@ -292,6 +278,14 @@
292
  ")"
293
  ]
294
  },
 
 
 
 
 
 
 
 
295
  {
296
  "cell_type": "code",
297
  "execution_count": null,
@@ -338,36 +332,39 @@
338
  {
339
  "cell_type": "code",
340
  "execution_count": null,
341
- "id": "c62e817e-c7c8-4f90-9e32-217fec376565",
342
  "metadata": {},
343
  "outputs": [],
344
  "source": [
345
- "response = hi_content_engine.query(\"can you give me the list of chapters that `Health Insurance 7th Edition` covers\")"
 
 
346
  ]
347
  },
348
  {
349
  "cell_type": "code",
350
  "execution_count": null,
351
- "id": "5902ffd2-2f66-4b89-bf7f-a05e3fdeccaa",
352
  "metadata": {},
353
  "outputs": [],
354
- "source": [
355
- "for res in response.response_gen:\n",
356
- " print(res, end=\"\")"
357
- ]
358
  },
359
  {
360
  "cell_type": "code",
361
  "execution_count": null,
362
- "id": "0e75453b-85c7-4e1c-8683-6df45a13cacb",
363
  "metadata": {},
364
  "outputs": [],
365
- "source": []
 
 
 
 
366
  },
367
  {
368
  "cell_type": "code",
369
  "execution_count": null,
370
- "id": "0b97d90d-5c59-486f-863b-4aaa12ed0ea0",
371
  "metadata": {},
372
  "outputs": [],
373
  "source": []
@@ -379,7 +376,9 @@
379
  "metadata": {},
380
  "outputs": [],
381
  "source": [
382
- "response = agent.stream_chat(\"hihi\", tool_choice=\"auto\")"
 
 
383
  ]
384
  },
385
  {
@@ -388,10 +387,7 @@
388
  "id": "eff8bb8d-a2d1-428a-9c3d-193389378288",
389
  "metadata": {},
390
  "outputs": [],
391
- "source": [
392
- "for res in response.response_gen:\n",
393
- " print(res, end=\"\")"
394
- ]
395
  },
396
  {
397
  "cell_type": "code",
 
34
  "\n",
35
  "from vision_api import get_transcribed_text\n",
36
  "from qna_prompting import get_qna_question_tool, evaluate_qna_answer_tool\n",
37
+ "from prompt_engineering import (\n",
38
+ " system_content, \n",
39
+ " textbook_content, \n",
40
+ " winnie_the_pooh_prompt, \n",
41
+ " introduction_line\n",
42
+ ")\n",
43
  "\n",
44
  "import nest_asyncio\n",
45
  "nest_asyncio.apply()"
 
112
  "\n",
113
  " index = VectorStoreIndex(nodes, storage_context=storage_context)\n",
114
  " \n",
115
+ " memory = ChatMemoryBuffer.from_defaults(token_limit=100_000)\n",
116
  " hi_content_engine = index.as_query_engine(\n",
117
  " memory=memory,\n",
118
  " system_prompt=system_content,\n",
119
+ " similarity_top_k=10,\n",
120
+ " verbose=True,\n",
121
  " streaming=True\n",
122
  " )\n",
123
  " hi_textbook_query_description = \"\"\"\n",
124
+ " Use this tool to extract content from the query engine,\n",
125
+ " which is built by ingesting textbook content from `Health Insurance 7th Edition`,\n",
126
  " that has 15 chapters in total. When user wants to learn more about a \n",
127
  " particular chapter, this tool will help to assist user to get better\n",
128
+ " understanding of the content of the textbook. \n",
129
  " \"\"\"\n",
130
  " \n",
131
  " hi_query_tool = QueryEngineTool.from_defaults(\n",
 
203
  "input_files = [\"./raw_documents/HI Chapter Summary Version 1.3.pdf\",\n",
204
  " \"./raw_documents/qna.txt\"]\n",
205
  "embedding_model = \"BAAI/bge-small-en-v1.5\"\n",
206
+ "persisted_vector_db = \"../models/chroma_db_advanced_corrected\"\n",
207
+ "\n",
208
+ "# fine_tuned_path = \"local:../models/fine-tuned-embeddings\"\n",
209
+ "fine_tuned_path = \"local:../models/fine-tuned-embeddings-advanced\""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  ]
211
  },
212
  {
 
278
  ")"
279
  ]
280
  },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": null,
284
+ "id": "a49ed30e-a631-4618-a79e-adab02114d8d",
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": []
288
+ },
289
  {
290
  "cell_type": "code",
291
  "execution_count": null,
 
332
  {
333
  "cell_type": "code",
334
  "execution_count": null,
335
+ "id": "66c8881d-fc57-4e95-ad86-110c4818e2fe",
336
  "metadata": {},
337
  "outputs": [],
338
  "source": [
339
+ "# query_string = \"tell me more about integrated shield plans\"\n",
340
+ "query_string = \"how to use CPF\"\n",
341
+ "# query_string = \"what is MediSave\""
342
  ]
343
  },
344
  {
345
  "cell_type": "code",
346
  "execution_count": null,
347
+ "id": "9cbd338b-bee5-4c06-9934-a0e27fd518d3",
348
  "metadata": {},
349
  "outputs": [],
350
+ "source": []
 
 
 
351
  },
352
  {
353
  "cell_type": "code",
354
  "execution_count": null,
355
+ "id": "5902ffd2-2f66-4b89-bf7f-a05e3fdeccaa",
356
  "metadata": {},
357
  "outputs": [],
358
+ "source": [
359
+ "response = hi_content_engine.query(query_string)\n",
360
+ "for res in response.response_gen:\n",
361
+ " print(res, end=\"\")"
362
+ ]
363
  },
364
  {
365
  "cell_type": "code",
366
  "execution_count": null,
367
+ "id": "0e75453b-85c7-4e1c-8683-6df45a13cacb",
368
  "metadata": {},
369
  "outputs": [],
370
  "source": []
 
376
  "metadata": {},
377
  "outputs": [],
378
  "source": [
379
+ "response = agent.stream_chat(query_string, tool_choice=\"auto\")\n",
380
+ "for res in response.response_gen:\n",
381
+ " print(res, end=\"\")"
382
  ]
383
  },
384
  {
 
387
  "id": "eff8bb8d-a2d1-428a-9c3d-193389378288",
388
  "metadata": {},
389
  "outputs": [],
390
+ "source": []
 
 
 
391
  },
392
  {
393
  "cell_type": "code",
requirements.txt CHANGED
@@ -185,6 +185,7 @@ PyMuPDF==1.23.22
185
  PyMuPDFb==1.23.22
186
  pyparsing==3.1.1
187
  pypdf==4.0.1
 
188
  PyPika==0.48.9
189
  pyproject_hooks==1.0.0
190
  python-dateutil==2.8.2
 
185
  PyMuPDFb==1.23.22
186
  pyparsing==3.1.1
187
  pypdf==4.0.1
188
+ PyPDF2==3.0.1
189
  PyPika==0.48.9
190
  pyproject_hooks==1.0.0
191
  python-dateutil==2.8.2