File size: 27,284 Bytes
03b400c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders.base import Document\n",
    "from langchain.indexes import VectorstoreIndexCreator\n",
    "from langchain.utilities import ApifyWrapper"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[4], line 3\u001b[0m\n\u001b[0;32m      1\u001b[0m apify \u001b[39m=\u001b[39m ApifyWrapper()\n\u001b[1;32m----> 3\u001b[0m loader \u001b[39m=\u001b[39m apify\u001b[39m.\u001b[39;49mcall_actor(\n\u001b[0;32m      4\u001b[0m     actor_id\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mapify/website-content-crawler\u001b[39;49m\u001b[39m\"\u001b[39;49m,\n\u001b[0;32m      5\u001b[0m     run_input\u001b[39m=\u001b[39;49m{\u001b[39m\"\u001b[39;49m\u001b[39mstartUrls\u001b[39;49m\u001b[39m\"\u001b[39;49m: [{\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m: \u001b[39m\"\u001b[39;49m\u001b[39mhttps://python.langchain.com/en/latest/\u001b[39;49m\u001b[39m\"\u001b[39;49m}]},\n\u001b[0;32m      6\u001b[0m     dataset_mapping_function\u001b[39m=\u001b[39;49m\u001b[39mlambda\u001b[39;49;00m item: Document(\n\u001b[0;32m      7\u001b[0m         page_content\u001b[39m=\u001b[39;49mitem[\u001b[39m\"\u001b[39;49m\u001b[39mtext\u001b[39;49m\u001b[39m\"\u001b[39;49m] \u001b[39mor\u001b[39;49;00m \u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m, metadata\u001b[39m=\u001b[39;49m{\u001b[39m\"\u001b[39;49m\u001b[39msource\u001b[39;49m\u001b[39m\"\u001b[39;49m: item[\u001b[39m\"\u001b[39;49m\u001b[39murl\u001b[39;49m\u001b[39m\"\u001b[39;49m]}\n\u001b[0;32m      8\u001b[0m     ),\n\u001b[0;32m      9\u001b[0m )\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\langchain\\utilities\\apify.py:73\u001b[0m, in \u001b[0;36mApifyWrapper.call_actor\u001b[1;34m(self, actor_id, run_input, dataset_mapping_function, build, memory_mbytes, timeout_secs)\u001b[0m\n\u001b[0;32m     45\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mcall_actor\u001b[39m(\n\u001b[0;32m     46\u001b[0m     \u001b[39mself\u001b[39m,\n\u001b[0;32m     47\u001b[0m     actor_id: \u001b[39mstr\u001b[39m,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     53\u001b[0m     timeout_secs: Optional[\u001b[39mint\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m,\n\u001b[0;32m     54\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m ApifyDatasetLoader:\n\u001b[0;32m     55\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Run an Actor on the Apify platform and wait for results to be ready.\u001b[39;00m\n\u001b[0;32m     56\u001b[0m \n\u001b[0;32m     57\u001b[0m \u001b[39m    Args:\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     71\u001b[0m \u001b[39m            Actor run's default dataset.\u001b[39;00m\n\u001b[0;32m     72\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m---> 73\u001b[0m     actor_call \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mapify_client\u001b[39m.\u001b[39;49mactor(actor_id)\u001b[39m.\u001b[39;49mcall(\n\u001b[0;32m     74\u001b[0m         run_input\u001b[39m=\u001b[39;49mrun_input,\n\u001b[0;32m     75\u001b[0m         build\u001b[39m=\u001b[39;49mbuild,\n\u001b[0;32m     76\u001b[0m         memory_mbytes\u001b[39m=\u001b[39;49mmemory_mbytes,\n\u001b[0;32m     77\u001b[0m         timeout_secs\u001b[39m=\u001b[39;49mtimeout_secs,\n\u001b[0;32m     78\u001b[0m     )\n\u001b[0;32m     80\u001b[0m     \u001b[39mreturn\u001b[39;00m ApifyDatasetLoader(\n\u001b[0;32m     81\u001b[0m         dataset_id\u001b[39m=\u001b[39mactor_call[\u001b[39m\"\u001b[39m\u001b[39mdefaultDatasetId\u001b[39m\u001b[39m\"\u001b[39m],\n\u001b[0;32m     82\u001b[0m         dataset_mapping_function\u001b[39m=\u001b[39mdataset_mapping_function,\n\u001b[0;32m     83\u001b[0m     )\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\_logging.py:68\u001b[0m, in \u001b[0;36m_injects_client_details_to_log_context.<locals>.wrapper\u001b[1;34m(resource_client, *args, **kwargs)\u001b[0m\n\u001b[0;32m     65\u001b[0m ctx_resource_id\u001b[39m.\u001b[39mset(resource_client\u001b[39m.\u001b[39mresource_id)\n\u001b[0;32m     66\u001b[0m ctx_url\u001b[39m.\u001b[39mset(resource_client\u001b[39m.\u001b[39murl)\n\u001b[1;32m---> 68\u001b[0m \u001b[39mreturn\u001b[39;00m fun(resource_client, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\clients\\resource_clients\\actor.py:258\u001b[0m, in \u001b[0;36mActorClient.call\u001b[1;34m(self, run_input, content_type, build, memory_mbytes, timeout_secs, webhooks, wait_secs)\u001b[0m\n\u001b[0;32m    226\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Start the actor and wait for it to finish before returning the Run object.\u001b[39;00m\n\u001b[0;32m    227\u001b[0m \n\u001b[0;32m    228\u001b[0m \u001b[39mIt waits indefinitely, unless the wait_secs argument is provided.\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    247\u001b[0m \u001b[39m    dict: The run object\u001b[39;00m\n\u001b[0;32m    248\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    249\u001b[0m started_run \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstart(\n\u001b[0;32m    250\u001b[0m     run_input\u001b[39m=\u001b[39mrun_input,\n\u001b[0;32m    251\u001b[0m     content_type\u001b[39m=\u001b[39mcontent_type,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    255\u001b[0m     webhooks\u001b[39m=\u001b[39mwebhooks,\n\u001b[0;32m    256\u001b[0m )\n\u001b[1;32m--> 258\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mroot_client\u001b[39m.\u001b[39;49mrun(started_run[\u001b[39m'\u001b[39;49m\u001b[39mid\u001b[39;49m\u001b[39m'\u001b[39;49m])\u001b[39m.\u001b[39;49mwait_for_finish(wait_secs\u001b[39m=\u001b[39;49mwait_secs)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\_logging.py:68\u001b[0m, in \u001b[0;36m_injects_client_details_to_log_context.<locals>.wrapper\u001b[1;34m(resource_client, *args, **kwargs)\u001b[0m\n\u001b[0;32m     65\u001b[0m ctx_resource_id\u001b[39m.\u001b[39mset(resource_client\u001b[39m.\u001b[39mresource_id)\n\u001b[0;32m     66\u001b[0m ctx_url\u001b[39m.\u001b[39mset(resource_client\u001b[39m.\u001b[39murl)\n\u001b[1;32m---> 68\u001b[0m \u001b[39mreturn\u001b[39;00m fun(resource_client, \u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\clients\\resource_clients\\run.py:81\u001b[0m, in \u001b[0;36mRunClient.wait_for_finish\u001b[1;34m(self, wait_secs)\u001b[0m\n\u001b[0;32m     71\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mwait_for_finish\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m*\u001b[39m, wait_secs: Optional[\u001b[39mint\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Optional[Dict]:\n\u001b[0;32m     72\u001b[0m \u001b[39m    \u001b[39m\u001b[39m\"\"\"Wait synchronously until the run finishes or the server times out.\u001b[39;00m\n\u001b[0;32m     73\u001b[0m \n\u001b[0;32m     74\u001b[0m \u001b[39m    Args:\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m     79\u001b[0m \u001b[39m            (SUCEEDED, FAILED, TIMED_OUT, ABORTED), then the run has not yet finished.\u001b[39;00m\n\u001b[0;32m     80\u001b[0m \u001b[39m    \"\"\"\u001b[39;00m\n\u001b[1;32m---> 81\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_wait_for_finish(wait_secs\u001b[39m=\u001b[39;49mwait_secs)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\clients\\base\\actor_job_base_client.py:34\u001b[0m, in \u001b[0;36mActorJobBaseClient._wait_for_finish\u001b[1;34m(self, wait_secs)\u001b[0m\n\u001b[0;32m     31\u001b[0m     wait_for_finish \u001b[39m=\u001b[39m wait_secs \u001b[39m-\u001b[39m seconds_elapsed\n\u001b[0;32m     33\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m---> 34\u001b[0m     response \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mhttp_client\u001b[39m.\u001b[39;49mcall(\n\u001b[0;32m     35\u001b[0m         url\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_url(),\n\u001b[0;32m     36\u001b[0m         method\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mGET\u001b[39;49m\u001b[39m'\u001b[39;49m,\n\u001b[0;32m     37\u001b[0m         params\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_params(waitForFinish\u001b[39m=\u001b[39;49mwait_for_finish),\n\u001b[0;32m     38\u001b[0m     )\n\u001b[0;32m     39\u001b[0m     job \u001b[39m=\u001b[39m _parse_date_fields(_pluck_data(response\u001b[39m.\u001b[39mjson()))\n\u001b[0;32m     41\u001b[0m     seconds_elapsed \u001b[39m=\u001b[39m math\u001b[39m.\u001b[39mfloor(((datetime\u001b[39m.\u001b[39mnow(timezone\u001b[39m.\u001b[39mutc) \u001b[39m-\u001b[39m started_at)\u001b[39m.\u001b[39mtotal_seconds()))\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\_http_client.py:190\u001b[0m, in \u001b[0;36m_HTTPClient.call\u001b[1;34m(self, method, url, headers, params, data, json, stream, parse_response)\u001b[0m\n\u001b[0;32m    187\u001b[0m         stop_retrying()\n\u001b[0;32m    188\u001b[0m     \u001b[39mraise\u001b[39;00m ApifyApiError(response, attempt)\n\u001b[1;32m--> 190\u001b[0m \u001b[39mreturn\u001b[39;00m _retry_with_exp_backoff(\n\u001b[0;32m    191\u001b[0m     _make_request,\n\u001b[0;32m    192\u001b[0m     max_retries\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmax_retries,\n\u001b[0;32m    193\u001b[0m     backoff_base_millis\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mmin_delay_between_retries_millis,\n\u001b[0;32m    194\u001b[0m     backoff_factor\u001b[39m=\u001b[39;49mDEFAULT_BACKOFF_EXPONENTIAL_FACTOR,\n\u001b[0;32m    195\u001b[0m     random_factor\u001b[39m=\u001b[39;49mDEFAULT_BACKOFF_RANDOM_FACTOR,\n\u001b[0;32m    196\u001b[0m )\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\_utils.py:114\u001b[0m, in \u001b[0;36m_retry_with_exp_backoff\u001b[1;34m(func, max_retries, backoff_base_millis, backoff_factor, random_factor)\u001b[0m\n\u001b[0;32m    112\u001b[0m \u001b[39mfor\u001b[39;00m attempt \u001b[39min\u001b[39;00m \u001b[39mrange\u001b[39m(\u001b[39m1\u001b[39m, max_retries \u001b[39m+\u001b[39m \u001b[39m1\u001b[39m):\n\u001b[0;32m    113\u001b[0m     \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m--> 114\u001b[0m         \u001b[39mreturn\u001b[39;00m func(stop_retrying, attempt)\n\u001b[0;32m    115\u001b[0m     \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m \u001b[39mas\u001b[39;00m e:\n\u001b[0;32m    116\u001b[0m         \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m swallow:\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\apify_client\\_http_client.py:158\u001b[0m, in \u001b[0;36m_HTTPClient.call.<locals>._make_request\u001b[1;34m(stop_retrying, attempt)\u001b[0m\n\u001b[0;32m    150\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m    151\u001b[0m     request \u001b[39m=\u001b[39m httpx_client\u001b[39m.\u001b[39mbuild_request(\n\u001b[0;32m    152\u001b[0m         method\u001b[39m=\u001b[39mmethod,\n\u001b[0;32m    153\u001b[0m         url\u001b[39m=\u001b[39murl,\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    156\u001b[0m         content\u001b[39m=\u001b[39mcontent,\n\u001b[0;32m    157\u001b[0m     )\n\u001b[1;32m--> 158\u001b[0m     response \u001b[39m=\u001b[39m httpx_client\u001b[39m.\u001b[39;49msend(\n\u001b[0;32m    159\u001b[0m         request\u001b[39m=\u001b[39;49mrequest,\n\u001b[0;32m    160\u001b[0m         stream\u001b[39m=\u001b[39;49mstream \u001b[39mor\u001b[39;49;00m \u001b[39mFalse\u001b[39;49;00m,\n\u001b[0;32m    161\u001b[0m     )\n\u001b[0;32m    163\u001b[0m     \u001b[39m# If response status is < 300, the request was successful, and we can return the result\u001b[39;00m\n\u001b[0;32m    164\u001b[0m     \u001b[39mif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39m<\u001b[39m \u001b[39m300\u001b[39m:\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_client.py:922\u001b[0m, in \u001b[0;36mClient.send\u001b[1;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[0;32m    920\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m    921\u001b[0m     response\u001b[39m.\u001b[39mclose()\n\u001b[1;32m--> 922\u001b[0m     \u001b[39mraise\u001b[39;00m exc\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_client.py:916\u001b[0m, in \u001b[0;36mClient.send\u001b[1;34m(self, request, stream, auth, follow_redirects)\u001b[0m\n\u001b[0;32m    914\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m    915\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m stream:\n\u001b[1;32m--> 916\u001b[0m         response\u001b[39m.\u001b[39;49mread()\n\u001b[0;32m    918\u001b[0m     \u001b[39mreturn\u001b[39;00m response\n\u001b[0;32m    920\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_models.py:805\u001b[0m, in \u001b[0;36mResponse.read\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    801\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    802\u001b[0m \u001b[39mRead and return the response content.\u001b[39;00m\n\u001b[0;32m    803\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[0;32m    804\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mhasattr\u001b[39m(\u001b[39mself\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39m_content\u001b[39m\u001b[39m\"\u001b[39m):\n\u001b[1;32m--> 805\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content \u001b[39m=\u001b[39m \u001b[39mb\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39m.\u001b[39;49mjoin(\u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49miter_bytes())\n\u001b[0;32m    806\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_content\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_models.py:823\u001b[0m, in \u001b[0;36mResponse.iter_bytes\u001b[1;34m(self, chunk_size)\u001b[0m\n\u001b[0;32m    821\u001b[0m chunker \u001b[39m=\u001b[39m ByteChunker(chunk_size\u001b[39m=\u001b[39mchunk_size)\n\u001b[0;32m    822\u001b[0m \u001b[39mwith\u001b[39;00m request_context(request\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_request):\n\u001b[1;32m--> 823\u001b[0m     \u001b[39mfor\u001b[39;00m raw_bytes \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39miter_raw():\n\u001b[0;32m    824\u001b[0m         decoded \u001b[39m=\u001b[39m decoder\u001b[39m.\u001b[39mdecode(raw_bytes)\n\u001b[0;32m    825\u001b[0m         \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m chunker\u001b[39m.\u001b[39mdecode(decoded):\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_models.py:881\u001b[0m, in \u001b[0;36mResponse.iter_raw\u001b[1;34m(self, chunk_size)\u001b[0m\n\u001b[0;32m    878\u001b[0m chunker \u001b[39m=\u001b[39m ByteChunker(chunk_size\u001b[39m=\u001b[39mchunk_size)\n\u001b[0;32m    880\u001b[0m \u001b[39mwith\u001b[39;00m request_context(request\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_request):\n\u001b[1;32m--> 881\u001b[0m     \u001b[39mfor\u001b[39;00m raw_stream_bytes \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstream:\n\u001b[0;32m    882\u001b[0m         \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_num_bytes_downloaded \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mlen\u001b[39m(raw_stream_bytes)\n\u001b[0;32m    883\u001b[0m         \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m chunker\u001b[39m.\u001b[39mdecode(raw_stream_bytes):\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_client.py:124\u001b[0m, in \u001b[0;36mBoundSyncStream.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    123\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__iter__\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m typing\u001b[39m.\u001b[39mIterator[\u001b[39mbytes\u001b[39m]:\n\u001b[1;32m--> 124\u001b[0m     \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stream:\n\u001b[0;32m    125\u001b[0m         \u001b[39myield\u001b[39;00m chunk\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpx\\_transports\\default.py:104\u001b[0m, in \u001b[0;36mResponseStream.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    102\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__iter__\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m typing\u001b[39m.\u001b[39mIterator[\u001b[39mbytes\u001b[39m]:\n\u001b[0;32m    103\u001b[0m     \u001b[39mwith\u001b[39;00m map_httpcore_exceptions():\n\u001b[1;32m--> 104\u001b[0m         \u001b[39mfor\u001b[39;00m part \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_httpcore_stream:\n\u001b[0;32m    105\u001b[0m             \u001b[39myield\u001b[39;00m part\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\_sync\\connection_pool.py:338\u001b[0m, in \u001b[0;36mConnectionPoolByteStream.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    337\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__iter__\u001b[39m(\u001b[39mself\u001b[39m) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m Iterator[\u001b[39mbytes\u001b[39m]:\n\u001b[1;32m--> 338\u001b[0m     \u001b[39mfor\u001b[39;00m part \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_stream:\n\u001b[0;32m    339\u001b[0m         \u001b[39myield\u001b[39;00m part\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\_sync\\http11.py:315\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    310\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m    311\u001b[0m     \u001b[39m# If we get an exception while streaming the response,\u001b[39;00m\n\u001b[0;32m    312\u001b[0m     \u001b[39m# we want to close the response (and possibly the connection)\u001b[39;00m\n\u001b[0;32m    313\u001b[0m     \u001b[39m# before raising that exception.\u001b[39;00m\n\u001b[0;32m    314\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mclose()\n\u001b[1;32m--> 315\u001b[0m     \u001b[39mraise\u001b[39;00m exc\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\_sync\\http11.py:308\u001b[0m, in \u001b[0;36mHTTP11ConnectionByteStream.__iter__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    306\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m    307\u001b[0m     \u001b[39mwith\u001b[39;00m Trace(\u001b[39m\"\u001b[39m\u001b[39mhttp11.receive_response_body\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_request, kwargs):\n\u001b[1;32m--> 308\u001b[0m         \u001b[39mfor\u001b[39;00m chunk \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_connection\u001b[39m.\u001b[39m_receive_response_body(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[0;32m    309\u001b[0m             \u001b[39myield\u001b[39;00m chunk\n\u001b[0;32m    310\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mBaseException\u001b[39;00m \u001b[39mas\u001b[39;00m exc:\n\u001b[0;32m    311\u001b[0m     \u001b[39m# If we get an exception while streaming the response,\u001b[39;00m\n\u001b[0;32m    312\u001b[0m     \u001b[39m# we want to close the response (and possibly the connection)\u001b[39;00m\n\u001b[0;32m    313\u001b[0m     \u001b[39m# before raising that exception.\u001b[39;00m\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\_sync\\http11.py:177\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_response_body\u001b[1;34m(self, request)\u001b[0m\n\u001b[0;32m    174\u001b[0m timeout \u001b[39m=\u001b[39m timeouts\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mread\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m)\n\u001b[0;32m    176\u001b[0m \u001b[39mwhile\u001b[39;00m \u001b[39mTrue\u001b[39;00m:\n\u001b[1;32m--> 177\u001b[0m     event \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_receive_event(timeout\u001b[39m=\u001b[39;49mtimeout)\n\u001b[0;32m    178\u001b[0m     \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(event, h11\u001b[39m.\u001b[39mData):\n\u001b[0;32m    179\u001b[0m         \u001b[39myield\u001b[39;00m \u001b[39mbytes\u001b[39m(event\u001b[39m.\u001b[39mdata)\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\_sync\\http11.py:191\u001b[0m, in \u001b[0;36mHTTP11Connection._receive_event\u001b[1;34m(self, timeout)\u001b[0m\n\u001b[0;32m    188\u001b[0m     event \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_h11_state\u001b[39m.\u001b[39mnext_event()\n\u001b[0;32m    190\u001b[0m \u001b[39mif\u001b[39;00m event \u001b[39mis\u001b[39;00m h11\u001b[39m.\u001b[39mNEED_DATA:\n\u001b[1;32m--> 191\u001b[0m     data \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_network_stream\u001b[39m.\u001b[39;49mread(\n\u001b[0;32m    192\u001b[0m         \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mREAD_NUM_BYTES, timeout\u001b[39m=\u001b[39;49mtimeout\n\u001b[0;32m    193\u001b[0m     )\n\u001b[0;32m    195\u001b[0m     \u001b[39m# If we feed this case through h11 we'll raise an exception like:\u001b[39;00m\n\u001b[0;32m    196\u001b[0m     \u001b[39m#\u001b[39;00m\n\u001b[0;32m    197\u001b[0m     \u001b[39m#     httpcore.RemoteProtocolError: can't handle event type\u001b[39;00m\n\u001b[1;32m   (...)\u001b[0m\n\u001b[0;32m    201\u001b[0m     \u001b[39m# perspective. Instead we handle this case distinctly and treat\u001b[39;00m\n\u001b[0;32m    202\u001b[0m     \u001b[39m# it as a ConnectError.\u001b[39;00m\n\u001b[0;32m    203\u001b[0m     \u001b[39mif\u001b[39;00m data \u001b[39m==\u001b[39m \u001b[39mb\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m\"\u001b[39m \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_h11_state\u001b[39m.\u001b[39mtheir_state \u001b[39m==\u001b[39m h11\u001b[39m.\u001b[39mSEND_RESPONSE:\n",
      "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\httpcore\\backends\\sync.py:28\u001b[0m, in \u001b[0;36mSyncStream.read\u001b[1;34m(self, max_bytes, timeout)\u001b[0m\n\u001b[0;32m     26\u001b[0m \u001b[39mwith\u001b[39;00m map_exceptions(exc_map):\n\u001b[0;32m     27\u001b[0m     \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sock\u001b[39m.\u001b[39msettimeout(timeout)\n\u001b[1;32m---> 28\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sock\u001b[39m.\u001b[39;49mrecv(max_bytes)\n",
      "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\ssl.py:1263\u001b[0m, in \u001b[0;36mSSLSocket.recv\u001b[1;34m(self, buflen, flags)\u001b[0m\n\u001b[0;32m   1259\u001b[0m     \u001b[39mif\u001b[39;00m flags \u001b[39m!=\u001b[39m \u001b[39m0\u001b[39m:\n\u001b[0;32m   1260\u001b[0m         \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m   1261\u001b[0m             \u001b[39m\"\u001b[39m\u001b[39mnon-zero flags not allowed in calls to recv() on \u001b[39m\u001b[39m%s\u001b[39;00m\u001b[39m\"\u001b[39m \u001b[39m%\u001b[39m\n\u001b[0;32m   1262\u001b[0m             \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m\u001b[39m__class__\u001b[39m)\n\u001b[1;32m-> 1263\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mread(buflen)\n\u001b[0;32m   1264\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[0;32m   1265\u001b[0m     \u001b[39mreturn\u001b[39;00m \u001b[39msuper\u001b[39m()\u001b[39m.\u001b[39mrecv(buflen, flags)\n",
      "File \u001b[1;32mc:\\Program Files\\Python311\\Lib\\ssl.py:1136\u001b[0m, in \u001b[0;36mSSLSocket.read\u001b[1;34m(self, len, buffer)\u001b[0m\n\u001b[0;32m   1134\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_sslobj\u001b[39m.\u001b[39mread(\u001b[39mlen\u001b[39m, buffer)\n\u001b[0;32m   1135\u001b[0m     \u001b[39melse\u001b[39;00m:\n\u001b[1;32m-> 1136\u001b[0m         \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_sslobj\u001b[39m.\u001b[39;49mread(\u001b[39mlen\u001b[39;49m)\n\u001b[0;32m   1137\u001b[0m \u001b[39mexcept\u001b[39;00m SSLError \u001b[39mas\u001b[39;00m x:\n\u001b[0;32m   1138\u001b[0m     \u001b[39mif\u001b[39;00m x\u001b[39m.\u001b[39margs[\u001b[39m0\u001b[39m] \u001b[39m==\u001b[39m SSL_ERROR_EOF \u001b[39mand\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39msuppress_ragged_eofs:\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "apify = ApifyWrapper()\n",
    "\n",
    "loader = apify.call_actor(\n",
    "    actor_id=\"apify/website-content-crawler\",\n",
    "    run_input={\"startUrls\": [{\"url\": \"https://python.langchain.com/en/latest/\"}]},\n",
    "    dataset_mapping_function=lambda item: Document(\n",
    "        page_content=item[\"text\"] or \"\", metadata={\"source\": item[\"url\"]}\n",
    "    ),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = VectorstoreIndexCreator().from_loaders([loader])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"What is LangChain?\"\n",
    "result = index.query_with_sources(query)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}