{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Crawl a website with Apify and query it with LangChain\n",
    "\n",
    "This notebook runs the `apify/website-content-crawler` Actor on the LangChain documentation site, loads the crawled pages into a vector index, and asks a question against that index.\n",
    "\n",
    "**Prerequisites** (per the LangChain Apify integration docs; confirm for your installed versions):\n",
    "\n",
    "- `APIFY_API_TOKEN` set in the environment for `ApifyWrapper`.\n",
    "- `OPENAI_API_KEY` set in the environment, since `VectorstoreIndexCreator` is used with its default settings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders.base import Document\n",
    "from langchain.indexes import VectorstoreIndexCreator\n",
    "from langchain.utilities import ApifyWrapper"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Everything a reader might want to tune lives in the next cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ACTOR_ID = \"apify/website-content-crawler\"\n",
    "START_URL = \"https://python.langchain.com/en/latest/\"\n",
    "\n",
    "# Cap on the number of pages the crawler visits, so the run stays short.\n",
    "MAX_CRAWL_PAGES = 50\n",
    "\n",
    "# Timeout (seconds) passed to the Actor run. call_actor() waits for the run\n",
    "# to finish, so an unbounded crawl would block this notebook indefinitely.\n",
    "ACTOR_TIMEOUT_SECS = 600"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Crawl the site\n",
    "\n",
    "`call_actor` blocks until the Actor run finishes, so the crawl is bounded by `MAX_CRAWL_PAGES` and `ACTOR_TIMEOUT_SECS` to keep a full Restart & Run All feasible."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset_item_to_document(item: dict) -> Document:\n",
    "    \"\"\"Map one crawler dataset item to a LangChain Document.\n",
    "\n",
    "    The item's \"text\" field becomes the page content (an empty string when\n",
    "    it is falsy) and the item's \"url\" field is recorded as the source.\n",
    "    \"\"\"\n",
    "    return Document(\n",
    "        page_content=item[\"text\"] or \"\",\n",
    "        metadata={\"source\": item[\"url\"]},\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "apify = ApifyWrapper()\n",
    "\n",
    "loader = apify.call_actor(\n",
    "    actor_id=ACTOR_ID,\n",
    "    run_input={\n",
    "        \"startUrls\": [{\"url\": START_URL}],\n",
    "        \"maxCrawlPages\": MAX_CRAWL_PAGES,\n",
    "    },\n",
    "    dataset_mapping_function=dataset_item_to_document,\n",
    "    timeout_secs=ACTOR_TIMEOUT_SECS,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the vector index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "index = VectorstoreIndexCreator().from_loaders([loader])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Query the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query = \"What is LangChain?\"\n",
    "result = index.query_with_sources(query)\n",
    "result"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}