UPDATE: web crawler
- app.py +4 -8
- functions.py +34 -1
- requirements.txt +3 -1
app.py CHANGED
@@ -1,11 +1,9 @@
 import io
-import re
 from functions import *
 from PyPDF2 import PdfReader
-from bs4 import BeautifulSoup
 from fastapi import FastAPI, File, UploadFile
 from fastapi.middleware.cors import CORSMiddleware
-from langchain_community.document_loaders import RecursiveUrlLoader
+from langchain_community.document_loaders import UnstructuredURLLoader


 app = FastAPI(title = "ConversAI", root_path = "/api/v1")
@@ -52,12 +50,10 @@ async def addText(vectorstore: str, text: str):

 @app.post("/addWebsite")
 async def addWebsite(vectorstore: str, websiteUrl: str):
-    def bs4_extractor(html: str) -> str:
-        soup = BeautifulSoup(html, "lxml")
-        return re.sub(r"\n\n+", "\n\n", soup.text).strip()
-    loader = RecursiveUrlLoader(websiteUrl, max_depth=2, timeout = 60, extractor=bs4_extractor)
+    urls = getLinks(websiteUrl)
+    loader = UnstructuredURLLoader(urls=urls)
     docs = loader.load()
-    text = "\n\n".join([docs[doc].page_content for doc in range(len(docs))])
+    text = "\n\n\n\n".join([f"Metadata:\n{docs[doc].metadata} \nPage Content:\n {docs[doc].page_content}" for doc in range(len(docs))])
     return addDocuments(text = text, vectorstore = vectorstore)
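For context, the rewritten /addWebsite endpoint now gathers same-domain URLs via getLinks (added in functions.py below) and feeds them to UnstructuredURLLoader instead of crawling with RecursiveUrlLoader. A minimal client-side sketch of calling the endpoint, assuming a hypothetical Space host and that both arguments are sent as query parameters (the default for the FastAPI signature above):

import requests

# Hypothetical host; root_path is /api/v1 as configured above.
BASE_URL = "https://your-space.hf.space/api/v1"

resp = requests.post(
    f"{BASE_URL}/addWebsite",
    params={"vectorstore": "myStore", "websiteUrl": "https://example.com"},
)
print(resp.json())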
functions.py CHANGED
@@ -18,9 +18,14 @@ from langchain.retrievers.document_compressors import FlashrankRerank
 from supabase.client import create_client
 from qdrant_client import QdrantClient
 from langchain_groq import ChatGroq
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 from supabase import create_client
 from dotenv import load_dotenv
 import os
+import time
+import requests
+

 load_dotenv("secrets.env")
 client = create_client(os.environ["SUPABASE_URL"], os.environ["SUPABASE_KEY"])
@@ -243,4 +248,32 @@ def listTables(username: str):
     except Exception as e:
         return {
             "error": e
-        }
+        }
+
+
+def getLinks(url: str, timeout = 30):
+    start = time.time()
+    def getLinksFromPage(url: str):
+        response = requests.get(url)
+        htmlContent = response.content
+        soup = BeautifulSoup(htmlContent, "lxml")
+        anchorTags = soup.find_all("a")
+        allLinks = []
+        for tag in anchorTags:
+            if "href" in tag.attrs:
+                if urlparse(tag.attrs["href"]).netloc == urlparse(url).netloc:
+                    allLinks.append(tag.attrs["href"])
+                else:
+                    continue
+            else:
+                continue
+        return allLinks
+    links = getLinksFromPage(url)
+    uniqueLinks = set()
+    for link in links:
+        now = time.time()
+        if now - start > timeout:
+            break
+        else:
+            uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
+    return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
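The new getLinks helper does a depth-1 crawl: it collects anchor hrefs from the start page and then from each of those pages, keeping only absolute links whose netloc matches the page's own (relative hrefs have an empty netloc and are skipped), stripping trailing slashes, and de-duplicating; the timeout only caps how long the second pass runs. A quick usage sketch, assuming functions.py is importable and the (hypothetical) target site is reachable:

from functions import getLinks

# Returns de-duplicated same-domain links with trailing slashes stripped.
links = getLinks("https://example.com", timeout=30)
for link in links:
    print(link)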
requirements.txt CHANGED
@@ -14,4 +14,6 @@ lxml
 PyPDF2
 python-dotenv
 sentence-transformers
-supabase
+supabase
+unstructured
+urllib3