Spaces:
Sleeping
Sleeping
DEBUG: WEB CRAWLER
Browse files- functions.py +32 -25
functions.py
CHANGED
@@ -19,7 +19,7 @@ from supabase.client import create_client
|
|
19 |
from qdrant_client import QdrantClient
|
20 |
from langchain_groq import ChatGroq
|
21 |
from bs4 import BeautifulSoup
|
22 |
-
from urllib.parse import urlparse
|
23 |
from supabase import create_client
|
24 |
from dotenv import load_dotenv
|
25 |
import os
|
@@ -258,29 +258,36 @@ def listTables(username: str):
|
|
258 |
}
|
259 |
|
260 |
|
|
|
261 |
def getLinks(url: str, timeout = 30):
|
262 |
-
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
|
272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
273 |
else:
|
274 |
-
|
275 |
-
|
276 |
-
continue
|
277 |
-
return allLinks
|
278 |
-
links = getLinksFromPage(url)
|
279 |
-
uniqueLinks = set()
|
280 |
-
for link in links:
|
281 |
-
now = time.time()
|
282 |
-
if now - start > timeout:
|
283 |
-
break
|
284 |
-
else:
|
285 |
-
uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
|
286 |
-
return list(set([x[:len(x) - 1] if x[-1] == "/" else x for x in uniqueLinks]))
|
|
|
19 |
from qdrant_client import QdrantClient
|
20 |
from langchain_groq import ChatGroq
|
21 |
from bs4 import BeautifulSoup
|
22 |
+
from urllib.parse import urlparse, urljoin
|
23 |
from supabase import create_client
|
24 |
from dotenv import load_dotenv
|
25 |
import os
|
|
|
258 |
}
|
259 |
|
260 |
|
261 |
+
|
262 |
def getLinks(url: str, timeout = 30):
    """Crawl *url* one level deep and return de-duplicated same-domain links.

    Fetches the seed page, collects every same-domain anchor href, then
    visits each of those pages once to collect their same-domain links too,
    stopping early once *timeout* seconds have elapsed.

    Args:
        url: Seed page to crawl (absolute URL).
        timeout: Overall time budget in seconds for the crawl loop.

    Returns:
        list[str]: Unique absolute URLs on the same domain as *url*,
        with any single trailing slash stripped so "/a/" and "/a" dedupe.
    """
    start = time.time()
    # Resolve the target domain once; every candidate link is compared to it.
    baseNetloc = urlparse(url).netloc

    def getLinksFromPage(pageUrl: str):
        # Fetch one page and return its same-domain absolute links.
        # Best-effort: a dead or slow link yields [] instead of aborting
        # the whole crawl (the original had no timeout or error handling).
        try:
            response = requests.get(pageUrl, timeout=10)
        except requests.RequestException:
            return []
        soup = BeautifulSoup(response.content, "lxml")
        pageLinks = []
        for tag in soup.find_all("a"):
            href = tag.attrs.get("href")
            if not href:
                # Missing or empty href — nothing to resolve.
                continue
            # urljoin correctly resolves both relative and absolute hrefs,
            # preserving query strings and fragments. (The previous
            # os.path.join reconstruction dropped the "?"/"#" separators
            # and was OS-dependent; the boolean test also misused
            # bitwise "|" instead of "or".)
            fullUrl = urljoin(pageUrl, href)
            if urlparse(fullUrl).netloc == baseNetloc:
                pageLinks.append(fullUrl)
        return pageLinks

    # Seed the result with the first page's links so they are returned even
    # if the time budget expires before any sub-page is visited. (Previously
    # only links discovered on sub-pages were collected.)
    links = getLinksFromPage(url)
    uniqueLinks = set(links)
    for link in links:
        if time.time() - start > timeout:
            # Time budget exhausted — return what we have so far.
            break
        uniqueLinks = uniqueLinks.union(set(getLinksFromPage(link)))
    # Normalize trailing slashes; endswith() is safe on empty strings,
    # unlike indexing x[-1].
    return list({x[:-1] if x.endswith("/") else x for x in uniqueLinks})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|