|
import requests |
|
import undetected_chromedriver as uc |
|
|
|
from langchain.tools import tool |
|
from bs4 import BeautifulSoup |
|
from duckduckgo_search import DDGS |
|
|
|
|
|
class WebScapeAdv_UC:
    """Groups the web-scraping tool that drives undetected-chromedriver."""

    # NOTE: the docstring below doubles as the tool description handed to the
    # agent by the ``@tool`` decorator, so its wording is kept as-is.
    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage.
        """
        # Any failure (driver start-up, navigation, shutdown, parsing) is
        # reported back to the caller as a string rather than raised, so the
        # agent calling this tool always receives a usable observation.
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            # These two flags are commonly required when Chrome runs inside
            # containers / CI sandboxes.
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')

            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Always shut the browser down; previously a failed
                # navigation skipped quit() and leaked a Chrome process.
                driver.quit()

            return BeautifulSoup(html, 'html.parser').get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"
|
from bs4 import BeautifulSoup |
|
import requests |
|
import undetected_chromedriver as uc |
|
|
|
|
|
|
|
# NOTE: the docstring below doubles as the tool description handed to the
# agent by the ``@tool`` decorator, so its wording is kept as-is.
@tool("process search results with fallback", return_direct=False)
def scrape_with_fallback(url: str) -> str:
    """
    Attempts to scrape webpage content using BeautifulSoup first, then falls back to Selenium with undetectable Chrome driver if needed.

    :param url: The URL of the webpage to scrape.
    :return: The text content of the webpage.
    """
    # --- Fast path: plain HTTP fetch parsed with BeautifulSoup. ---
    # A timeout keeps an unresponsive host from blocking the tool forever,
    # and network errors are swallowed here on purpose: they simply mean the
    # browser-based fallback below should be tried instead.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        response = None

    if response is not None and response.status_code == 200:
        text = BeautifulSoup(response.content, 'html.parser').get_text()
        # Very short text usually means the page is rendered client-side or
        # blocked; only accept the cheap result when it has real content.
        if len(text.strip()) > 100:
            return text

    # --- Fallback: render the page in headless undetected Chrome. ---
    # Any failure here is returned as a string rather than raised, so the
    # agent calling this tool always receives a usable observation.
    try:
        options = uc.ChromeOptions()
        options.add_argument('--headless')
        # These two flags are commonly required when Chrome runs inside
        # containers / CI sandboxes.
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')

        driver = uc.Chrome(options=options)
        try:
            driver.get(url)
            html = driver.page_source
        finally:
            # Always shut the browser down; previously a failed navigation
            # skipped quit() and leaked a Chrome process.
            driver.quit()

        return BeautifulSoup(html, 'html.parser').get_text()
    except Exception as e:
        return f"Failed to fetch content with error: {e}"
|
|