"""Web-scraping LangChain tools.

Provides two `@tool`-decorated scrapers: one that always drives a headless
undetected Chrome browser, and one that tries a plain HTTP GET first and
falls back to the browser only when the cheap path yields too little text.
"""

import requests
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS  # noqa: F401 -- kept: may be used elsewhere in the project
from langchain.tools import tool


def _scrape_with_uc(url: str) -> str:
    """Fetch *url* with a headless undetected Chrome driver and return its text.

    The driver is quit in a ``finally`` block so the browser process is
    released even when navigation raises.

    :param url: The URL of the webpage to scrape.
    :return: The visible text content of the rendered page.
    :raises Exception: propagates any driver/navigation failure to the caller.
    """
    options = uc.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = uc.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        driver.quit()  # always free the browser process, even on error

    return BeautifulSoup(html, 'html.parser').get_text()


class WebScapeAdv_UC:
    """Container for the undetectable-Chrome scraping tool."""

    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
                 on failure (this tool never raises).
        """
        try:
            return _scrape_with_uc(url)
        except Exception as e:
            return f"Failed to fetch content with error: {e}"


@tool("process search results with fallback", return_direct=False)
def scrape_with_fallback(url: str) -> str:
    """
    Attempts to scrape webpage content using BeautifulSoup first, then falls
    back to Selenium with undetectable Chrome driver if needed.

    :param url: The URL of the webpage to scrape.
    :return: The text content of the webpage, or an error message string
             on failure (this tool never raises).
    """
    # Fast path: plain HTTP GET. The timeout prevents the tool from hanging
    # forever, and network errors fall through to the browser-based fallback
    # instead of raising out of the tool.
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            text = BeautifulSoup(response.content, 'html.parser').get_text()
            # Arbitrary threshold of 100 characters: shorter pages are likely
            # bot-walls or JS-rendered shells, so use a real browser instead.
            if len(text.strip()) > 100:
                return text
    except requests.RequestException:
        pass  # best-effort fast path; fall back to the Chrome driver below

    # Fallback: render the page in headless undetected Chrome.
    try:
        return _scrape_with_uc(url)
    except Exception as e:
        return f"Failed to fetch content with error: {e}"