File size: 2,695 Bytes
0fc8f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import requests
import undetected_chromedriver as uc

from langchain.tools import tool
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS


class WebScapeAdv_UC:
    """LangChain tools for scraping web pages with an undetectable Chrome
    driver, used when plain HTTP requests are blocked by bot detection."""

    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            if the fetch failed.
        """
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')

            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Quit even when driver.get()/page_source raises, so a
                # failed fetch does not leak a headless Chrome process.
                driver.quit()
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"

    @tool("process search results with fallback", return_direct=False)
    def scrape_with_fallback(url: str) -> str:
        """
        Attempts to scrape webpage content using BeautifulSoup first, then falls back to Selenium with undetectable Chrome driver if needed.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            if both strategies failed.
        """
        # Cheap path: plain HTTP request parsed with BeautifulSoup.
        # A network error here used to propagate out of the method and
        # skip the fallback entirely; it is now caught so the Selenium
        # fallback still runs. The timeout keeps a dead host from
        # hanging the cheap path forever.
        try:
            response = requests.get(url, timeout=15)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                text = soup.get_text()
                # Arbitrary 100-char threshold: pages rendered only by
                # JavaScript typically yield almost no static text.
                if len(text.strip()) > 100:
                    return text
        except requests.RequestException:
            pass  # fall through to the Selenium-based scrape below

        # Fallback: Selenium with undetectable Chrome driver.
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')

            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Ensure the Chrome process is freed even on error.
                driver.quit()
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"