File size: 2,695 Bytes
0fc8f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import requests
import undetected_chromedriver as uc

from langchain.tools import tool
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS


class WebScapeAdv_UC:
    """LangChain tools for scraping web pages with an undetectable Chrome
    driver, used when plain HTTP requests are blocked by bot detection."""

    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            if the fetch failed.
        """
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')

            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Quit even when driver.get()/page_source raises, so a
                # failed fetch does not leak a headless Chrome process.
                driver.quit()
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"

    @tool("process search results with fallback", return_direct=False)
    def scrape_with_fallback(url: str) -> str:
        """
        Attempts to scrape webpage content using BeautifulSoup first, then falls back to Selenium with undetectable Chrome driver if needed.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            if both strategies failed.
        """
        # Cheap path: plain HTTP request parsed with BeautifulSoup.
        # A network error here used to propagate out of the method and
        # skip the fallback entirely; it is now caught so the Selenium
        # fallback still runs. The timeout keeps a dead host from
        # hanging the cheap path forever.
        try:
            response = requests.get(url, timeout=15)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')
                text = soup.get_text()
                # Arbitrary 100-char threshold: pages rendered only by
                # JavaScript typically yield almost no static text.
                if len(text.strip()) > 100:
                    return text
        except requests.RequestException:
            pass  # fall through to the Selenium-based scrape below

        # Fallback: Selenium with undetectable Chrome driver.
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')

            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            try:
                driver.get(url)
                html = driver.page_source
            finally:
                # Ensure the Chrome process is freed even on error.
                driver.quit()
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"