# CryptoScout_TradeAdvisor / WebScape_ADV.py
# Uploaded by CryptoScoutv1 — "Create WebScape_ADV.py", commit 0fc8f83 (verified), 2.7 kB
import requests
import undetected_chromedriver as uc
from langchain.tools import tool
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
class WebScapeAdv_UC:
    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            of the form ``"Failed to fetch content with error: ..."`` on failure.
        """
        driver = None
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"
        finally:
            # Always release the browser process, even when driver.get()
            # or page_source raises -- otherwise Chrome instances leak.
            if driver is not None:
                driver.quit()
from bs4 import BeautifulSoup
import requests
import undetected_chromedriver as uc
@tool("process search results with fallback", return_direct=False)
def scrape_with_fallback(url: str) -> str:
    """
    Attempts to scrape webpage content using BeautifulSoup first, then falls back
    to Selenium with undetectable Chrome driver if needed.

    :param url: The URL of the webpage to scrape.
    :return: The text content of the webpage, or an error message string
        of the form ``"Failed to fetch content with error: ..."`` on failure.
    """
    # Fast path: plain HTTP GET + BeautifulSoup. A network error here must
    # not abort the function -- the browser fallback below should still run.
    try:
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            # Arbitrary threshold of 100 characters: shorter output suggests
            # a mostly-empty / JS-rendered page that needs a real browser.
            if len(text.strip()) > 100:
                return text
    except requests.RequestException:
        pass  # fall through to the Selenium fallback

    # Fallback: Selenium with undetectable Chrome driver.
    driver = None
    try:
        options = uc.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        # Initialize undetectable Chrome driver
        driver = uc.Chrome(options=options)
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return f"Failed to fetch content with error: {e}"
    finally:
        # Always release the browser process, even when driver.get()
        # or page_source raises -- otherwise Chrome instances leak.
        if driver is not None:
            driver.quit()