# CryptoScout_TradeAdvisor / WebScape_ADV.py
# Uploaded by CryptoScoutv1 — "Create WebScape_ADV.py", commit 0fc8f83 (verified), 2.7 kB
import requests
import undetected_chromedriver as uc
from langchain.tools import tool
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS
class WebScapeAdv_UC:
    @tool("process search results with undetectable chrome", return_direct=False)
    def scrape_with_undetectable_chrome(url: str) -> str:
        """
        Scrape webpage content using Selenium with undetectable Chrome driver.

        :param url: The URL of the webpage to scrape.
        :return: The text content of the webpage, or an error message string
            of the form ``"Failed to fetch content with error: ..."`` on failure.
        """
        driver = None
        try:
            options = uc.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--no-sandbox')
            options.add_argument('--disable-dev-shm-usage')
            # Initialize undetectable Chrome driver
            driver = uc.Chrome(options=options)
            driver.get(url)
            html = driver.page_source
            soup = BeautifulSoup(html, 'html.parser')
            return soup.get_text()
        except Exception as e:
            return f"Failed to fetch content with error: {e}"
        finally:
            # Always release the browser process, even when driver.get()
            # or page_source raises -- otherwise Chrome instances leak.
            if driver is not None:
                driver.quit()
from bs4 import BeautifulSoup
import requests
import undetected_chromedriver as uc
@tool("process search results with fallback", return_direct=False)
def scrape_with_fallback(url: str) -> str:
    """
    Attempts to scrape webpage content using BeautifulSoup first, then falls back
    to Selenium with undetectable Chrome driver if needed.

    :param url: The URL of the webpage to scrape.
    :return: The text content of the webpage, or an error message string
        of the form ``"Failed to fetch content with error: ..."`` on failure.
    """
    # Fast path: plain HTTP GET + BeautifulSoup. A network error here must
    # not abort the function -- the browser fallback below should still run.
    try:
        response = requests.get(url, timeout=15)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            text = soup.get_text()
            # Arbitrary threshold of 100 characters: shorter output suggests
            # a mostly-empty / JS-rendered page that needs a real browser.
            if len(text.strip()) > 100:
                return text
    except requests.RequestException:
        pass  # fall through to the Selenium fallback

    # Fallback: Selenium with undetectable Chrome driver.
    driver = None
    try:
        options = uc.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        # Initialize undetectable Chrome driver
        driver = uc.Chrome(options=options)
        driver.get(url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        return soup.get_text()
    except Exception as e:
        return f"Failed to fetch content with error: {e}"
    finally:
        # Always release the browser process, even when driver.get()
        # or page_source raises -- otherwise Chrome instances leak.
        if driver is not None:
            driver.quit()