# import logging | |
# from typing import List, Dict | |
# import requests | |
# from bs4 import BeautifulSoup | |
# from urllib3.exceptions import InsecureRequestWarning | |
# # Disable SSL warnings for requests | |
# requests.packages.urllib3.disable_warnings(InsecureRequestWarning) | |
# logger = logging.getLogger(__name__) | |
# class WebSearcher: | |
# def __init__(self): | |
# self.headers = { | |
# "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0" | |
# } | |
# def extract_text(self, html_content: str) -> str: | |
# soup = BeautifulSoup(html_content, 'html.parser') | |
# # Remove unwanted elements | |
# for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']): | |
# element.decompose() | |
# text = ' '.join(soup.stripped_strings) | |
# return text[:8000] # Limit text length | |
# def search(self, query: str, max_results: int = 3) -> List[Dict]: | |
# results = [] | |
# try: | |
# with requests.Session() as session: | |
# # Google search parameters | |
# search_url = "https://www.google.com/search" | |
# params = { | |
# "q": query, | |
# "num": max_results, | |
# "hl": "en" | |
# } | |
# response = session.get( | |
# search_url, | |
# headers=self.headers, | |
# params=params, | |
# timeout=10, | |
# verify=False | |
# ) | |
# response.raise_for_status() | |
# # Parse search results | |
# soup = BeautifulSoup(response.text, 'html.parser') | |
# search_results = soup.select('div.g') | |
# for result in search_results[:max_results]: | |
# link = result.find('a') | |
# if not link: | |
# continue | |
# url = link.get('href', '') | |
# if not url.startswith('http'): | |
# continue | |
# try: | |
# # Fetch webpage content | |
# page_response = session.get( | |
# url, | |
# headers=self.headers, | |
# timeout=5, | |
# verify=False | |
# ) | |
# page_response.raise_for_status() | |
# content = self.extract_text(page_response.text) | |
# results.append({ | |
# "url": url, | |
# "content": content | |
# }) | |
# logger.info(f"Successfully fetched content from {url}") | |
# except Exception as e: | |
# logger.warning(f"Failed to fetch {url}: {str(e)}") | |
# continue | |
# except Exception as e: | |
# logger.error(f"Search failed: {str(e)}") | |
# return results[:max_results] | |
import logging
from typing import List, Dict

from transformers.agents import DuckDuckGoSearchTool

logger = logging.getLogger(__name__)
class WebSearcher:
    """Thin wrapper around ``DuckDuckGoSearchTool`` that normalizes search
    output into a list of ``{"url", "content"}`` dicts.

    The backend returns an opaque payload (string or list); this class
    flattens it to a single bounded string so downstream consumers get a
    predictable shape.
    """

    # Maximum characters kept from the raw search output. Matches the 8000-char
    # cap used by the previous requests/BeautifulSoup implementation; the old
    # code's comment claimed a limit but never enforced one — this does.
    MAX_CONTENT_LENGTH = 8000

    def __init__(self):
        # Third-party tool that performs the actual network search.
        self.search_tool = DuckDuckGoSearchTool()

    def search(self, query: str) -> List[Dict]:
        """Run a web search for *query*.

        Returns a single-element list ``[{"url": "duckduckgo_search",
        "content": <text>}]`` where ``content`` is the stringified,
        length-capped search output, or an empty list if the search fails
        for any reason (errors are logged, never raised).
        """
        try:
            raw = self.search_tool(query)
            # The tool may return a list of result objects; flatten to one string.
            if isinstance(raw, list):
                raw = ' '.join(str(item) for item in raw)
            # Enforce the length cap so callers never receive unbounded text.
            content = str(raw)[:self.MAX_CONTENT_LENGTH]
            return [{
                "url": "duckduckgo_search",
                "content": content
            }]
        except Exception as e:
            # Best-effort contract: swallow and log, return empty results.
            logger.error("Search error: %s", e)
            return []
# Module-level singleton searcher shared by importers of this module.
# NOTE(review): constructing WebSearcher instantiates the third-party
# DuckDuckGoSearchTool here — an import-time side effect; confirm callers
# expect eager initialization rather than lazy construction.
searcher = WebSearcher()