# Utility: search-engine client for web retrieval.
# Sends a query to Google Custom Search or Bing Web Search, downloads the
# ranked result pages and extracts their text content.
# Useful for connecting an LLM such as Mixtral to the web.
# Use a reranker on the returned texts if required.
import logging
import re

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# Upper bound (seconds) for every HTTP request made by this module; without
# it a single unresponsive server would block the caller indefinitely.
_REQUEST_TIMEOUT = 10


class SearchClient:
    """Query a web search API and return the text of the pages it ranks.

    Supported vendors are ``'google'`` (Custom Search JSON API, needs both
    ``engine_id`` and ``api_key``) and ``'bing'`` (Web Search v7, needs
    ``api_key``).
    """

    def __init__(self, vendor, engine_id=None, api_key=None):
        """Store the vendor and prepare its endpoint and request headers.

        Args:
            vendor: ``'google'`` or ``'bing'``. Any other value is accepted
                here; ``search`` then returns ``"Invalid vendor"``.
            engine_id: Search engine id, interpolated as ``cx`` into the
                Google endpoint. Not used for Bing.
            api_key: API key; placed in the Google endpoint URL or in the
                Bing subscription-key header.
        """
        self.vendor = vendor
        # Defined up front so both attributes exist for every vendor.
        self.endpoint = None
        self.headers = {}
        if vendor == 'google':
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == 'bing':
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                'Ocp-Apim-Subscription-Key': api_key,
            }

    def search(self, query, n_crawl):
        """Run ``query`` with the configured vendor and crawl the results.

        Args:
            query: Search query string.
            n_crawl: For Google, the maximum number of pages whose text is
                returned; for Bing, the ``count`` sent to the API.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, or the string
            ``"Invalid vendor"`` when the vendor is not supported.
        """
        if self.vendor == 'google':
            return self._google_search(query, n_crawl)
        if self.vendor == 'bing':
            return self._bing_search(query, n_crawl)
        return "Invalid vendor"

    @staticmethod
    def _extract_text_from_link(link):
        """Download ``link`` and return its text with whitespace collapsed.

        Returns:
            The page text with every whitespace run replaced by one space,
            or ``None`` when the request fails or the status is not 200.
        """
        try:
            page = requests.get(link, timeout=_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Best effort: one unreachable page must not abort the search.
            logger.warning("Could not fetch %s: %s", link, exc)
            return None
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        return re.sub(r'\s+', ' ', soup.get_text())

    def _collect_pages(self, links, limit=None):
        """Crawl ``links`` in order and keep the pages that yield text.

        Args:
            links: Iterable of result URLs; falsy entries are skipped.
            limit: Maximum number of pages to collect, or ``None`` for no
                cap.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts.
        """
        results = []
        for link in links:
            if limit is not None and len(results) >= limit:
                break
            if not link:
                continue
            cleaned_text = self._extract_text_from_link(link)
            if cleaned_text:
                results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Query the Google endpoint and crawl up to ``n_crawl`` pages."""
        response = requests.get(
            self.endpoint,
            params={'q': query},
            timeout=_REQUEST_TIMEOUT,
        )
        if not response.ok:
            logger.warning("Google search returned HTTP %s", response.status_code)
        search_results = response.json()
        links = (item.get('link') for item in search_results.get('items', []))
        return self._collect_pages(links, n_crawl)

    def _bing_search(self, query, n_crawl):
        """Query the Bing endpoint and crawl every result it returns."""
        params = {
            'q': query,
            'count': n_crawl,  # You might need to adjust this based on Bing API requirements
            'mkt': 'en-US'
        }
        response = requests.get(
            self.endpoint,
            headers=self.headers,
            params=params,
            timeout=_REQUEST_TIMEOUT,
        )
        if not response.ok:
            logger.warning("Bing search returned HTTP %s", response.status_code)
        search_results = response.json()
        # Raw payload goes to the debug log rather than being printed to stdout.
        logger.debug("Bing response: %s", search_results)
        pages = search_results.get('webPages', {}).get('value', [])
        # The result count is already bounded by the 'count' request
        # parameter, so no extra cap is applied here.
        return self._collect_pages(item.get('url') for item in pages)