from googleapiclient.discovery import build
import requests, re, sys, numpy, os
from bs4 import BeautifulSoup
import wrapt_timeout_decorator
from multiprocessing.dummy import Pool

# One or more Custom Search API keys, whitespace-separated in the environment
# variable so requests can be spread across them (see GoogleSearch.search below).
# https://console.cloud.google.com/apis/credentials/key
api_key = os.environ['gg_api_key'].split()
# https://programmablesearchengine.google.com/controlpanel/overview
Custom_Search_Engine_ID = os.environ['Custom_Search_Engine_ID']
def ggsearch(service, query, num_pages):
    """Query the Custom Search API and return the result items; [] on any failure."""
    try:
        res = service.cse().list(q=query, cx=Custom_Search_Engine_ID, num=num_pages,  # start=i*num_pages
                                 gl='vn', googlehost='vn', hl='vi').execute()  # pinned to the Vietnamese locale
        return res.get('items', [])
    except Exception:
        return []
def getContent(url):
    """Fetch a page and return its visible text as blank-line-separated paragraphs."""
    paragraphs = ''
    try:
        html = requests.get(url, timeout=5)
        tree = BeautifulSoup(html.text, 'html.parser')
        # Drop elements that never render as visible text.
        for invisible_elem in tree.find_all(['script', 'style']):
            invisible_elem.extract()
        # Collect <p> text first, then remove the <p> tags from the tree
        # so their text is not counted twice below.
        paragraphs = [p.get_text() for p in tree.find_all('p')]
        for para in tree.find_all('p'):
            para.extract()
        # Unwrap inline tags so their text stays attached to the surrounding block.
        for href in tree.find_all(['a', 'strong']):
            href.unwrap()
        # Re-parse and pull the remaining visible text, one block per paragraph.
        tree = BeautifulSoup(str(tree.html), 'html.parser')
        text = tree.get_text(separator='\n\n')
        text = re.sub(r'\n +\n', '\n\n', text)
        paragraphs += text.split('\n\n')
        # Normalise whitespace and keep only paragraphs with more than 10 words.
        paragraphs = [' '.join(p.split()).strip() for p in paragraphs]
        paragraphs = [p for p in paragraphs if len(p.split()) > 10]
        paragraphs = '\n\n'.join(paragraphs)
    except Exception:
        print('Cannot read ' + url, str(sys.exc_info()[0]))
        paragraphs = ''  # a mid-function failure may leave `paragraphs` as a list; always return a string
    return paragraphs
class GoogleSearch:
    def __init__(self, num_pages=7):
        self.num_pages = num_pages
        self.pool = Pool(4)  # thread pool for fetching result pages concurrently

    def search(self, question):
        # https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py
        # Pick one of the configured API keys at random to spread quota usage.
        service = build("customsearch", "v1", developerKey=api_key[numpy.random.randint(len(api_key))])
        pages_content = ggsearch(service, question, self.num_pages)
        # Keep unique HTML result links; items carrying 'fileFormat' are non-HTML files (e.g. PDFs).
        document_urls = []
        for page in pages_content:
            if 'fileFormat' in page:
                continue
            if page['link'] not in document_urls:
                document_urls.append(page['link'])
        # Fetch all pages in parallel, dropping near-empty pages and bot-challenge
        # pages, so gg_documents may be shorter than document_urls.
        challenge = ('The security system for this website has been triggered. '
                     'Completing the challenge below verifies you are a human and gives you access.')
        gg_documents = [d for d in self.pool.map(getContent, document_urls)
                        if len(d) > 20 and challenge not in d]
        return document_urls, gg_documents
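

# A minimal usage sketch, not part of the original module: it assumes the
# `gg_api_key` and `Custom_Search_Engine_ID` environment variables hold valid
# Google credentials before the module is imported. The sample query is
# Vietnamese since the search above is pinned to gl='vn', hl='vi'.
if __name__ == '__main__':
    searcher = GoogleSearch(num_pages=7)
    urls, documents = searcher.search('trí tuệ nhân tạo là gì')  # "what is artificial intelligence"
    print(f'{len(documents)} of {len(urls)} pages yielded usable text')
    for url in urls:
        print(url)
    # Each document is one page's visible text, paragraphs separated by blank lines.
    if documents:
        print(documents[0][:500])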