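# Google Custom Search scraper: queries the Custom Search JSON API
# (Vietnamese locale), then downloads each result page in parallel and
# extracts its readable paragraph text.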
from googleapiclient.discovery import build
import os
import re
import sys

import numpy
import requests
import wrapt_timeout_decorator
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread-based Pool, suited to I/O-bound fetches
# One or more API keys, whitespace-separated in the env var, created at:
# https://console.cloud.google.com/apis/credentials/key
api_key = os.environ['gg_api_key'].split()
# Search engine ID from:
# https://programmablesearchengine.google.com/controlpanel/overview
Custom_Search_Engine_ID = os.environ['Custom_Search_Engine_ID']

@wrapt_timeout_decorator.timeout(3)
def ggsearch(service, query, num_pages):
    """Run one Custom Search query; return its result items, or [] on any failure."""
    try:
        res = service.cse().list(q=query, cx=Custom_Search_Engine_ID, num=num_pages,  # start=i*num_pages
                                 gl='vn', googlehost='vn', hl='vi').execute()
        return res['items']
    except Exception:
        # Covers API errors as well as responses with no 'items' key.
        return []

@wrapt_timeout_decorator.timeout(5)
def getContent(url):
    """Fetch a page and return its visible text as blank-line-separated paragraphs."""
    paragraphs = ''  # returned as-is if the fetch or parse fails
    try:
        html = requests.get(url, timeout=5)
        tree = BeautifulSoup(html.text, 'html.parser')
        # Drop elements that never render as visible text.
        for invisible_elem in tree.find_all(['script', 'style']):
            invisible_elem.extract()
        # Collect <p> text first, then remove those tags so the same text is
        # not extracted a second time from the rest of the page.
        paragraphs = [p.get_text() for p in tree.find_all('p')]
        for para in tree.find_all('p'):
            para.extract()
        # Unwrap inline tags so their text merges into the surrounding block.
        for href in tree.find_all(['a', 'strong']):
            href.unwrap()
        # Re-parse so the unwrapped text nodes merge with their neighbours.
        tree = BeautifulSoup(str(tree.html), 'html.parser')
        text = tree.get_text(separator='\n\n')
        text = re.sub(r'\n +\n', '\n\n', text)
        paragraphs += text.split('\n\n')
        # Normalise whitespace and keep only paragraphs of more than ten words.
        paragraphs = [' '.join(p.split()).strip() for p in paragraphs]
        paragraphs = [p for p in paragraphs if p != '']
        paragraphs = [p for p in paragraphs if len(p.split()) > 10]
        paragraphs = '\n\n'.join(paragraphs)
    except Exception:
        print('Cannot read ' + url, str(sys.exc_info()[0]))
    return paragraphs

class GoogleSearch:
    def __init__(self, num_pages=7):
        self.num_pages = num_pages
        self.pool = Pool(4)  # four worker threads for parallel page downloads

    def search(self, question):
        # https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py
        # Pick a key at random when several are configured, to spread quota use.
        service = build("customsearch", "v1",
                        developerKey=api_key[0 if len(api_key) == 1 else numpy.random.randint(0, len(api_key))])
        pages_content = ggsearch(service, question, self.num_pages)
        document_urls = set()
        for page in pages_content:
            # Results carrying a fileFormat field are non-HTML (PDF, DOC, ...); skip them.
            if 'fileFormat' in page:
                continue
            document_urls.add(page['link'])
        document_urls = list(document_urls)
        # Fetch every page in parallel; drop near-empty pages and bot-challenge pages.
        gg_documents = [d for d in self.pool.map(getContent, document_urls)
                        if len(d) > 20
                        and 'The security system for this website has been triggered. Completing the challenge below verifies you are a human and gives you access.' not in d]
        return document_urls, gg_documents
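
# A minimal usage sketch, assuming the gg_api_key and Custom_Search_Engine_ID
# environment variables are set; the query string is only an example.
if __name__ == '__main__':
    searcher = GoogleSearch(num_pages=7)
    urls, documents = searcher.search('giá xăng hôm nay')
    print(len(urls), 'result urls,', len(documents), 'documents kept')
    for doc in documents:
        print(doc[:200])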