Spaces:

namnh113
/

Question_Answering

Sleeping

App Files Files Community

Question_Answering / gg_search.py

namnh113

Update gg_search.py

dd8a2b3 about 1 year ago

raw

history blame contribute delete

2.76 kB

	from googleapiclient.discovery import build
	import requests, re, sys, numpy, os
	from bs4 import BeautifulSoup
	import wrapt_timeout_decorator
	from multiprocessing.dummy import Pool


	# Google API key(s), read from the `gg_api_key` environment variable at import
	# time (a missing variable raises KeyError). The value is split on whitespace,
	# so `api_key` is a list holding one or more keys.
	# Keys are managed at https://console.cloud.google.com/apis/credentials/key
	api_key = os.environ['gg_api_key'].split()

	# Identifier of the Programmable Search Engine to query, read from the
	# `Custom_Search_Engine_ID` environment variable at import time; ggsearch()
	# sends it to the API as the `cx` argument.
	# Engines are managed at https://programmablesearchengine.google.com/controlpanel/overview
	Custom_Search_Engine_ID = os.environ['Custom_Search_Engine_ID']


	@wrapt_timeout_decorator.timeout(3)
	def ggsearch(service, query, num_pages):
	    """Run a single Custom Search query and return its result items.

	    The request is localised for Vietnam (``gl='vn'``, ``googlehost='vn'``,
	    ``hl='vi'``). No ``start`` offset is sent, so only the first batch of
	    results is requested.

	    Args:
	        service: Custom Search client exposing ``cse().list(...).execute()``,
	            e.g. the object returned by ``build("customsearch", "v1", ...)``.
	        query: Search string.
	        num_pages: Forwarded to the API as ``num``, i.e. how many results to
	            request in this one call.

	    Returns:
	        The ``items`` list from the API response, or ``[]`` when the response
	        has no ``items`` key or the request fails for any reason.
	    """
	    try:
	        res = service.cse().list(
	            q=query,
	            cx=Custom_Search_Engine_ID,
	            num=num_pages,
	            gl='vn',
	            googlehost='vn',
	            hl='vi',
	        ).execute()
	        # A response without 'items' means "no results", not an error.
	        return res.get('items', [])
	    except Exception:
	        # Best-effort lookup: any API/network failure degrades to "no results".
	        # Exception (not a bare except) so Ctrl-C and SystemExit still propagate.
	        return []

	@wrapt_timeout_decorator.timeout(5)
	def getContent(url):
	    """Download a web page and return its readable text as a single string.

	    Text is gathered in two passes: first the content of every ``<p>`` element,
	    then whatever text remains in the page once those ``<p>`` elements have
	    been removed. Chunks are whitespace-normalised and only those longer than
	    10 words are kept.

	    Args:
	        url: Address of the page to fetch.

	    Returns:
	        The kept chunks joined by a blank line, or ``''`` when the page cannot
	        be fetched or parsed (the failure is printed, not raised). The result
	        is always a ``str``.
	    """
	    try:
	        html = requests.get(url, timeout=5)
	        tree = BeautifulSoup(html.text, 'html.parser')

	        # Script and style bodies are never visible text.
	        for invisible_elem in tree.find_all(['script', 'style']):
	            invisible_elem.extract()

	        # Pass 1: paragraph text.
	        chunks = [p.get_text() for p in tree.find_all('p')]

	        # Drop the paragraphs so pass 2 does not collect the same text again.
	        for para in tree.find_all('p'):
	            para.extract()
	        # Replace inline <a>/<strong> wrappers with their contents.
	        for inline in tree.find_all(['a', 'strong']):
	            inline.unwrap()

	        # Pass 2: remaining text. The tree is re-parsed from its serialised
	        # form first -- presumably so text fragments left side by side by
	        # unwrap() are merged before get_text() inserts separators between
	        # text nodes.
	        tree = BeautifulSoup(str(tree.html), 'html.parser')
	        text = tree.get_text(separator='\n\n')
	        text = re.sub('\n +\n', '\n\n', text)
	        chunks += text.split('\n\n')

	        # Collapse whitespace runs, then keep chunks with more than 10 words
	        # (this also discards empty chunks).
	        chunks = [' '.join(chunk.split()) for chunk in chunks]
	        return '\n\n'.join(chunk for chunk in chunks if len(chunk.split()) > 10)
	    except Exception as exc:
	        # Best-effort scrape: report the failure and return an empty document.
	        # Returning '' here (rather than a partially built list) keeps the
	        # return type consistent for callers.
	        print('Cannot read ' + url, str(type(exc)))
	        return ''


	class GoogleSearch():
	    """Search Google through the Custom Search API and scrape the result pages.

	    Attributes are set in ``__init__``:
	        num_pages: value forwarded to ``ggsearch`` as the number of results.
	        pool: thread pool (4 workers) used to download result pages in parallel.
	    """

	    # Marker text of an anti-bot challenge page; documents containing it are
	    # dropped because they carry no real content.
	    _CHALLENGE_TEXT = 'The security system for this website has been triggered. Completing the challenge below verifies you are a human and gives you access.'

	    def __init__(self, num_pages=7):
	        """Store the result count and create the download thread pool.

	        Args:
	            num_pages: Number of search results to request per query.
	        """
	        self.num_pages = num_pages
	        self.pool = Pool(4)

	    def search(self, question):
	        """Search for *question* and fetch the text of each result page.

	        Args:
	            question: Query string sent to the search API.

	        Returns:
	            A tuple ``(document_urls, gg_documents)``:
	            ``document_urls`` is the list of unique result links in search
	            ranking order (results carrying a ``fileFormat`` key are skipped);
	            ``gg_documents`` is the list of scraped page texts longer than 20
	            characters that are not anti-bot challenge pages.
	        """
	        # https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py
	        # With several configured keys, pick one at random for this request.
	        key_index = 0 if len(api_key) == 1 else numpy.random.randint(0, len(api_key))
	        service = build("customsearch", "v1", developerKey=api_key[key_index])
	        pages_content = ggsearch(service, question, self.num_pages)

	        # Skip results flagged with 'fileFormat' (non-HTML files). dict.fromkeys
	        # removes duplicate links while keeping first-seen (ranking) order,
	        # which a set would discard.
	        links = [page['link'] for page in pages_content if 'fileFormat' not in page]
	        document_urls = list(dict.fromkeys(links))

	        # NOTE(review): gg_documents is filtered but document_urls is not, so
	        # the two lists are not index-aligned -- confirm callers do not zip them.
	        gg_documents = [
	            doc
	            for doc in self.pool.map(getContent, document_urls)
	            if len(doc) > 20 and self._CHALLENGE_TEXT not in doc
	        ]
	        return document_urls, gg_documents