from multiprocessing.dummy import Pool
import os
import re
import sys

import numpy
import requests
import wrapt_timeout_decorator
from bs4 import BeautifulSoup
from googleapiclient.discovery import build


# One or more API keys, space-separated, created at
# https://console.cloud.google.com/apis/credentials/key
api_keys = os.environ['gg_api_key'].split()

# Search engine ID from
# https://programmablesearchengine.google.com/controlpanel/overview
Custom_Search_Engine_ID = os.environ['Custom_Search_Engine_ID']
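
# Example environment setup (the key values below are placeholders):
#   export gg_api_key="AIza...key-one AIza...key-two"   # one or more keys, space-separated
#   export Custom_Search_Engine_ID="0123456789abcdef0"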


@wrapt_timeout_decorator.timeout(3)
def ggsearch(service, query, num_pages):
    """Run one Custom Search request and return its result items (or [])."""
    try:
        res = service.cse().list(q=query, cx=Custom_Search_Engine_ID, num=num_pages,  # start=i*num_pages
                                 gl='vn', googlehost='vn', hl='vi').execute()
        return res.get('items', [])
    except Exception:
        return []

@wrapt_timeout_decorator.timeout(5)
def getContent(url):
    """Fetch a page and return its readable text as newline-separated paragraphs."""
    paragraphs = []
    try:
        html = requests.get(url, timeout=5)
        tree = BeautifulSoup(html.text, 'html.parser')

        # Drop non-visible elements before extracting any text.
        for invisible_elem in tree.find_all(['script', 'style']):
            invisible_elem.extract()

        # Collect <p> contents first, then remove those tags so the
        # remaining text is not counted twice.
        paragraphs = [p.get_text() for p in tree.find_all('p')]
        for para in tree.find_all('p'):
            para.extract()

        # Unwrap inline tags so their text merges with the surrounding block.
        for href in tree.find_all(['a', 'strong']):
            href.unwrap()

        # Re-parse and pull the remaining text, one block per paragraph.
        tree = BeautifulSoup(str(tree.html), 'html.parser')
        text = tree.get_text(separator='\n\n')
        text = re.sub(r'\n +\n', '\n\n', text)
        paragraphs += text.split('\n\n')

        # Normalize whitespace and keep only paragraphs with real content.
        paragraphs = [' '.join(p.split()).strip() for p in paragraphs]
        paragraphs = [p for p in paragraphs if len(p.split()) > 10]
    except Exception:
        print('Cannot read ' + url, str(sys.exc_info()[0]))
        paragraphs = []
    return '\n\n'.join(paragraphs)


# Challenge text that some bot-protection pages return instead of content.
BLOCK_PAGE_MARKER = ('The security system for this website has been triggered. '
                     'Completing the challenge below verifies you are a human '
                     'and gives you access.')


class GoogleSearch:
    def __init__(self, num_pages=7):
        self.num_pages = num_pages
        self.pool = Pool(4)  # thread pool for fetching pages in parallel

    def search(self, question):
        # https://github.com/googleapis/google-api-python-client/blob/main/samples/customsearch/main.py
        # Pick a key at random so the quota is spread across all keys.
        service = build("customsearch", "v1",
                        developerKey=api_keys[numpy.random.randint(len(api_keys))])
        pages_content = ggsearch(service, question, self.num_pages)

        # Collect unique result URLs, skipping non-HTML hits (PDF, DOC, ...).
        document_urls = set()
        for page in pages_content:
            if 'fileFormat' in page:
                continue
            document_urls.add(page['link'])
        document_urls = list(document_urls)

        # Fetch all pages in parallel; drop near-empty and bot-challenge pages.
        gg_documents = [d for d in self.pool.map(getContent, document_urls)
                        if len(d) > 20 and BLOCK_PAGE_MARKER not in d]
        return document_urls, gg_documents
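

# Minimal usage sketch. Assumes gg_api_key and Custom_Search_Engine_ID are set
# in the environment; the query below is just an illustrative example.
if __name__ == '__main__':
    searcher = GoogleSearch(num_pages=7)
    urls, documents = searcher.search('thủ đô của Việt Nam')  # "capital of Vietnam"
    print('Fetched %d documents from %d result URLs' % (len(documents), len(urls)))
    for doc in documents:
        print(doc[:200], '\n---')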