Update app.py
app.py CHANGED
@@ -16,26 +16,27 @@ USER_AGENTS = [
 def get_random_user_agent():
     return random.choice(USER_AGENTS)
 
-def extract_content_bs4(url):
+def extract_content_bs4(url, max_chars):
     try:
         response = requests.get(url, headers={'User-Agent': get_random_user_agent()}, timeout=10)
         soup = BeautifulSoup(response.content, 'html.parser')
         paragraphs = soup.find_all('p')
         content = ' '.join([p.text for p in paragraphs])
-        return content[:
+        return content[:max_chars] + "..." if len(content) > max_chars else content
     except Exception as e:
         return f"Error extracting content: {str(e)}"
 
-def extract_content_trafilatura(url):
+def extract_content_trafilatura(url, max_chars):
     try:
         downloaded = trafilatura.fetch_url(url, headers={'User-Agent': get_random_user_agent()})
         content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
-        return content[:
+        return content[:max_chars] + "..." if content and len(content) > max_chars else content
     except Exception as e:
         return f"Error extracting content: {str(e)}"
 
 def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
-                 use_trafilatura=False, time_range='', language='en', safesearch=0):
+                 use_trafilatura=False, time_range='', language='en', safesearch=0, search_engines='all',
+                 sort_by='relevance', max_chars=1000):
     """
     Perform a search using the SearXNG API with advanced options.
     """
@@ -48,7 +49,9 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         'time_range': time_range,
         'language': language,
         'safesearch': safesearch,
-        'results': str(num_results)
+        'results': str(num_results),
+        'engines': ','.join(search_engines) if 'all' not in search_engines else 'all',
+        'sort': sort_by
     }
 
     headers = {
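The new `engines` value covers both the string default `'all'` and the list a multiselect dropdown returns (that control is added further down in this commit). A sketch of how the expression resolves, with illustrative selections:

```python
# Mirrors the 'engines' expression from the params dict above.
def resolve_engines(search_engines):
    return ','.join(search_engines) if 'all' not in search_engines else 'all'

print(resolve_engines(["google", "wikipedia"]))  # -> "google,wikipedia"
print(resolve_engines(["all", "bing"]))          # -> "all" ("all" wins over any selection)
print(resolve_engines("all"))                    # -> "all" (the string default also matches)
```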
@@ -76,9 +79,9 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
             url = result.get('url', 'No URL')
 
             if use_trafilatura:
-                content = extract_content_trafilatura(url)
+                content = extract_content_trafilatura(url, max_chars)
             else:
-                content = extract_content_bs4(url)
+                content = extract_content_bs4(url, max_chars)
 
             formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
 
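Each hit is rendered as a numbered bold title, a markdown link, and the extracted snippet. With dummy values, the template above produces:

```python
# Dummy values run through the per-result template above.
idx, title, url = 1, "Example Title", "https://example.com"
content = "Extracted text..."
print(f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n")
# **1. Example Title**
# [https://example.com](https://example.com)
# Extracted text...
```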
@@ -94,7 +97,7 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
 
 def create_gradio_interface():
     """
-    Creates and returns the Gradio interface with advanced SearXNG options.
+    Creates and returns the Gradio interface with advanced SearXNG options and new parameters.
     """
     with gr.Blocks() as demo:
         gr.Markdown("# 🕵️♂️ Advanced SearXNG Search with Content Extraction")
@@ -146,17 +149,40 @@ def create_gradio_interface():
                     step=1,
                     label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
                 )
+
+                # New parameters
+                search_engines = gr.Dropdown(
+                    choices=["all", "google", "bing", "duckduckgo", "wikipedia"],
+                    value="all",
+                    label="Search Engines",
+                    multiselect=True
+                )
+                sort_by = gr.Dropdown(
+                    choices=["relevance", "date"],
+                    value="relevance",
+                    label="Sort Results By"
+                )
+                max_chars = gr.Slider(
+                    minimum=100,
+                    maximum=10000,
+                    value=1000,
+                    step=100,
+                    label="Max Characters to Extract"
+                )
+
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
 
-        def perform_search(q, url, cats, num, use_traf, t_range, lang, safe):
+        def perform_search(q, url, cats, num, use_traf, t_range, lang, safe, engines, sort, chars):
             return search_searx(q, instance_url=url, categories=cats, num_results=int(num),
-                                use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe))
+                                use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe),
+                                search_engines=engines, sort_by=sort, max_chars=chars)
 
         search_button.click(
             perform_search,
-            inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch],
+            inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch,
+                    search_engines, sort_by, max_chars],
             outputs=results
         )
 
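The three new controls are simply appended to the handler signature and the `inputs` list in matching order. The diff does not show the bottom of app.py; a Gradio Space typically ends with an entry point along these lines (assumed, not part of the commit):

```python
# Assumed entry point; not shown in the diff above.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
```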