Update app.py
app.py CHANGED
@@ -5,31 +5,39 @@ import random
 from bs4 import BeautifulSoup
 import trafilatura
 
+USER_AGENTS = [
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
+    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
+    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
+]
+
+def get_random_user_agent():
+    return random.choice(USER_AGENTS)
+
 def extract_content_bs4(url):
     try:
-        response = requests.get(url, timeout=10)
+        response = requests.get(url, headers={'User-Agent': get_random_user_agent()}, timeout=10)
         soup = BeautifulSoup(response.content, 'html.parser')
-
-        # This is a simple extraction and might need to be adjusted based on the structure of the websites you're scraping
         paragraphs = soup.find_all('p')
         content = ' '.join([p.text for p in paragraphs])
-
         return content[:1000] + "..." if len(content) > 1000 else content
     except Exception as e:
         return f"Error extracting content: {str(e)}"
 
 def extract_content_trafilatura(url):
     try:
-        downloaded = trafilatura.fetch_url(url)
+        downloaded = trafilatura.fetch_url(url, headers={'User-Agent': get_random_user_agent()})
         content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
-
         return content[:1000] + "..." if content and len(content) > 1000 else content
     except Exception as e:
         return f"Error extracting content: {str(e)}"
 
-def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
+def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
+                 use_trafilatura=False, time_range='', language='en', safesearch=0):
     """
-    Perform a search using the
+    Perform a search using the SearXNG API with advanced options.
     """
     search_endpoint = f"{instance_url}/search"
     params = {
@@ -37,14 +45,14 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         'format': 'json',
         'categories': categories,
         'pageno': 1,
-        'time_range':
-        'language':
-        'safesearch':
+        'time_range': time_range,
+        'language': language,
+        'safesearch': safesearch,
         'results': str(num_results)
     }
 
     headers = {
-        'User-Agent':
+        'User-Agent': get_random_user_agent(),
         'Accept': 'application/json, text/javascript, */*; q=0.01',
         'Accept-Language': 'en-US,en;q=0.5',
         'Referer': instance_url,
@@ -67,7 +75,6 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
         title = result.get('title', 'No Title')
         url = result.get('url', 'No URL')
 
-        # Extract content using the selected method
         if use_trafilatura:
             content = extract_content_trafilatura(url)
         else:
@@ -87,12 +94,12 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
 
 def create_gradio_interface():
     """
-    Creates and returns the Gradio interface.
+    Creates and returns the Gradio interface with advanced SearXNG options.
     """
     with gr.Blocks() as demo:
-        gr.Markdown("# 🕵️‍♂️
+        gr.Markdown("# 🕵️‍♂️ Advanced SearXNG Search with Content Extraction")
         gr.Markdown(
-            "This application allows you to perform private searches using
+            "This application allows you to perform private searches using SearXNG with advanced options and content extraction."
         )
         with gr.Row():
             with gr.Column():
@@ -102,7 +109,7 @@ def create_gradio_interface():
                     lines=1
                 )
                 instance_url = gr.Textbox(
-                    label="
+                    label="SearXNG Instance URL",
                     value="https://searx.org",
                     placeholder="https://searx.instance.url",
                     lines=1
@@ -110,7 +117,7 @@ def create_gradio_interface():
                 categories = gr.Textbox(
                     label="Categories",
                     value="general",
-                    placeholder="e.g., general,
+                    placeholder="e.g., general, news, science",
                     lines=1
                 )
                 num_results = gr.Slider(
@@ -121,23 +128,42 @@ def create_gradio_interface():
                     label="Number of Results"
                 )
                 use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
+                time_range = gr.Dropdown(
+                    choices=["", "day", "week", "month", "year"],
+                    value="",
+                    label="Time Range"
+                )
+                language = gr.Textbox(
+                    label="Language",
+                    value="en",
+                    placeholder="e.g., en, fr, de",
+                    lines=1
+                )
+                safesearch = gr.Slider(
+                    minimum=0,
+                    maximum=2,
+                    value=0,
+                    step=1,
+                    label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
+                )
                 search_button = gr.Button("Search")
             with gr.Column():
                 results = gr.Markdown("### Search Results will appear here...")
 
-        def perform_search(q, url, cats, num, use_traf):
-            return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)
+        def perform_search(q, url, cats, num, use_traf, t_range, lang, safe):
+            return search_searx(q, instance_url=url, categories=cats, num_results=int(num),
+                                use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe))
 
         search_button.click(
             perform_search,
-            inputs=[query, instance_url, categories, num_results, use_trafilatura],
+            inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch],
             outputs=results
        )
 
         gr.Markdown(
             """
             ---
-            **Note:** This application uses
+            **Note:** This application uses SearXNG to fetch results from multiple sources while preserving your privacy.
             It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
             """
         )
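
A caveat on the `fetch_url` change above: in the trafilatura releases I'm aware of, `trafilatura.fetch_url()` does not accept a `headers` keyword (its user agent is normally configured through trafilatura's settings), so the call as committed may raise a `TypeError` at runtime. A minimal fallback sketch, reusing the `requests` import that `app.py` already has; the function name is hypothetical:

```python
import requests
import trafilatura

# Hypothetical fallback: fetch with requests (which does accept custom
# headers) and pass the raw HTML string to trafilatura.extract(), which
# works on markup directly rather than only on fetch_url() downloads.
def extract_content_trafilatura_fallback(url, user_agent):
    response = requests.get(url, headers={'User-Agent': user_agent}, timeout=10)
    content = trafilatura.extract(response.text, include_comments=False, include_tables=False)
    return content[:1000] + "..." if content and len(content) > 1000 else content
```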
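For reference, the new `time_range`, `language`, and `safesearch` parameters map directly onto the query string of SearXNG's `/search` endpoint. Below is a minimal sketch of the request `search_searx()` builds, independent of Gradio; the instance URL is a placeholder, and note that many public instances disable `format=json` (it has to be allowed in the instance's `settings.yml`), which is worth keeping in mind when the app errors out against arbitrary instances:

```python
import requests

# Placeholder instance; substitute one that serves JSON results.
SEARX_URL = 'https://searx.example.org'

params = {
    'q': 'privacy respecting search',
    'format': 'json',
    'categories': 'general',
    'pageno': 1,
    'time_range': 'month',  # '', 'day', 'week', 'month', or 'year'
    'language': 'en',
    'safesearch': 1,        # 0: off, 1: moderate, 2: strict
}
response = requests.get(f"{SEARX_URL}/search", params=params, timeout=10)
for result in response.json().get('results', [])[:5]:
    print(result.get('title', 'No Title'), '->', result.get('url', 'No URL'))
```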
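The diff does not show how `create_gradio_interface()` is launched; assuming the end of `app.py` follows the usual Gradio pattern, it would look like this:

```python
# Assumed entry point; not shown in this diff.
if __name__ == "__main__":
    demo = create_gradio_interface()
    demo.launch()
```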