Shreyas094 committed on
Commit
da860a3
·
verified ·
1 Parent(s): f89b7c9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -12
app.py CHANGED
@@ -16,26 +16,27 @@ USER_AGENTS = [
16
  def get_random_user_agent():
17
  return random.choice(USER_AGENTS)
18
 
19
- def extract_content_bs4(url):
20
  try:
21
  response = requests.get(url, headers={'User-Agent': get_random_user_agent()}, timeout=10)
22
  soup = BeautifulSoup(response.content, 'html.parser')
23
  paragraphs = soup.find_all('p')
24
  content = ' '.join([p.text for p in paragraphs])
25
- return content[:1000] + "..." if len(content) > 1000 else content
26
  except Exception as e:
27
  return f"Error extracting content: {str(e)}"
28
 
29
- def extract_content_trafilatura(url):
30
  try:
31
  downloaded = trafilatura.fetch_url(url, headers={'User-Agent': get_random_user_agent()})
32
  content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
33
- return content[:1000] + "..." if content and len(content) > 1000 else content
34
  except Exception as e:
35
  return f"Error extracting content: {str(e)}"
36
 
37
  def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
38
- use_trafilatura=False, time_range='', language='en', safesearch=0):
 
39
  """
40
  Perform a search using the SearXNG API with advanced options.
41
  """
@@ -48,7 +49,9 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
48
  'time_range': time_range,
49
  'language': language,
50
  'safesearch': safesearch,
51
- 'results': str(num_results)
 
 
52
  }
53
 
54
  headers = {
@@ -76,9 +79,9 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
76
  url = result.get('url', 'No URL')
77
 
78
  if use_trafilatura:
79
- content = extract_content_trafilatura(url)
80
  else:
81
- content = extract_content_bs4(url)
82
 
83
  formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
84
 
@@ -94,7 +97,7 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
94
 
95
  def create_gradio_interface():
96
  """
97
- Creates and returns the Gradio interface with advanced SearXNG options.
98
  """
99
  with gr.Blocks() as demo:
100
  gr.Markdown("# 🕵️‍♂️ Advanced SearXNG Search with Content Extraction")
@@ -146,17 +149,40 @@ def create_gradio_interface():
146
  step=1,
147
  label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
148
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
  search_button = gr.Button("Search")
150
  with gr.Column():
151
  results = gr.Markdown("### Search Results will appear here...")
152
 
153
- def perform_search(q, url, cats, num, use_traf, t_range, lang, safe):
154
  return search_searx(q, instance_url=url, categories=cats, num_results=int(num),
155
- use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe))
 
156
 
157
  search_button.click(
158
  perform_search,
159
- inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch],
 
160
  outputs=results
161
  )
162
 
 
16
  def get_random_user_agent():
17
  return random.choice(USER_AGENTS)
18
 
19
def extract_content_bs4(url, max_chars=1000):
    """Download ``url`` and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of text to keep. Longer text
            is cut at this length and suffixed with "...". Defaults to 1000
            so that callers passing only ``url`` keep working.

    Returns:
        The paragraph texts joined by single spaces (possibly truncated), or
        a string starting with "Error extracting content: " when the
        download, the HTTP status check or the parsing fails.
    """
    try:
        # Slice bounds must be ints; a UI widget may deliver e.g. 1000.0.
        limit = int(max_chars)
        response = requests.get(
            url,
            headers={'User-Agent': get_random_user_agent()},
            timeout=10,
        )
        # Report 4xx/5xx responses as errors instead of scraping the error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        content = ' '.join(p.text for p in soup.find_all('p'))
        if len(content) > limit:
            return content[:limit] + "..."
        return content
    except Exception as e:
        # Best-effort extraction: the caller embeds this message in the results.
        return f"Error extracting content: {str(e)}"
28
 
29
def extract_content_trafilatura(url, max_chars=1000):
    """Download ``url`` and return its main text as extracted by trafilatura.

    Args:
        url: Address of the page to fetch.
        max_chars: Maximum number of characters of text to keep. Longer text
            is cut at this length and suffixed with "...". Defaults to 1000
            so that callers passing only ``url`` keep working.

    Returns:
        The extracted text (possibly truncated), "No content extracted." when
        trafilatura finds no text, or a string starting with
        "Error extracting content: " when the download or extraction fails.
    """
    try:
        # Slice bounds must be ints; a UI widget may deliver e.g. 1000.0.
        limit = int(max_chars)
        # trafilatura.fetch_url() takes no ``headers`` argument (passing one
        # raises TypeError), so the page is downloaded with requests to keep
        # the randomised User-Agent and then handed to trafilatura as HTML.
        response = requests.get(
            url,
            headers={'User-Agent': get_random_user_agent()},
            timeout=10,
        )
        # Report 4xx/5xx responses as errors instead of extracting the error page.
        response.raise_for_status()
        content = trafilatura.extract(
            response.text,
            include_comments=False,
            include_tables=False,
        )
        if not content:
            # extract() yields None when nothing usable is found; avoid
            # passing that on, where it would be rendered as "None".
            return "No content extracted."
        if len(content) > limit:
            return content[:limit] + "..."
        return content
    except Exception as e:
        # Best-effort extraction: the caller embeds this message in the results.
        return f"Error extracting content: {str(e)}"
36
 
37
  def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
38
+ use_trafilatura=False, time_range='', language='en', safesearch=0, search_engines='all',
39
+ sort_by='relevance', max_chars=1000):
40
  """
41
  Perform a search using the SearXNG API with advanced options.
42
  """
 
49
  'time_range': time_range,
50
  'language': language,
51
  'safesearch': safesearch,
52
+ 'results': str(num_results),
53
+ 'engines': ','.join(search_engines) if 'all' not in search_engines else 'all',
54
+ 'sort': sort_by
55
  }
56
 
57
  headers = {
 
79
  url = result.get('url', 'No URL')
80
 
81
  if use_trafilatura:
82
+ content = extract_content_trafilatura(url, max_chars)
83
  else:
84
+ content = extract_content_bs4(url, max_chars)
85
 
86
  formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
87
 
 
97
 
98
  def create_gradio_interface():
99
  """
100
+ Creates and returns the Gradio interface with advanced SearXNG options and new parameters.
101
  """
102
  with gr.Blocks() as demo:
103
  gr.Markdown("# 🕵️‍♂️ Advanced SearXNG Search with Content Extraction")
 
149
  step=1,
150
  label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
151
  )
152
+
153
+ # New parameters
154
+ search_engines = gr.Dropdown(
155
+ choices=["all", "google", "bing", "duckduckgo", "wikipedia"],
156
+ value="all",
157
+ label="Search Engines",
158
+ multiselect=True
159
+ )
160
+ sort_by = gr.Dropdown(
161
+ choices=["relevance", "date"],
162
+ value="relevance",
163
+ label="Sort Results By"
164
+ )
165
+ max_chars = gr.Slider(
166
+ minimum=100,
167
+ maximum=10000,
168
+ value=1000,
169
+ step=100,
170
+ label="Max Characters to Extract"
171
+ )
172
+
173
  search_button = gr.Button("Search")
174
  with gr.Column():
175
  results = gr.Markdown("### Search Results will appear here...")
176
 
177
+ def perform_search(q, url, cats, num, use_traf, t_range, lang, safe, engines, sort, chars):
178
  return search_searx(q, instance_url=url, categories=cats, num_results=int(num),
179
+ use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe),
180
+ search_engines=engines, sort_by=sort, max_chars=chars)
181
 
182
  search_button.click(
183
  perform_search,
184
+ inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch,
185
+ search_engines, sort_by, max_chars],
186
  outputs=results
187
  )
188