Shreyas094 committed on
Commit
f89b7c9
·
verified ·
1 Parent(s): ac3f4aa

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -22
app.py CHANGED
@@ -5,31 +5,39 @@ import random
5
  from bs4 import BeautifulSoup
6
  import trafilatura
7
 
 
 
 
 
 
 
 
 
 
 
 
8
  def extract_content_bs4(url):
9
  try:
10
- response = requests.get(url, timeout=10)
11
  soup = BeautifulSoup(response.content, 'html.parser')
12
-
13
- # This is a simple extraction and might need to be adjusted based on the structure of the websites you're scraping
14
  paragraphs = soup.find_all('p')
15
  content = ' '.join([p.text for p in paragraphs])
16
-
17
  return content[:1000] + "..." if len(content) > 1000 else content
18
  except Exception as e:
19
  return f"Error extracting content: {str(e)}"
20
 
21
  def extract_content_trafilatura(url):
22
  try:
23
- downloaded = trafilatura.fetch_url(url)
24
  content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
25
-
26
  return content[:1000] + "..." if content and len(content) > 1000 else content
27
  except Exception as e:
28
  return f"Error extracting content: {str(e)}"
29
 
30
- def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
 
31
  """
32
- Perform a search using the Searx API with error handling, retry logic, limited results, and content extraction.
33
  """
34
  search_endpoint = f"{instance_url}/search"
35
  params = {
@@ -37,14 +45,14 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
37
  'format': 'json',
38
  'categories': categories,
39
  'pageno': 1,
40
- 'time_range': '',
41
- 'engines': '',
42
- 'safesearch': '0',
43
  'results': str(num_results)
44
  }
45
 
46
  headers = {
47
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
48
  'Accept': 'application/json, text/javascript, */*; q=0.01',
49
  'Accept-Language': 'en-US,en;q=0.5',
50
  'Referer': instance_url,
@@ -67,7 +75,6 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
67
  title = result.get('title', 'No Title')
68
  url = result.get('url', 'No URL')
69
 
70
- # Extract content using the selected method
71
  if use_trafilatura:
72
  content = extract_content_trafilatura(url)
73
  else:
@@ -87,12 +94,12 @@ def search_searx(query, instance_url='https://searx.org', categories='general',
87
 
88
  def create_gradio_interface():
89
  """
90
- Creates and returns the Gradio interface.
91
  """
92
  with gr.Blocks() as demo:
93
- gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Content Extraction")
94
  gr.Markdown(
95
- "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine and extract content from the results."
96
  )
97
  with gr.Row():
98
  with gr.Column():
@@ -102,7 +109,7 @@ def create_gradio_interface():
102
  lines=1
103
  )
104
  instance_url = gr.Textbox(
105
- label="Searx Instance URL",
106
  value="https://searx.org",
107
  placeholder="https://searx.instance.url",
108
  lines=1
@@ -110,7 +117,7 @@ def create_gradio_interface():
110
  categories = gr.Textbox(
111
  label="Categories",
112
  value="general",
113
- placeholder="e.g., general, images, videos",
114
  lines=1
115
  )
116
  num_results = gr.Slider(
@@ -121,23 +128,42 @@ def create_gradio_interface():
121
  label="Number of Results"
122
  )
123
  use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  search_button = gr.Button("Search")
125
  with gr.Column():
126
  results = gr.Markdown("### Search Results will appear here...")
127
 
128
- def perform_search(q, url, cats, num, use_traf):
129
- return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)
 
130
 
131
  search_button.click(
132
  perform_search,
133
- inputs=[query, instance_url, categories, num_results, use_trafilatura],
134
  outputs=results
135
  )
136
 
137
  gr.Markdown(
138
  """
139
  ---
140
- **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
141
  It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
142
  """
143
  )
 
5
  from bs4 import BeautifulSoup
6
  import trafilatura
7
 
8
# Pool of realistic desktop/mobile browser User-Agent strings.  Requests pick
# one at random so repeated scrapes do not present a single fixed UA, which
# reduces the chance of being blocked by naive UA-based filtering.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1',
]


def get_random_user_agent():
    """Return one User-Agent string drawn uniformly at random from USER_AGENTS."""
    return random.choice(USER_AGENTS)
18
+
19
def extract_content_bs4(url):
    """Fetch *url* and extract readable text using BeautifulSoup.

    Joins the text of all <p> elements and truncates the result to 1000
    characters (appending "...").  Never raises: any failure — network error,
    HTTP error status, parse error — is returned as an
    "Error extracting content: ..." string so callers can render it inline.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': get_random_user_agent()},
            timeout=10,
        )
        # Without this, 403/404 error pages would be parsed and their
        # boilerplate returned as if it were real article content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join(p.text for p in paragraphs)
        return content[:1000] + "..." if len(content) > 1000 else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
28
 
29
def extract_content_trafilatura(url):
    """Fetch *url* and extract the main article text using trafilatura.

    Returns at most 1000 characters (with "..." when truncated), None when
    trafilatura finds no extractable content, or an
    "Error extracting content: ..." string on any failure.
    """
    try:
        # BUG FIX: trafilatura.fetch_url() accepts no ``headers`` kwarg, so the
        # previous call raised TypeError on every invocation.  Fetch the page
        # with requests (preserving the randomized User-Agent) and hand the
        # HTML string to trafilatura.extract(), which accepts raw markup.
        response = requests.get(
            url,
            headers={'User-Agent': get_random_user_agent()},
            timeout=10,
        )
        response.raise_for_status()
        content = trafilatura.extract(
            response.text,
            include_comments=False,
            include_tables=False,
        )
        return content[:1000] + "..." if content and len(content) > 1000 else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
36
 
37
+ def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
38
+ use_trafilatura=False, time_range='', language='en', safesearch=0):
39
  """
40
+ Perform a search using the SearXNG API with advanced options.
41
  """
42
  search_endpoint = f"{instance_url}/search"
43
  params = {
 
45
  'format': 'json',
46
  'categories': categories,
47
  'pageno': 1,
48
+ 'time_range': time_range,
49
+ 'language': language,
50
+ 'safesearch': safesearch,
51
  'results': str(num_results)
52
  }
53
 
54
  headers = {
55
+ 'User-Agent': get_random_user_agent(),
56
  'Accept': 'application/json, text/javascript, */*; q=0.01',
57
  'Accept-Language': 'en-US,en;q=0.5',
58
  'Referer': instance_url,
 
75
  title = result.get('title', 'No Title')
76
  url = result.get('url', 'No URL')
77
 
 
78
  if use_trafilatura:
79
  content = extract_content_trafilatura(url)
80
  else:
 
94
 
95
  def create_gradio_interface():
96
  """
97
+ Creates and returns the Gradio interface with advanced SearXNG options.
98
  """
99
  with gr.Blocks() as demo:
100
+ gr.Markdown("# 🕵️‍♂️ Advanced SearXNG Search with Content Extraction")
101
  gr.Markdown(
102
+ "This application allows you to perform private searches using SearXNG with advanced options and content extraction."
103
  )
104
  with gr.Row():
105
  with gr.Column():
 
109
  lines=1
110
  )
111
  instance_url = gr.Textbox(
112
+ label="SearXNG Instance URL",
113
  value="https://searx.org",
114
  placeholder="https://searx.instance.url",
115
  lines=1
 
117
  categories = gr.Textbox(
118
  label="Categories",
119
  value="general",
120
+ placeholder="e.g., general, news, science",
121
  lines=1
122
  )
123
  num_results = gr.Slider(
 
128
  label="Number of Results"
129
  )
130
  use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
131
+ time_range = gr.Dropdown(
132
+ choices=["", "day", "week", "month", "year"],
133
+ value="",
134
+ label="Time Range"
135
+ )
136
+ language = gr.Textbox(
137
+ label="Language",
138
+ value="en",
139
+ placeholder="e.g., en, fr, de",
140
+ lines=1
141
+ )
142
+ safesearch = gr.Slider(
143
+ minimum=0,
144
+ maximum=2,
145
+ value=0,
146
+ step=1,
147
+ label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
148
+ )
149
  search_button = gr.Button("Search")
150
  with gr.Column():
151
  results = gr.Markdown("### Search Results will appear here...")
152
 
153
+ def perform_search(q, url, cats, num, use_traf, t_range, lang, safe):
154
+ return search_searx(q, instance_url=url, categories=cats, num_results=int(num),
155
+ use_trafilatura=use_traf, time_range=t_range, language=lang, safesearch=int(safe))
156
 
157
  search_button.click(
158
  perform_search,
159
+ inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch],
160
  outputs=results
161
  )
162
 
163
  gr.Markdown(
164
  """
165
  ---
166
+ **Note:** This application uses SearXNG to fetch results from multiple sources while preserving your privacy.
167
  It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
168
  """
169
  )