k4d3 committed on
Commit 74e179d
1 Parent(s): 0cb8585

Revert "fix: Add validation for image URLs to prevent invalid downloads"


This reverts commit 852b09413f080ed6a8e23464df9576dae92f455c.
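For context, the validation being reverted (visible in the removed lines of crawl/crawl below) amounted to normalizing the URL scheme and rejecting path-traversal-style URLs before any download. A minimal sketch of that check, wrapped in a hypothetical helper name for illustration only:

    import re

    def validate_image_url(image_url):
        # Sketch of the reverted behavior: force an http(s) scheme,
        # then reject URLs containing ".." before attempting a download.
        if not re.match(r"^https?://", image_url):
            image_url = "https://" + image_url.lstrip("/")
        if ".." in image_url:
            raise ValueError(f"Invalid URL: {image_url}")
        return image_url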

.vscode/settings.json DELETED
@@ -1,3 +0,0 @@
-{
-    "python.analysis.typeCheckingMode": "basic"
-}

crawl/.vscode/settings.json DELETED
@@ -1,6 +0,0 @@
-{
-    "python.pythonPath": "C:\\Users\\kade\\miniconda3\\envs\\crawl\\python.exe",
-    "python.linting.pylintPath": "C:\\Users\\kade\\miniconda3\\envs\\crawl\\Scripts\\pylint.exe",
-    "python.linting.enabled": true,
-    "python.linting.pylintEnabled": true
-}

crawl/crawl CHANGED
@@ -14,22 +14,9 @@ import os
 import re
 import platform
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import requests
-
-# Ensure the correct environment is activated
-if os.environ.get("CONDA_DEFAULT_ENV") != "crawl":
-    print(
-        "Error: The 'crawl' environment is not activated. "
-        "Please activate it using 'conda activate crawl'."
-    )
 
-try:
-    from crawl4ai import WebCrawler  # type: ignore
-except ImportError:
-    print(
-        "Error: Unable to import 'WebCrawler' from 'crawl4ai'. "
-        "Please ensure the module is installed correctly."
-    )
+import requests
+from crawl4ai import WebCrawler
 
 
 def create_crawler():
@@ -55,7 +42,7 @@ def sanitize_filename(filename):
         str: The sanitized filename.
     """
     # Remove invalid characters for Windows file names
-    return re.sub(r'[<>:"/\\|?*]', "", filename)
+    return re.sub(r'[<>:"/\\|?*]', '', filename)
 
 
 def download_image(session, image_url, save_dir):
@@ -72,24 +59,20 @@ def download_image(session, image_url, save_dir):
     """
     try:
         # Ensure the URL has a scheme
-        if not re.match(r"^https?://", image_url):
-            image_url = "https://" + image_url.lstrip("/")
+        if not re.match(r'^https?://', image_url):
+            image_url = 'https://' + image_url.lstrip('/')
 
-        # Check if the URL is valid
-        if ".." in image_url:
-            raise ValueError(f"Invalid URL: {image_url}")
-
-        image_filename = os.path.basename(image_url).split("?")[0]
+        image_filename = os.path.basename(image_url).split('?')[0]
         sanitized_image_filename = sanitize_filename(image_filename)
         image_path = os.path.join(save_dir, sanitized_image_filename)
 
         response = session.get(image_url, stream=True)
         response.raise_for_status()
-        with open(image_path, "wb") as image_file:
+        with open(image_path, 'wb') as image_file:
             for chunk in response.iter_content(chunk_size=8192):
                 image_file.write(chunk)
         print(f"Saved image: {image_path}")
-    except (requests.RequestException, ValueError) as e:
+    except requests.RequestException as e:
         print(f"Error downloading image {image_url}: {str(e)}")
     except IOError as e:
         print(f"Error saving image {image_url}: {str(e)}")
@@ -107,15 +90,15 @@ def save_result(target_url):
     """
     crawler = create_crawler()
     result = crawler.run(url=target_url)
-    title = result.metadata.get("title", "untitled")
+    title = result.metadata.get('title', 'untitled')
     sanitized_title = sanitize_filename(title).replace(" ", "_")
-
+
     # Choose the appropriate base path based on the operating system
     if platform.system() == "Windows":
         base_path = "E:\\knowledgebase\\Saved Websites\\"
     else:
         base_path = "/home/kade/saved_websites/"
-
+
     save_dir = os.path.join(base_path, sanitized_title)
     os.makedirs(save_dir, exist_ok=True)
 
@@ -126,45 +109,38 @@ def save_result(target_url):
     print(f"Saved markdown to {save_path}")
 
     # Save images in parallel
-    if "images" in result.media and isinstance(result.media["images"], list):
+    if 'images' in result.media and isinstance(result.media['images'], list):
         session = requests.Session()
         headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/91.0.4472.124 Safari/537.36",
-            "Referer": target_url,
-            "Accept": (
-                "image/avif,image/webp,image/apng,image/svg+xml,"
-                "image/*,*/*;q=0.8"
-            ),
-            "Accept-Language": "en-US,en;q=0.9",
-            "Sec-Fetch-Dest": "image",
-            "Sec-Fetch-Mode": "no-cors",
-            "Sec-Fetch-Site": "cross-site",
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
+                          'AppleWebKit/537.36 (KHTML, like Gecko) '
+                          'Chrome/91.0.4472.124 Safari/537.36',
+            'Referer': target_url,
+            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
+                       'image/*,*/*;q=0.8'),
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Sec-Fetch-Dest': 'image',
+            'Sec-Fetch-Mode': 'no-cors',
+            'Sec-Fetch-Site': 'cross-site',
         }
         session.headers.update(headers)
 
         with ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
-            for image_data in result.media["images"]:
-                if "src" in image_data:
-                    futures.append(
-                        executor.submit(
-                            download_image,
-                            session,
-                            image_data["src"],
-                            save_dir
-                        )
-                    )
+            for image_data in result.media['images']:
+                if 'src' in image_data:
+                    futures.append(executor.submit(download_image,
+                                                   session,
+                                                   image_data['src'],
+                                                   save_dir))
 
             for future in as_completed(futures):
                 future.result()
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python crawl.py <URL1> <URL2> ... <URLn>")
+    if len(sys.argv) != 2:
+        print("Usage: python crawl.py <URL>")
     else:
-        urls = sys.argv[1:]
-        for url in urls:
-            save_result(url)
+        url = sys.argv[1]
+        save_result(url)

crawl/crawl4ai.pyi CHANGED
@@ -14,6 +14,8 @@ Example:
     result = crawler.run("https://example.com")
 """
 
+from typing import Any
+
 
 class WebCrawler:
     """
@@ -41,7 +43,7 @@ class WebCrawler:
         all necessary resources and configurations are set up.
         """
 
-    def run(self, url: str) -> "CrawlResult":
+    def run(self, url: str) -> Any:
        """
        Crawls the specified URL and returns the result.
 
@@ -49,25 +51,8 @@
             url (str): The URL to crawl.
 
         Returns:
-            CrawlResult: The result of the crawling operation.
-        """
-
-
-class CrawlResult:
-    """
-    Represents the result of a crawling operation.
-    """
-    def __init__(self, metadata=None, markdown="", media=None):
-        self.metadata = metadata or {}
-        self.markdown = markdown
-        self.media = media or {}
-
-    def get_data(self):
-        """
-        Returns the data obtained from the crawl.
+            Any: The result of the crawling operation. The specific type
+                depends on the implementation and could be raw HTML,
+                parsed data, or any other relevant information.
         """
 
-    def get_status(self):
-        """
-        Returns the status of the crawl operation.
-        """