Revert "fix: Add validation for image URLs to prevent invalid downloads"
This reverts commit 852b09413f080ed6a8e23464df9576dae92f455c.
- .vscode/settings.json +0 -3
- crawl/.vscode/settings.json +0 -6
- crawl/crawl +32 -56
- crawl/crawl4ai.pyi +6 -21
.vscode/settings.json
DELETED
@@ -1,3 +0,0 @@
-{
-    "python.analysis.typeCheckingMode": "basic"
-}
crawl/.vscode/settings.json
DELETED
@@ -1,6 +0,0 @@
-{
-    "python.pythonPath": "C:\\Users\\kade\\miniconda3\\envs\\crawl\\python.exe",
-    "python.linting.pylintPath": "C:\\Users\\kade\\miniconda3\\envs\\crawl\\Scripts\\pylint.exe",
-    "python.linting.enabled": true,
-    "python.linting.pylintEnabled": true
-}
crawl/crawl
CHANGED
@@ -14,22 +14,9 @@ import os
 import re
 import platform
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import requests
-
-# Ensure the correct environment is activated
-if os.environ.get("CONDA_DEFAULT_ENV") != "crawl":
-    print(
-        "Error: The 'crawl' environment is not activated. "
-        "Please activate it using 'conda activate crawl'."
-    )
 
-try:
-    from crawl4ai import WebCrawler
-except ImportError:
-    print(
-        "Error: Unable to import 'WebCrawler' from 'crawl4ai'. "
-        "Please ensure the module is installed correctly."
-    )
+import requests
+from crawl4ai import WebCrawler
 
 
 def create_crawler():
@@ -55,7 +42,7 @@ def sanitize_filename(filename):
         str: The sanitized filename.
     """
     # Remove invalid characters for Windows file names
-    return re.sub(r'[<>:"/\\|?*]', "", filename)
+    return re.sub(r'[<>:"/\\|?*]', '', filename)
 
 
 def download_image(session, image_url, save_dir):
@@ -72,24 +59,20 @@ def download_image(session, image_url, save_dir):
     """
    try:
        # Ensure the URL has a scheme
-        if not re.match(r"^https?://", image_url):
-            image_url = "https://" + image_url.lstrip("/")
+        if not re.match(r'^https?://', image_url):
+            image_url = 'https://' + image_url.lstrip('/')
 
-
-        if ".." in image_url:
-            raise ValueError(f"Invalid URL: {image_url}")
-
-        image_filename = os.path.basename(image_url).split("?")[0]
+        image_filename = os.path.basename(image_url).split('?')[0]
         sanitized_image_filename = sanitize_filename(image_filename)
         image_path = os.path.join(save_dir, sanitized_image_filename)
 
         response = session.get(image_url, stream=True)
         response.raise_for_status()
-        with open(image_path, "wb") as image_file:
+        with open(image_path, 'wb') as image_file:
             for chunk in response.iter_content(chunk_size=8192):
                 image_file.write(chunk)
         print(f"Saved image: {image_path}")
-    except (requests.RequestException, ValueError) as e:
+    except requests.RequestException as e:
         print(f"Error downloading image {image_url}: {str(e)}")
     except IOError as e:
         print(f"Error saving image {image_url}: {str(e)}")
@@ -107,15 +90,15 @@ def save_result(target_url):
     """
     crawler = create_crawler()
     result = crawler.run(url=target_url)
-    title = result.metadata.get("title", "untitled")
+    title = result.metadata.get('title', 'untitled')
     sanitized_title = sanitize_filename(title).replace(" ", "_")
-
+
     # Choose the appropriate base path based on the operating system
     if platform.system() == "Windows":
         base_path = "E:\\knowledgebase\\Saved Websites\\"
     else:
         base_path = "/home/kade/saved_websites/"
-
+
     save_dir = os.path.join(base_path, sanitized_title)
     os.makedirs(save_dir, exist_ok=True)
 
@@ -126,45 +109,38 @@ def save_result(target_url):
     print(f"Saved markdown to {save_path}")
 
     # Save images in parallel
-    if "images" in result.media and isinstance(result.media["images"], list):
+    if 'images' in result.media and isinstance(result.media['images'], list):
         session = requests.Session()
         headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
-            "AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/91.0.4472.124 Safari/537.36",
-            "Referer": target_url,
-            "Accept": (
-                "image/avif,image/webp,image/apng,image/svg+xml,"
-                "image/*,*/*;q=0.8"
-            ),
-            "Accept-Language": "en-US,en;q=0.9",
-            "Sec-Fetch-Dest": "image",
-            "Sec-Fetch-Mode": "no-cors",
-            "Sec-Fetch-Site": "cross-site",
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
+            'AppleWebKit/537.36 (KHTML, like Gecko) '
+            'Chrome/91.0.4472.124 Safari/537.36',
+            'Referer': target_url,
+            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
+                       'image/*,*/*;q=0.8'),
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Sec-Fetch-Dest': 'image',
+            'Sec-Fetch-Mode': 'no-cors',
+            'Sec-Fetch-Site': 'cross-site',
         }
         session.headers.update(headers)
 
         with ThreadPoolExecutor(max_workers=5) as executor:
             futures = []
-            for image_data in result.media["images"]:
-                if "src" in image_data:
-                    futures.append(
-                        executor.submit(
-                            download_image,
-                            session,
-                            image_data["src"],
-                            save_dir
-                        )
-                    )
+            for image_data in result.media['images']:
+                if 'src' in image_data:
+                    futures.append(executor.submit(download_image,
+                                                   session,
+                                                   image_data['src'],
+                                                   save_dir))
 
             for future in as_completed(futures):
                 future.result()
 
 
 if __name__ == "__main__":
-    if len(sys.argv) …
-        print("Usage: python crawl.py <…
+    if len(sys.argv) != 2:
+        print("Usage: python crawl.py <URL>")
     else:
-        …
-        …
-        save_result(url)
+        url = sys.argv[1]
+        save_result(url)
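For orientation, the code restored by this revert follows a common fan-out pattern: one shared requests.Session (so every worker thread reuses the same connection pool and headers) feeding download_image jobs into a ThreadPoolExecutor. The sketch below is a minimal, self-contained rendering of that pattern, not the repository's code; the example URLs, the downloads directory, and the trimmed header set are placeholder assumptions.

import os
import re
from concurrent.futures import ThreadPoolExecutor, as_completed

import requests


def sanitize_filename(filename):
    # Strip characters that Windows forbids in file names.
    return re.sub(r'[<>:"/\\|?*]', '', filename)


def download_image(session, image_url, save_dir):
    # Default to https when a crawler hands back a scheme-less src.
    if not re.match(r'^https?://', image_url):
        image_url = 'https://' + image_url.lstrip('/')
    filename = sanitize_filename(os.path.basename(image_url).split('?')[0])
    response = session.get(image_url, stream=True)
    response.raise_for_status()
    with open(os.path.join(save_dir, filename), 'wb') as image_file:
        for chunk in response.iter_content(chunk_size=8192):
            image_file.write(chunk)


if __name__ == "__main__":
    # Placeholder inputs; the real script derives them from a crawl result.
    images = ['https://example.com/a.png', 'example.com/b.jpg']
    save_dir = 'downloads'
    os.makedirs(save_dir, exist_ok=True)
    session = requests.Session()
    session.headers.update({'Referer': 'https://example.com/'})
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = [executor.submit(download_image, session, url, save_dir)
                   for url in images]
        for future in as_completed(futures):
            future.result()  # re-raises any worker-thread error here

Calling future.result() is what surfaces exceptions from worker threads; without it, a failed download would pass silently.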
crawl/crawl4ai.pyi
CHANGED
@@ -14,6 +14,8 @@ Example:
     result = crawler.run("https://example.com")
 """
 
+from typing import Any
+
 
 class WebCrawler:
     """
@@ -41,7 +43,7 @@ class WebCrawler:
         all necessary resources and configurations are set up.
         """
 
-    def run(self, url: str) -> CrawlResult:
+    def run(self, url: str) -> Any:
         """
         Crawls the specified URL and returns the result.
 
@@ -49,25 +51,8 @@ class WebCrawler:
             url (str): The URL to crawl.
 
         Returns:
-            CrawlResult: The result of the crawling operation.
-        """
-
-
-class CrawlResult:
-    """
-    Represents the result of a crawling operation.
-    """
-    def __init__(self, metadata=None, markdown="", media=None):
-        self.metadata = metadata or {}
-        self.markdown = markdown
-        self.media = media or {}
-
-    def get_data(self):
-        """
-        Returns the data obtained from the crawl.
+            Any: The result of the crawling operation. The specific type
+                depends on the implementation and could be raw HTML,
+                parsed data, or any other relevant information.
         """
 
-    def get_status(self):
-        """
-        Returns the status of the crawl operation.
-        """
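To make the stub change concrete: with run typed as returning Any instead of the removed CrawlResult class, attribute access on the result is no longer statically checked. A hypothetical caller, assuming (as crawl/crawl does at runtime) that the result exposes metadata and markdown attributes:

from crawl4ai import WebCrawler

crawler = WebCrawler()
result = crawler.run("https://example.com")  # typed as Any by the stub
title = result.metadata.get('title', 'untitled')  # accepted unchecked
print(title, len(result.markdown))

The revert trades the stricter but possibly inaccurate CrawlResult stub for one that defers entirely to whatever the installed crawl4ai actually returns.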