Revert "fix: Enhance URL validation for image downloads - Improved the URL validation in the download_image function. - Used requests.utils.urlparse to parse the URL and check for a valid scheme and netloc. - Ensured the URL path does not contain invalid labels (e.g., '..'). - This prevents the script from attempting to download images with invalid URLs and provides clearer error messages."
Browse files- crawl/crawl +2 -3
crawl/crawl
CHANGED
@@ -76,11 +76,10 @@ def download_image(session, image_url, save_dir):
|
|
76 |
image_url = "https://" + image_url.lstrip("/")
|
77 |
|
78 |
# Check if the URL is valid
|
79 |
-
|
80 |
-
if not parsed_url.scheme or not parsed_url.netloc or ".." in parsed_url.path:
|
81 |
raise ValueError(f"Invalid URL: {image_url}")
|
82 |
|
83 |
-
image_filename = os.path.basename(
|
84 |
sanitized_image_filename = sanitize_filename(image_filename)
|
85 |
image_path = os.path.join(save_dir, sanitized_image_filename)
|
86 |
|
|
|
76 |
image_url = "https://" + image_url.lstrip("/")
|
77 |
|
78 |
# Check if the URL is valid
|
79 |
+
if ".." in image_url:
|
|
|
80 |
raise ValueError(f"Invalid URL: {image_url}")
|
81 |
|
82 |
+
image_filename = os.path.basename(image_url).split("?")[0]
|
83 |
sanitized_image_filename = sanitize_filename(image_filename)
|
84 |
image_path = os.path.join(save_dir, sanitized_image_filename)
|
85 |
|