k4d3 committed on
Commit
0cb8585
1 Parent(s): e4302bc

Revert "fix: Enhance URL validation for image downloads - Improved the URL validation in the download_image function. - Used requests.utils.urlparse to parse the URL and check for a valid scheme and netloc. - Ensured the URL path does not contain invalid labels (e.g., ..). - This prevents the script from attempting to download images with invalid URLs and provides clearer error messages."

Browse files
Files changed (1) hide show
  1. crawl/crawl +2 -3
crawl/crawl CHANGED
@@ -76,11 +76,10 @@ def download_image(session, image_url, save_dir):
76
  image_url = "https://" + image_url.lstrip("/")
77
 
78
  # Check if the URL is valid
79
- parsed_url = requests.utils.urlparse(image_url)
80
- if not parsed_url.scheme or not parsed_url.netloc or ".." in parsed_url.path:
81
  raise ValueError(f"Invalid URL: {image_url}")
82
 
83
- image_filename = os.path.basename(parsed_url.path).split("?")[0]
84
  sanitized_image_filename = sanitize_filename(image_filename)
85
  image_path = os.path.join(save_dir, sanitized_image_filename)
86
 
 
76
  image_url = "https://" + image_url.lstrip("/")
77
 
78
  # Check if the URL is valid
79
+ if ".." in image_url:
 
80
  raise ValueError(f"Invalid URL: {image_url}")
81
 
82
+ image_filename = os.path.basename(image_url).split("?")[0]
83
  sanitized_image_filename = sanitize_filename(image_filename)
84
  image_path = os.path.join(save_dir, sanitized_image_filename)
85