#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results, including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
"""

import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
from crawl4ai import WebCrawler


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Remove characters that are invalid in Windows file names
    return re.sub(r'[<>:"/\\|?*]', '', filename)


def get_full_size_image_url(session, image_url, base_url):
    """
    Attempt to find the full-size image URL from a thumbnail URL.

    Args:
        session (requests.Session): The requests session to use.
        image_url (str): The thumbnail image URL.
        base_url (str): The base URL of the page being crawled.

    Returns:
        str: The full-size image URL if found, otherwise the original URL.
    """
    try:
        response = session.get(image_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for common full-size image link patterns
        full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
        if full_size_link and full_size_link.get('href'):
            return urljoin(base_url, full_size_link['href'])

        # If no full-size link is found, return the original URL
        return image_url
    except Exception as e:
        print(f"Error finding full-size image for {image_url}: {str(e)}")
        return image_url


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.

    Attempts to get the full-size image if the URL is a thumbnail.

    Args:
        session (requests.Session): The requests session to use for downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    try:
        full_size_url = get_full_size_image_url(session, image_url, base_url)
        image_filename = os.path.basename(full_size_url).split('?')[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        if os.path.exists(image_path):
            print(f"Image already exists: {image_path}")
            return

        response = session.get(full_size_url, stream=True)
        response.raise_for_status()

        with open(image_path, 'wb') as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved full-size image: {image_path}")
    except requests.RequestException as e:
        print(f"Error downloading image {full_size_url}: {str(e)}")
    except IOError as e:
        print(f"Error saving image {full_size_url}: {str(e)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
""" crawler = create_crawler() result = crawler.run(url=target_url) title = result.metadata.get('title', 'untitled') sanitized_title = sanitize_filename(title).replace(" ", "_") # Choose the appropriate base path based on the operating system if platform.system() == "Windows": base_path = "E:\\knowledgebase\\Saved Websites\\" else: base_path = "/home/kade/saved_websites/" save_dir = os.path.join(base_path, sanitized_title) os.makedirs(save_dir, exist_ok=True) # Save markdown save_path = os.path.join(save_dir, f"{sanitized_title}.md") #sanitized_markdown = sanitize_citations(result.markdown) with open(save_path, "w", encoding="utf-8") as file: file.write(result.markdown) #file.write(sanitized_markdown) print(f"Saved markdown to {save_path}") # Save images in parallel if 'images' in result.media and isinstance(result.media['images'], list): session = requests.Session() headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/91.0.4472.124 Safari/537.36', 'Referer': target_url, 'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,' 'image/*,*/*;q=0.8'), 'Accept-Language': 'en-US,en;q=0.9', 'Sec-Fetch-Dest': 'image', 'Sec-Fetch-Mode': 'no-cors', 'Sec-Fetch-Site': 'cross-site', } session.headers.update(headers) with ThreadPoolExecutor(max_workers=5) as executor: futures = [] for image_data in result.media['images']: if 'src' in image_data: # Use urljoin to create absolute URLs for image sources absolute_image_url = urljoin(target_url, image_data['src']) futures.append(executor.submit(download_image, session, absolute_image_url, save_dir, target_url)) # Pass target_url as base_url for future in as_completed(futures): future.result() if __name__ == "__main__": if len(sys.argv) != 2: print("Usage: python crawl.py ") else: url = sys.argv[1] save_result(url)