#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
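
Usage:
    python crawl.py <URL>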
"""

import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from bs4 import BeautifulSoup

import requests
from crawl4ai import WebCrawler


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Remove invalid characters for Windows file names
    return re.sub(r'[<>:"/\\|?*]', '', filename)


def get_full_size_image_url(session, image_url, base_url):
    """
    Attempt to find the full-size image URL from a thumbnail URL.

    Args:
        session (requests.Session): The requests session to use.
        image_url (str): The thumbnail image URL.
        base_url (str): The base URL of the page being crawled.

    Returns:
        str: The full-size image URL if found, otherwise the original URL.
    """
    try:
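        # The thumbnail URL may point to an HTML viewer page rather than the raw
        # image, so fetch it and look for a link to the original file.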
        response = session.get(image_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Look for common full-size image patterns
        full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
        if full_size_link and full_size_link.get('href'):
            return urljoin(base_url, full_size_link['href'])
        
        # If no full-size link is found, return the original URL
        return image_url
    except Exception as e:
        print(f"Error finding full-size image for {image_url}: {str(e)}")
        return image_url


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.
    Attempt to get the full-size image if the URL is a thumbnail.

    Args:
        session (requests.Session): The requests session to use for downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    try:
        full_size_url = get_full_size_image_url(session, image_url, base_url)
        image_filename = os.path.basename(full_size_url).split('?')[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        if os.path.exists(image_path):
            print(f"Image already exists: {image_path}")
            return

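        # Stream the response and write it in chunks so large images are never
        # held entirely in memory.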
        response = session.get(full_size_url, stream=True)
        response.raise_for_status()
        with open(image_path, 'wb') as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved full-size image: {image_path}")
    except requests.RequestException as e:
        print(f"Error downloading image {full_size_url}: {str(e)}")
    except IOError as e:
        print(f"Error saving image {full_size_url}: {str(e)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
    """
    crawler = create_crawler()
    result = crawler.run(url=target_url)
    # Guard against a missing metadata dict (possible when the crawl returns no metadata).
    title = (result.metadata or {}).get('title', 'untitled')
    sanitized_title = sanitize_filename(title).replace(" ", "_")
    
    # Choose the appropriate base path based on the operating system
    if platform.system() == "Windows":
        base_path = "E:\\ragpile\\Saved Websites\\"
    else:
        base_path = "/home/kade/datasets/ragpile/Saved Websites/"
    
    save_dir = os.path.join(base_path, sanitized_title)
    os.makedirs(save_dir, exist_ok=True)

    # Save markdown
    save_path = os.path.join(save_dir, f"{sanitized_title}.md")
    #sanitized_markdown = sanitize_citations(result.markdown)
    with open(save_path, "w", encoding="utf-8") as file:
        file.write(result.markdown)
        #file.write(sanitized_markdown)
    print(f"Saved markdown to {save_path}")

    # Save images in parallel
    if 'images' in result.media and isinstance(result.media['images'], list):
        session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36',
            'Referer': target_url,
            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
                       'image/*,*/*;q=0.8'),
            'Accept-Language': 'en-US,en;q=0.9',
            'Sec-Fetch-Dest': 'image',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'cross-site',
        }
        session.headers.update(headers)

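        # Download images concurrently with a bounded pool of worker threads.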
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for image_data in result.media['images']:
                if 'src' in image_data:
                    # Use urljoin to create absolute URLs for image sources
                    absolute_image_url = urljoin(target_url, image_data['src'])
                    futures.append(executor.submit(download_image,
                                                   session,
                                                   absolute_image_url,
                                                   save_dir,
                                                   target_url))  # Pass target_url as base_url

            for future in as_completed(futures):
                future.result()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python crawl.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    save_result(url)