#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
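
Usage:
    python crawl.py <URL>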
"""

import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from bs4 import BeautifulSoup

import requests
from crawl4ai import WebCrawler


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Remove invalid characters for Windows file names
    return re.sub(r'[<>:"/\\|?*]', '', filename)


def get_full_size_image_url(session, image_url, base_url):
    """
    Attempt to find the full-size image URL from a thumbnail URL.

    Args:
        session (requests.Session): The requests session to use.
        image_url (str): The thumbnail image URL.
        base_url (str): The base URL of the page being crawled.

    Returns:
        str: The full-size image URL if found, otherwise the original URL.
    """
    try:
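        # The thumbnail URL may point to an HTML viewer page rather than the raw
        # image, so fetch it and look for a link to the original file.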
        response = session.get(image_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Look for common full-size image patterns
        full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
        if full_size_link and full_size_link.get('href'):
            return urljoin(base_url, full_size_link['href'])
        
        # If no full-size link is found, return the original URL
        return image_url
    except Exception as e:
        print(f"Error finding full-size image for {image_url}: {str(e)}")
        return image_url


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.
    Attempt to get the full-size image if the URL is a thumbnail.

    Args:
        session (requests.Session): The requests session to use for downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    try:
        full_size_url = get_full_size_image_url(session, image_url, base_url)
        image_filename = os.path.basename(full_size_url).split('?')[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        if os.path.exists(image_path):
            print(f"Image already exists: {image_path}")
            return

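        # Stream the response and write it in chunks so large images are never
        # held entirely in memory.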
        response = session.get(full_size_url, stream=True)
        response.raise_for_status()
        with open(image_path, 'wb') as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved full-size image: {image_path}")
    except requests.RequestException as e:
        print(f"Error downloading image {full_size_url}: {str(e)}")
    except IOError as e:
        print(f"Error saving image {full_size_url}: {str(e)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
    """
    crawler = create_crawler()
    result = crawler.run(url=target_url)
    # Guard against a missing metadata dict (possible when the crawl returns no metadata).
    title = (result.metadata or {}).get('title', 'untitled')
    sanitized_title = sanitize_filename(title).replace(" ", "_")
    
    # Choose the appropriate base path based on the operating system
    if platform.system() == "Windows":
        base_path = "E:\\ragpile\\Saved Websites\\"
    else:
        base_path = "/home/kade/datasets/ragpile/Saved Websites/"
    
    save_dir = os.path.join(base_path, sanitized_title)
    os.makedirs(save_dir, exist_ok=True)

    # Save markdown
    save_path = os.path.join(save_dir, f"{sanitized_title}.md")
    #sanitized_markdown = sanitize_citations(result.markdown)
    with open(save_path, "w", encoding="utf-8") as file:
        file.write(result.markdown)
        #file.write(sanitized_markdown)
    print(f"Saved markdown to {save_path}")

    # Save images in parallel
    if 'images' in result.media and isinstance(result.media['images'], list):
        session = requests.Session()
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36',
            'Referer': target_url,
            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
                       'image/*,*/*;q=0.8'),
            'Accept-Language': 'en-US,en;q=0.9',
            'Sec-Fetch-Dest': 'image',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'cross-site',
        }
        session.headers.update(headers)

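        # Download images concurrently with a bounded pool of worker threads.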
        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for image_data in result.media['images']:
                if 'src' in image_data:
                    # Use urljoin to create absolute URLs for image sources
                    absolute_image_url = urljoin(target_url, image_data['src'])
                    futures.append(executor.submit(download_image,
                                                   session,
                                                   absolute_image_url,
                                                   save_dir,
                                                   target_url))  # Pass target_url as base_url

            for future in as_completed(futures):
                future.result()


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python crawl.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    save_result(url)