"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results, including markdown text and images. It uses the
WebCrawler class from crawl4ai and downloads images in parallel.
"""

import os
import platform
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
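
# Third-party dependencies (assumed install names):
#   pip install requests beautifulsoup4 crawl4ai
# This module assumes a crawl4ai release that still ships the synchronous
# WebCrawler API; newer releases may expose a different entry point.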


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler
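
# Illustrative usage (a sketch, assuming the synchronous API above):
#   crawler = create_crawler()
#   result = crawler.run(url="https://example.com")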


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    return re.sub(r'[<>:"/\\|?*]', '', filename)
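
# Illustrative example (hypothetical input):
#   sanitize_filename('report: "final" <v2>?.md')  ->  'report final v2.md'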


def get_full_size_image_url(session, image_url, base_url):
    """
    Attempt to find the full-size image URL from a thumbnail URL.

    Args:
        session (requests.Session): The requests session to use.
        image_url (str): The thumbnail image URL.
        base_url (str): The base URL of the page being crawled.

    Returns:
        str: The full-size image URL if found, otherwise the original URL.
    """
    try:
        # A timeout keeps a stalled server from hanging the worker thread.
        response = session.get(image_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for an anchor whose class suggests it links to the
        # original image.
        full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
        if full_size_link and full_size_link.get('href'):
            return urljoin(base_url, full_size_link['href'])

        return image_url
    except Exception as e:
        print(f"Error finding full-size image for {image_url}: {str(e)}")
        return image_url
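
# The helper above targets gallery-style markup such as the following
# (class names vary by site, so the regex match is best-effort):
#   <a class="fullimage" href="/images/photo_full.jpg"><img src="thumb.jpg"></a>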


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.
    Attempt to get the full-size image if the URL is a thumbnail.

    Args:
        session (requests.Session): The requests session to use for downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    # Default to the original URL so the except blocks below can always
    # reference full_size_url, even if resolution fails early.
    full_size_url = image_url
    try:
        full_size_url = get_full_size_image_url(session, image_url, base_url)
        image_filename = os.path.basename(full_size_url).split('?')[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        if os.path.exists(image_path):
            print(f"Image already exists: {image_path}")
            return

        # Stream the response so large images are written in chunks
        # instead of being buffered entirely in memory.
        response = session.get(full_size_url, stream=True, timeout=30)
        response.raise_for_status()
        with open(image_path, 'wb') as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved full-size image: {image_path}")
    except requests.RequestException as e:
        print(f"Error downloading image {full_size_url}: {str(e)}")
    except IOError as e:
        print(f"Error saving image {full_size_url}: {str(e)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
    """
    crawler = create_crawler()
    result = crawler.run(url=target_url)
    title = result.metadata.get('title', 'untitled')
    sanitized_title = sanitize_filename(title).replace(" ", "_")

    if platform.system() == "Windows":
        base_path = "E:\\ragpile\\Saved Websites\\"
    else:
        base_path = "/home/kade/datasets/ragpile/Saved Websites/"

    save_dir = os.path.join(base_path, sanitized_title)
    os.makedirs(save_dir, exist_ok=True)

    save_path = os.path.join(save_dir, f"{sanitized_title}.md")
    with open(save_path, "w", encoding="utf-8") as file:
        file.write(result.markdown)
    print(f"Saved markdown to {save_path}")

    if 'images' in result.media and isinstance(result.media['images'], list):
        session = requests.Session()
        # Browser-like headers: some hosts reject hotlinked image requests
        # that lack a plausible User-Agent and Referer.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36',
            'Referer': target_url,
            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
                       'image/*,*/*;q=0.8'),
            'Accept-Language': 'en-US,en;q=0.9',
            'Sec-Fetch-Dest': 'image',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'cross-site',
        }
        session.headers.update(headers)

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for image_data in result.media['images']:
                if 'src' in image_data:
                    # Resolve relative image paths against the page URL.
                    absolute_image_url = urljoin(target_url, image_data['src'])
                    futures.append(executor.submit(download_image,
                                                   session,
                                                   absolute_image_url,
                                                   save_dir,
                                                   target_url))

            # Block until every download finishes; result() re-raises any
            # unexpected exception from a worker thread.
            for future in as_completed(futures):
                future.result()
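
# Resulting on-disk layout (illustrative):
#   <base_path>/<sanitized_title>/
#       <sanitized_title>.md    # page content as markdown
#       <image files>           # full-size images, downloaded in parallel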


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python crawl.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    save_result(url)
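
# Example invocation (hypothetical URL):
#   python crawl.py https://example.com/article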