#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Web Crawler and Content Saver
This module provides functionality to crawl web pages, extract content,
and save the results including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
"""
import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import requests
from crawl4ai import WebCrawler
def create_crawler():
"""
Create and initialize a WebCrawler instance.
Returns:
WebCrawler: An initialized WebCrawler object.
"""
crawler = WebCrawler(verbose=True)
crawler.warmup()
return crawler
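
# Minimal usage sketch for the crawler created above (mirrors save_result()
# further down; the URL is illustrative and assumes crawl4ai's synchronous
# WebCrawler API as imported here):
#
#     crawler = create_crawler()
#     result = crawler.run(url="https://en.wikipedia.org/wiki/Web_crawler")
#     print(result.markdown[:200])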
def sanitize_filename(filename):
"""
Remove invalid characters from a filename to make it Windows-compatible.
Args:
filename (str): The original filename.
Returns:
str: The sanitized filename.
"""
# Remove invalid characters for Windows file names
return re.sub(r'[<>:"/\\|?*]', '', filename)
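
# Illustrative outputs of sanitize_filename() (reserved characters are
# dropped, not replaced with a substitute):
#
#     sanitize_filename('Results: A/B Test?')  -> 'Results AB Test'
#     sanitize_filename('diagram<v2>.png')     -> 'diagramv2.png'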
def get_full_size_image_url(session, image_url, base_url):
"""
Attempt to find the full-size image URL from a thumbnail URL.
Args:
session (requests.Session): The requests session to use.
image_url (str): The thumbnail image URL.
base_url (str): The base URL of the page being crawled.
Returns:
str: The full-size image URL if found, otherwise the original URL.
"""
try:
response = session.get(image_url)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
# Look for common full-size image patterns
full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
if full_size_link and full_size_link.get('href'):
return urljoin(base_url, full_size_link['href'])
# If no full-size link is found, return the original URL
return image_url
except Exception as e:
print(f"Error finding full-size image for {image_url}: {str(e)}")
return image_url
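
# Note: get_full_size_image_url() is a best-effort heuristic and always costs
# one extra HTTP request. It only helps when the given URL serves an HTML page
# that links to the original file; for a direct image URL no matching anchor
# is found and the URL is returned unchanged.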
def download_image(session, image_url, save_dir, base_url):
"""
Download an image from a given URL and save it to the specified directory.
Attempt to get the full-size image if the URL is a thumbnail.
Args:
session (requests.Session): The requests session to use for downloading.
image_url (str): The URL of the image to download.
save_dir (str): The directory to save the downloaded image.
base_url (str): The base URL of the page being crawled.
"""
try:
full_size_url = get_full_size_image_url(session, image_url, base_url)
image_filename = os.path.basename(full_size_url).split('?')[0]
sanitized_image_filename = sanitize_filename(image_filename)
image_path = os.path.join(save_dir, sanitized_image_filename)
if os.path.exists(image_path):
print(f"Image already exists: {image_path}")
return
response = session.get(full_size_url, stream=True)
response.raise_for_status()
with open(image_path, 'wb') as image_file:
for chunk in response.iter_content(chunk_size=8192):
image_file.write(chunk)
print(f"Saved full-size image: {image_path}")
except requests.RequestException as e:
print(f"Error downloading image {full_size_url}: {str(e)}")
except IOError as e:
print(f"Error saving image {full_size_url}: {str(e)}")
def save_result(target_url):
"""
Crawl a given URL, extract content, and save the results.
This function crawls the specified URL, saves the markdown content,
and downloads all associated images in parallel.
Args:
target_url (str): The URL to crawl and save content from.
"""
crawler = create_crawler()
result = crawler.run(url=target_url)
title = result.metadata.get('title', 'untitled')
sanitized_title = sanitize_filename(title).replace(" ", "_")
# Choose the appropriate base path based on the operating system
if platform.system() == "Windows":
base_path = "E:\\ragpile\\Saved Websites\\"
else:
base_path = "/home/kade/datasets/ragpile/Saved Websites/"
save_dir = os.path.join(base_path, sanitized_title)
os.makedirs(save_dir, exist_ok=True)
# Save markdown
save_path = os.path.join(save_dir, f"{sanitized_title}.md")
#sanitized_markdown = sanitize_citations(result.markdown)
with open(save_path, "w", encoding="utf-8") as file:
file.write(result.markdown)
#file.write(sanitized_markdown)
print(f"Saved markdown to {save_path}")
# Save images in parallel
if 'images' in result.media and isinstance(result.media['images'], list):
session = requests.Session()
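        # Browser-like request headers: some image hosts reject requests that
        # lack a Referer or a browser User-Agent, so the session mimics an
        # in-browser image fetch.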
headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/91.0.4472.124 Safari/537.36',
'Referer': target_url,
'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
'image/*,*/*;q=0.8'),
'Accept-Language': 'en-US,en;q=0.9',
'Sec-Fetch-Dest': 'image',
'Sec-Fetch-Mode': 'no-cors',
'Sec-Fetch-Site': 'cross-site',
}
session.headers.update(headers)
with ThreadPoolExecutor(max_workers=5) as executor:
futures = []
for image_data in result.media['images']:
if 'src' in image_data:
# Use urljoin to create absolute URLs for image sources
absolute_image_url = urljoin(target_url, image_data['src'])
futures.append(executor.submit(download_image,
session,
absolute_image_url,
save_dir,
target_url)) # Pass target_url as base_url
for future in as_completed(futures):
future.result()
if __name__ == "__main__":
if len(sys.argv) != 2:
print("Usage: python crawl.py <URL>")
else:
url = sys.argv[1]
save_result(url)