"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results, including markdown text and images. It uses the
WebCrawler class from crawl4ai and downloads images in parallel.
"""

import os
import platform
import re
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from crawl4ai import WebCrawler
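
# Third-party dependencies (assumed install names):
#   pip install requests beautifulsoup4 crawl4ai
# This module assumes a crawl4ai release that still ships the synchronous
# WebCrawler API; newer releases may expose a different entry point.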


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler
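
# Illustrative usage (a sketch, assuming the synchronous API above):
#   crawler = create_crawler()
#   result = crawler.run(url="https://example.com")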


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    return re.sub(r'[<>:"/\\|?*]', '', filename)
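
# Illustrative example (hypothetical input):
#   sanitize_filename('report: "final" <v2>?.md')  ->  'report final v2.md'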


def get_full_size_image_url(session, image_url, base_url):
    """
    Attempt to find the full-size image URL from a thumbnail URL.

    Args:
        session (requests.Session): The requests session to use.
        image_url (str): The thumbnail image URL.
        base_url (str): The base URL of the page being crawled.

    Returns:
        str: The full-size image URL if found, otherwise the original URL.
    """
    try:
        # A timeout keeps a stalled server from hanging the worker thread.
        response = session.get(image_url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Look for an anchor whose class suggests it links to the
        # original image.
        full_size_link = soup.find('a', class_=re.compile(r'fullimage|full-size'))
        if full_size_link and full_size_link.get('href'):
            return urljoin(base_url, full_size_link['href'])

        return image_url
    except Exception as e:
        print(f"Error finding full-size image for {image_url}: {str(e)}")
        return image_url
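
# The helper above targets gallery-style markup such as the following
# (class names vary by site, so the regex match is best-effort):
#   <a class="fullimage" href="/images/photo_full.jpg"><img src="thumb.jpg"></a>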


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.
    Attempt to get the full-size image if the URL is a thumbnail.

    Args:
        session (requests.Session): The requests session to use for downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    # Default to the original URL so the except blocks below can always
    # reference full_size_url, even if resolution fails early.
    full_size_url = image_url
    try:
        full_size_url = get_full_size_image_url(session, image_url, base_url)
        image_filename = os.path.basename(full_size_url).split('?')[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        if os.path.exists(image_path):
            print(f"Image already exists: {image_path}")
            return

        # Stream the response so large images are written in chunks
        # instead of being buffered entirely in memory.
        response = session.get(full_size_url, stream=True, timeout=30)
        response.raise_for_status()
        with open(image_path, 'wb') as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved full-size image: {image_path}")
    except requests.RequestException as e:
        print(f"Error downloading image {full_size_url}: {str(e)}")
    except IOError as e:
        print(f"Error saving image {full_size_url}: {str(e)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
    """
    crawler = create_crawler()
    result = crawler.run(url=target_url)
    title = result.metadata.get('title', 'untitled')
    sanitized_title = sanitize_filename(title).replace(" ", "_")

    if platform.system() == "Windows":
        base_path = "E:\\ragpile\\Saved Websites\\"
    else:
        base_path = "/home/kade/datasets/ragpile/Saved Websites/"

    save_dir = os.path.join(base_path, sanitized_title)
    os.makedirs(save_dir, exist_ok=True)

    save_path = os.path.join(save_dir, f"{sanitized_title}.md")
    with open(save_path, "w", encoding="utf-8") as file:
        file.write(result.markdown)
    print(f"Saved markdown to {save_path}")

    if 'images' in result.media and isinstance(result.media['images'], list):
        session = requests.Session()
        # Browser-like headers: some hosts reject hotlinked image requests
        # that lack a plausible User-Agent and Referer.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/91.0.4472.124 Safari/537.36',
            'Referer': target_url,
            'Accept': ('image/avif,image/webp,image/apng,image/svg+xml,'
                       'image/*,*/*;q=0.8'),
            'Accept-Language': 'en-US,en;q=0.9',
            'Sec-Fetch-Dest': 'image',
            'Sec-Fetch-Mode': 'no-cors',
            'Sec-Fetch-Site': 'cross-site',
        }
        session.headers.update(headers)

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = []
            for image_data in result.media['images']:
                if 'src' in image_data:
                    # Resolve relative image paths against the page URL.
                    absolute_image_url = urljoin(target_url, image_data['src'])
                    futures.append(executor.submit(download_image,
                                                   session,
                                                   absolute_image_url,
                                                   save_dir,
                                                   target_url))

            # Block until every download finishes; result() re-raises any
            # unexpected exception from a worker thread.
            for future in as_completed(futures):
                future.result()
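
# Resulting on-disk layout (illustrative):
#   <base_path>/<sanitized_title>/
#       <sanitized_title>.md    # page content as markdown
#       <image files>           # full-size images, downloaded in parallel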


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python crawl.py <URL>")
        sys.exit(1)
    url = sys.argv[1]
    save_result(url)
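
# Example invocation (hypothetical URL):
#   python crawl.py https://example.com/article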