|
import os, sys |
|
import time |
|
import warnings |
|
from enum import Enum |
|
from colorama import init, Fore, Back, Style |
|
from pathlib import Path |
|
from typing import Optional, List, Union |
|
import json |
|
import asyncio |
|
|
|
from contextlib import asynccontextmanager |
|
from .models import CrawlResult, MarkdownGenerationResult |
|
from .async_database import async_db_manager |
|
from .chunking_strategy import * |
|
from .content_filter_strategy import * |
|
from .extraction_strategy import * |
|
from .async_crawler_strategy import AsyncCrawlerStrategy, AsyncPlaywrightCrawlerStrategy, AsyncCrawlResponse |
|
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode |
|
from .markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy |
|
from .content_scraping_strategy import WebScrapingStrategy |
|
from .async_logger import AsyncLogger |
|
from .async_configs import BrowserConfig, CrawlerRunConfig |
|
from .config import ( |
|
MIN_WORD_THRESHOLD, |
|
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, |
|
URL_LOG_SHORTEN_LENGTH |
|
) |
|
from .utils import ( |
|
sanitize_input_encode, |
|
InvalidCSSSelectorError, |
|
format_html, |
|
fast_format_html, |
|
    create_box_message,
    get_error_context
|
) |
|
|
|
from urllib.parse import urlparse |
|
import random |
|
from .__version__ import __version__ as crawl4ai_version |
|
|
|
|
|
class AsyncWebCrawler: |
|
""" |
|
Asynchronous web crawler with flexible caching capabilities. |
|
|
|
There are two ways to use the crawler: |
|
|
|
1. Using context manager (recommended for simple cases): |
|
```python |
|
async with AsyncWebCrawler() as crawler: |
|
result = await crawler.arun(url="https://example.com") |
|
``` |
|
|
|
2. Using explicit lifecycle management (recommended for long-running applications): |
|
```python |
|
crawler = AsyncWebCrawler() |
|
await crawler.start() |
|
|
|
# Use the crawler multiple times |
|
result1 = await crawler.arun(url="https://example.com") |
|
result2 = await crawler.arun(url="https://another.com") |
|
|
|
await crawler.close() |
|
``` |
|
|
|
Migration Guide: |
|
Old way (deprecated): |
|
crawler = AsyncWebCrawler(always_by_pass_cache=True, browser_type="chromium", headless=True) |
|
|
|
New way (recommended): |
|
browser_config = BrowserConfig(browser_type="chromium", headless=True) |
|
crawler = AsyncWebCrawler(config=browser_config) |
|
|
|
|
|
Attributes: |
|
browser_config (BrowserConfig): Configuration object for browser settings. |
|
crawler_strategy (AsyncCrawlerStrategy): Strategy for crawling web pages. |
|
logger (AsyncLogger): Logger instance for recording events and errors. |
|
always_bypass_cache (bool): Whether to always bypass cache. |
|
        crawl4ai_folder (str): Path to the local .crawl4ai cache directory.

        base_directory (str): Base directory under which the cache folder is created.
|
ready (bool): Whether the crawler is ready for use. |
|
|
|
Methods: |
|
start(): Start the crawler explicitly without using context manager. |
|
close(): Close the crawler explicitly without using context manager. |
|
        arun(): Crawl a single source: a web URL, a local file (file://), or raw HTML (raw:).
|
awarmup(): Perform warmup sequence. |
|
arun_many(): Run the crawler for multiple sources. |
|
aprocess_html(): Process HTML content. |
|
|
|
Typical Usage: |
|
async with AsyncWebCrawler() as crawler: |
|
result = await crawler.arun(url="https://example.com") |
|
print(result.markdown) |
|
|
|
Using configuration: |
|
browser_config = BrowserConfig(browser_type="chromium", headless=True) |
|
async with AsyncWebCrawler(config=browser_config) as crawler: |
|
crawler_config = CrawlerRunConfig( |
|
cache_mode=CacheMode.BYPASS |
|
) |
|
result = await crawler.arun(url="https://example.com", config=crawler_config) |
|
print(result.markdown) |
|
""" |
|
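    # Shared across instances: last hit timestamp per domain, used by arun_many() for politeness delays.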
_domain_last_hit = {} |
|
|
|
def __init__( |
|
self, |
|
crawler_strategy: Optional[AsyncCrawlerStrategy] = None, |
|
config: Optional[BrowserConfig] = None, |
|
always_bypass_cache: bool = False, |
|
always_by_pass_cache: Optional[bool] = None, |
|
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())), |
|
thread_safe: bool = False, |
|
**kwargs, |
|
): |
|
""" |
|
Initialize the AsyncWebCrawler. |
|
|
|
Args: |
|
crawler_strategy: Strategy for crawling web pages. If None, will create AsyncPlaywrightCrawlerStrategy |
|
config: Configuration object for browser settings. If None, will be created from kwargs |
|
always_bypass_cache: Whether to always bypass cache (new parameter) |
|
always_by_pass_cache: Deprecated, use always_bypass_cache instead |
|
base_directory: Base directory for storing cache |
|
thread_safe: Whether to use thread-safe operations |
|
**kwargs: Additional arguments for backwards compatibility |
|
""" |
|
|
|
browser_config = config |
|
if browser_config is not None: |
|
if any(k in kwargs for k in ["browser_type", "headless", "viewport_width", "viewport_height"]): |
|
                # self.logger is not created yet at this point, so fall back to warnings.warn.
                warnings.warn(
                    "Both browser_config and legacy browser parameters provided. "
                    "browser_config will take precedence.",
                    stacklevel=2,
                )
|
else: |
|
|
|
browser_config = BrowserConfig.from_kwargs(kwargs) |
|
|
|
self.browser_config = browser_config |
|
|
|
|
|
self.logger = AsyncLogger( |
|
log_file=os.path.join(base_directory, ".crawl4ai", "crawler.log"), |
|
verbose=self.browser_config.verbose, |
|
tag_width=10 |
|
) |
|
|
|
|
|
|
|
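        # Forward only the kwargs the crawler strategy understands (browser_config, logger).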
params = { |
|
            k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]
|
} |
|
self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy( |
|
browser_config=browser_config, |
|
logger=self.logger, |
|
**params |
|
) |
|
|
|
|
|
if not self.crawler_strategy.logger: |
|
self.crawler_strategy.logger = self.logger |
|
|
|
|
|
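        # Resolve the deprecated always_by_pass_cache flag (warn unless warning=False was passed).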
if always_by_pass_cache is not None: |
|
if kwargs.get("warning", True): |
|
warnings.warn( |
|
"'always_by_pass_cache' is deprecated and will be removed in version 0.5.0. " |
|
"Use 'always_bypass_cache' instead. " |
|
"Pass warning=False to suppress this warning.", |
|
DeprecationWarning, |
|
stacklevel=2 |
|
) |
|
self.always_bypass_cache = always_by_pass_cache |
|
else: |
|
self.always_bypass_cache = always_bypass_cache |
|
|
|
|
|
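        # Optional lock so concurrent arun() calls are serialized when thread_safe=True.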
self._lock = asyncio.Lock() if thread_safe else None |
|
|
|
|
|
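        # Ensure the local .crawl4ai cache directory structure exists.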
self.crawl4ai_folder = os.path.join(base_directory, ".crawl4ai") |
|
os.makedirs(self.crawl4ai_folder, exist_ok=True) |
|
os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True) |
|
|
|
self.ready = False |
|
|
|
async def start(self): |
|
""" |
|
Start the crawler explicitly without using context manager. |
|
This is equivalent to using 'async with' but gives more control over the lifecycle. |
|
|
|
This method will: |
|
1. Initialize the browser and context |
|
2. Perform warmup sequence |
|
3. Return the crawler instance for method chaining |
|
|
|
Returns: |
|
AsyncWebCrawler: The initialized crawler instance |
|
""" |
|
await self.crawler_strategy.__aenter__() |
|
await self.awarmup() |
|
return self |
|
|
|
async def close(self): |
|
""" |
|
Close the crawler explicitly without using context manager. |
|
This should be called when you're done with the crawler if you used start(). |
|
|
|
This method will: |
|
1. Clean up browser resources |
|
2. Close any open pages and contexts |
|
""" |
|
await self.crawler_strategy.__aexit__(None, None, None) |
|
|
|
async def __aenter__(self): |
|
return await self.start() |
|
|
|
async def __aexit__(self, exc_type, exc_val, exc_tb): |
|
await self.close() |
|
|
|
async def awarmup(self): |
|
""" |
|
Initialize the crawler with warm-up sequence. |
|
|
|
This method: |
|
1. Logs initialization info |
|
2. Sets up browser configuration |
|
3. Marks the crawler as ready |
|
""" |
|
self.logger.info(f"Crawl4AI {crawl4ai_version}", tag="INIT") |
|
self.ready = True |
|
|
|
@asynccontextmanager |
|
async def nullcontext(self): |
|
"""异步空上下文管理器""" |
|
yield |
|
|
|
async def arun( |
|
self, |
|
url: str, |
|
config: Optional[CrawlerRunConfig] = None, |
|
|
|
word_count_threshold=MIN_WORD_THRESHOLD, |
|
extraction_strategy: ExtractionStrategy = None, |
|
chunking_strategy: ChunkingStrategy = RegexChunking(), |
|
content_filter: RelevantContentFilter = None, |
|
cache_mode: Optional[CacheMode] = None, |
|
|
|
bypass_cache: bool = False, |
|
disable_cache: bool = False, |
|
no_cache_read: bool = False, |
|
no_cache_write: bool = False, |
|
|
|
css_selector: str = None, |
|
screenshot: bool = False, |
|
pdf: bool = False, |
|
user_agent: str = None, |
|
verbose=True, |
|
**kwargs, |
|
) -> CrawlResult: |
|
""" |
|
Runs the crawler for a single source: URL (web, local file, or raw HTML). |
|
|
|
Migration Guide: |
|
Old way (deprecated): |
|
result = await crawler.arun( |
|
url="https://example.com", |
|
word_count_threshold=200, |
|
screenshot=True, |
|
... |
|
) |
|
|
|
New way (recommended): |
|
config = CrawlerRunConfig( |
|
word_count_threshold=200, |
|
screenshot=True, |
|
... |
|
) |
|
                result = await crawler.arun(url="https://example.com", config=config)
|
|
|
Args: |
|
url: The URL to crawl (http://, https://, file://, or raw:) |
|
            config: CrawlerRunConfig object controlling crawl behavior
|
[other parameters maintained for backwards compatibility] |
|
|
|
Returns: |
|
CrawlResult: The result of crawling and processing |
|
""" |
|
crawler_config = config |
|
if not isinstance(url, str) or not url: |
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string") |
|
|
|
async with self._lock or self.nullcontext(): |
|
try: |
|
|
|
                if crawler_config is not None:
                    config = crawler_config
|
else: |
|
|
|
config_kwargs = { |
|
"word_count_threshold": word_count_threshold, |
|
"extraction_strategy": extraction_strategy, |
|
"chunking_strategy": chunking_strategy, |
|
"content_filter": content_filter, |
|
"cache_mode": cache_mode, |
|
"bypass_cache": bypass_cache, |
|
"disable_cache": disable_cache, |
|
"no_cache_read": no_cache_read, |
|
"no_cache_write": no_cache_write, |
|
"css_selector": css_selector, |
|
"screenshot": screenshot, |
|
"pdf": pdf, |
|
"verbose": verbose, |
|
**kwargs |
|
} |
|
config = CrawlerRunConfig.from_kwargs(config_kwargs) |
|
|
|
|
|
if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]): |
|
if kwargs.get("warning", True): |
|
warnings.warn( |
|
"Cache control boolean flags are deprecated and will be removed in version 0.5.0. " |
|
"Use 'cache_mode' parameter instead.", |
|
DeprecationWarning, |
|
stacklevel=2 |
|
) |
|
|
|
|
|
if config.cache_mode is None: |
|
config.cache_mode = _legacy_to_cache_mode( |
|
disable_cache=disable_cache, |
|
bypass_cache=bypass_cache, |
|
no_cache_read=no_cache_read, |
|
no_cache_write=no_cache_write |
|
) |
|
|
|
|
|
if config.cache_mode is None: |
|
config.cache_mode = CacheMode.ENABLED |
|
|
|
|
|
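                # CacheContext decides whether this URL may be read from or written to the cache.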
cache_context = CacheContext(url, config.cache_mode, self.always_bypass_cache) |
|
|
|
|
|
async_response: AsyncCrawlResponse = None |
|
cached_result: CrawlResult = None |
|
screenshot_data = None |
|
pdf_data = None |
|
extracted_content = None |
|
start_time = time.perf_counter() |
|
|
|
|
|
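                # Try to serve the result from the cache first.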
if cache_context.should_read(): |
|
cached_result = await async_db_manager.aget_cached_url(url) |
|
|
|
if cached_result: |
|
html = sanitize_input_encode(cached_result.html) |
|
extracted_content = sanitize_input_encode(cached_result.extracted_content or "") |
|
extracted_content = None if not extracted_content or extracted_content == "[]" else extracted_content |
|
|
|
screenshot_data = cached_result.screenshot |
|
pdf_data = cached_result.pdf |
|
                        if (config.screenshot and not screenshot_data) or (config.pdf and not pdf_data):
|
cached_result = None |
|
|
|
self.logger.url_status( |
|
url=cache_context.display_url, |
|
success=bool(html), |
|
timing=time.perf_counter() - start_time, |
|
tag="FETCH" |
|
) |
|
|
|
|
|
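                # Nothing usable in the cache: fetch the page with the crawler strategy.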
if not cached_result or not html: |
|
t1 = time.perf_counter() |
|
|
|
if user_agent: |
|
self.crawler_strategy.update_user_agent(user_agent) |
|
|
|
|
|
async_response = await self.crawler_strategy.crawl( |
|
url, |
|
config=config |
|
) |
|
|
|
html = sanitize_input_encode(async_response.html) |
|
screenshot_data = async_response.screenshot |
|
pdf_data = async_response.pdf_data |
|
|
|
t2 = time.perf_counter() |
|
self.logger.url_status( |
|
url=cache_context.display_url, |
|
success=bool(html), |
|
timing=t2 - t1, |
|
tag="FETCH" |
|
) |
|
|
|
|
|
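                    # Scrape the HTML, generate markdown, and run any extraction strategy.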
crawl_result = await self.aprocess_html( |
|
url=url, |
|
html=html, |
|
extracted_content=extracted_content, |
|
config=config, |
|
screenshot=screenshot_data, |
|
pdf_data=pdf_data, |
|
verbose=config.verbose, |
|
                        is_raw_html=url.startswith("raw:"),
|
**kwargs |
|
) |
|
|
|
crawl_result.status_code = async_response.status_code |
|
crawl_result.response_headers = async_response.response_headers |
|
crawl_result.downloaded_files = async_response.downloaded_files |
|
crawl_result.ssl_certificate = async_response.ssl_certificate |
|
|
crawl_result.success = bool(html) |
|
crawl_result.session_id = getattr(config, 'session_id', None) |
|
|
|
self.logger.success( |
|
message="{url:.50}... | Status: {status} | Total: {timing}", |
|
tag="COMPLETE", |
|
params={ |
|
"url": cache_context.display_url, |
|
"status": crawl_result.success, |
|
"timing": f"{time.perf_counter() - start_time:.2f}s" |
|
}, |
|
colors={ |
|
"status": Fore.GREEN if crawl_result.success else Fore.RED, |
|
"timing": Fore.YELLOW |
|
} |
|
) |
|
|
|
|
|
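                    # Persist the fresh result unless cache writing is disabled.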
if cache_context.should_write() and not bool(cached_result): |
|
await async_db_manager.acache_url(crawl_result) |
|
|
|
return crawl_result |
|
|
|
else: |
|
self.logger.success( |
|
message="{url:.50}... | Status: {status} | Total: {timing}", |
|
tag="COMPLETE", |
|
params={ |
|
"url": cache_context.display_url, |
|
"status": True, |
|
"timing": f"{time.perf_counter() - start_time:.2f}s" |
|
}, |
|
colors={ |
|
"status": Fore.GREEN, |
|
"timing": Fore.YELLOW |
|
} |
|
) |
|
|
|
cached_result.success = bool(html) |
|
cached_result.session_id = getattr(config, 'session_id', None) |
|
return cached_result |
|
|
|
except Exception as e: |
|
error_context = get_error_context(sys.exc_info()) |
|
|
|
error_message = ( |
|
f"Unexpected error in _crawl_web at line {error_context['line_no']} " |
|
f"in {error_context['function']} ({error_context['filename']}):\n" |
|
f"Error: {str(e)}\n\n" |
|
f"Code context:\n{error_context['code_context']}" |
|
) |
|
|
|
|
|
|
|
self.logger.error_status( |
|
url=url, |
|
error=create_box_message(error_message, type="error"), |
|
tag="ERROR" |
|
) |
|
|
|
return CrawlResult( |
|
url=url, |
|
html="", |
|
success=False, |
|
error_message=error_message |
|
) |
|
|
|
async def aprocess_html( |
|
self, |
|
url: str, |
|
html: str, |
|
extracted_content: str, |
|
config: CrawlerRunConfig, |
|
screenshot: str, |
|
pdf_data: str, |
|
verbose: bool, |
|
**kwargs, |
|
) -> CrawlResult: |
|
""" |
|
Process HTML content using the provided configuration. |
|
|
|
Args: |
|
url: The URL being processed |
|
html: Raw HTML content |
|
extracted_content: Previously extracted content (if any) |
|
config: Configuration object controlling processing behavior |
|
screenshot: Screenshot data (if any) |
|
pdf_data: PDF data (if any) |
|
verbose: Whether to enable verbose logging |
|
**kwargs: Additional parameters for backwards compatibility |
|
|
|
Returns: |
|
CrawlResult: Processed result containing extracted and formatted content |
|
""" |
|
try: |
|
_url = url if not kwargs.get("is_raw_html", False) else "Raw HTML" |
|
t1 = time.perf_counter() |
|
|
|
|
|
            scraping_strategy = WebScrapingStrategy(logger=self.logger)
|
|
|
|
|
params = {k:v for k, v in config.to_dict().items() if k not in ["url"]} |
|
|
|
params.update({k:v for k, v in kwargs.items() if k not in params.keys()}) |
|
|
|
            result = scraping_strategy.scrap(
                url,
                html,
                **params,
            )
|
|
|
if result is None: |
|
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") |
|
|
|
except InvalidCSSSelectorError as e: |
|
raise ValueError(str(e)) |
|
except Exception as e: |
|
raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") |
|
|
|
|
|
|
|
|
|
cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) |
|
fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) |
|
fit_html = sanitize_input_encode(result.get("fit_html", "")) |
|
media = result.get("media", []) |
|
links = result.get("links", []) |
|
metadata = result.get("metadata", {}) |
|
|
|
|
|
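        # Generate markdown with the configured strategy, defaulting to DefaultMarkdownGenerator.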
markdown_generator: Optional[MarkdownGenerationStrategy] = config.markdown_generator or DefaultMarkdownGenerator() |
|
|
|
|
|
|
|
|
|
|
|
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown( |
|
cleaned_html=cleaned_html, |
|
base_url=url, |
|
|
|
) |
|
markdown_v2 = markdown_result |
|
markdown = sanitize_input_encode(markdown_result.raw_markdown) |
|
|
|
|
|
self.logger.info( |
|
message="Processed {url:.50}... | Time: {timing}ms", |
|
tag="SCRAPE", |
|
params={ |
|
"url": _url, |
|
"timing": int((time.perf_counter() - t1) * 1000) |
|
} |
|
) |
|
|
|
|
|
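        # Run structured extraction only when no cached extraction exists and a real strategy is configured.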
if (extracted_content is None and |
|
config.extraction_strategy and |
|
config.chunking_strategy and |
|
not isinstance(config.extraction_strategy, NoExtractionStrategy)): |
|
|
|
t1 = time.perf_counter() |
|
|
|
|
|
content_format = config.extraction_strategy.input_format |
|
if content_format == "fit_markdown" and not markdown_result.fit_markdown: |
|
self.logger.warning( |
|
message="Fit markdown requested but not available. Falling back to raw markdown.", |
|
tag="EXTRACT", |
|
params={"url": _url} |
|
) |
|
content_format = "markdown" |
|
|
|
content = { |
|
"markdown": markdown, |
|
"html": html, |
|
"fit_markdown": markdown_result.raw_markdown |
|
}.get(content_format, markdown) |
|
|
|
|
|
chunking = IdentityChunking() if content_format == "html" else config.chunking_strategy |
|
sections = chunking.chunk(content) |
|
extracted_content = config.extraction_strategy.run(url, sections) |
|
extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) |
|
|
|
|
|
self.logger.info( |
|
message="Completed for {url:.50}... | Time: {timing}s", |
|
tag="EXTRACT", |
|
params={ |
|
"url": _url, |
|
"timing": time.perf_counter() - t1 |
|
} |
|
) |
|
|
|
|
|
screenshot_data = None if not screenshot else screenshot |
|
pdf_data = None if not pdf_data else pdf_data |
|
|
|
|
|
if config.prettiify: |
|
cleaned_html = fast_format_html(cleaned_html) |
|
|
|
|
|
return CrawlResult( |
|
url=url, |
|
html=html, |
|
cleaned_html=cleaned_html, |
|
markdown_v2=markdown_v2, |
|
markdown=markdown, |
|
fit_markdown=fit_markdown, |
|
fit_html=fit_html, |
|
media=media, |
|
links=links, |
|
metadata=metadata, |
|
screenshot=screenshot_data, |
|
pdf=pdf_data, |
|
extracted_content=extracted_content, |
|
success=True, |
|
error_message="", |
|
) |
|
|
|
async def arun_many( |
|
self, |
|
urls: List[str], |
|
config: Optional[CrawlerRunConfig] = None, |
|
|
|
word_count_threshold=MIN_WORD_THRESHOLD, |
|
extraction_strategy: ExtractionStrategy = None, |
|
chunking_strategy: ChunkingStrategy = RegexChunking(), |
|
content_filter: RelevantContentFilter = None, |
|
cache_mode: Optional[CacheMode] = None, |
|
bypass_cache: bool = False, |
|
css_selector: str = None, |
|
screenshot: bool = False, |
|
pdf: bool = False, |
|
user_agent: str = None, |
|
verbose=True, |
|
**kwargs, |
|
) -> List[CrawlResult]: |
|
""" |
|
Runs the crawler for multiple URLs concurrently. |
|
|
|
Migration Guide: |
|
Old way (deprecated): |
|
results = await crawler.arun_many( |
|
urls, |
|
word_count_threshold=200, |
|
screenshot=True, |
|
... |
|
) |
|
|
|
New way (recommended): |
|
config = CrawlerRunConfig( |
|
word_count_threshold=200, |
|
screenshot=True, |
|
... |
|
) |
|
                results = await crawler.arun_many(urls, config=config)
|
|
|
Args: |
|
urls: List of URLs to crawl |
|
            config: CrawlerRunConfig object controlling crawl behavior for all URLs
|
[other parameters maintained for backwards compatibility] |
|
|
|
Returns: |
|
List[CrawlResult]: Results for each URL |
|
""" |
|
crawler_config = config |
|
|
|
if crawler_config is not None: |
|
            if any([
                word_count_threshold != MIN_WORD_THRESHOLD,
                extraction_strategy is not None,
                not isinstance(chunking_strategy, RegexChunking),
                content_filter is not None,
                cache_mode is not None,
                css_selector is not None,
                screenshot,
                pdf,
            ]):
|
self.logger.warning( |
|
message="Both crawler_config and legacy parameters provided. crawler_config will take precedence.", |
|
tag="WARNING" |
|
) |
|
config = crawler_config |
|
else: |
|
|
|
config_kwargs = { |
|
"word_count_threshold": word_count_threshold, |
|
"extraction_strategy": extraction_strategy, |
|
"chunking_strategy": chunking_strategy, |
|
"content_filter": content_filter, |
|
"cache_mode": cache_mode, |
|
"bypass_cache": bypass_cache, |
|
"css_selector": css_selector, |
|
"screenshot": screenshot, |
|
"pdf": pdf, |
|
"verbose": verbose, |
|
**kwargs |
|
} |
|
config = CrawlerRunConfig.from_kwargs(config_kwargs) |
|
|
|
if bypass_cache: |
|
if kwargs.get("warning", True): |
|
warnings.warn( |
|
"'bypass_cache' is deprecated and will be removed in version 0.5.0. " |
|
"Use 'cache_mode=CacheMode.BYPASS' instead. " |
|
"Pass warning=False to suppress this warning.", |
|
DeprecationWarning, |
|
stacklevel=2 |
|
) |
|
if config.cache_mode is None: |
|
config.cache_mode = CacheMode.BYPASS |
|
|
|
semaphore_count = config.semaphore_count or 5 |
|
semaphore = asyncio.Semaphore(semaphore_count) |
|
|
|
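        # Each crawl task shares the semaphore and applies a per-domain politeness delay before running.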
async def crawl_with_semaphore(url): |
|
|
|
domain = urlparse(url).netloc |
|
current_time = time.time() |
|
|
|
self.logger.debug( |
|
message="Started task for {url:.50}...", |
|
tag="PARALLEL", |
|
params={"url": url} |
|
) |
|
|
|
|
|
mean_delay = config.mean_delay |
|
max_range = config.max_range |
|
|
|
|
|
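            # If this domain was hit too recently, wait a randomized delay before crawling it again.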
if domain in self._domain_last_hit: |
|
time_since_last = current_time - self._domain_last_hit[domain] |
|
if time_since_last < mean_delay: |
|
delay = mean_delay + random.uniform(0, max_range) |
|
await asyncio.sleep(delay) |
|
|
|
self._domain_last_hit[domain] = current_time |
|
|
|
async with semaphore: |
|
return await self.arun( |
|
url, |
|
                    config=config,
|
user_agent=user_agent |
|
) |
|
|
|
|
|
self.logger.info( |
|
message="Starting concurrent crawling for {count} URLs...", |
|
tag="INIT", |
|
params={"count": len(urls)} |
|
) |
|
|
|
|
|
start_time = time.perf_counter() |
|
tasks = [crawl_with_semaphore(url) for url in urls] |
|
results = await asyncio.gather(*tasks, return_exceptions=True) |
|
end_time = time.perf_counter() |
|
|
|
|
|
self.logger.success( |
|
message="Concurrent crawling completed for {count} URLs | Total time: {timing}", |
|
tag="COMPLETE", |
|
params={ |
|
"count": len(urls), |
|
"timing": f"{end_time - start_time:.2f}s" |
|
}, |
|
colors={ |
|
"timing": Fore.YELLOW |
|
} |
|
) |
|
|
|
return [result if not isinstance(result, Exception) else str(result) for result in results] |
|
|
|
async def aclear_cache(self): |
|
"""Clear the cache database.""" |
|
await async_db_manager.cleanup() |
|
|
|
async def aflush_cache(self): |
|
"""Flush the cache database.""" |
|
await async_db_manager.aflush_db() |
|
|
|
async def aget_cache_size(self): |
|
"""Get the total number of cached items.""" |
|
return await async_db_manager.aget_total_count() |
|
|