class BrowserConfig:
    """
    Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrategy.

    This class centralizes all parameters that affect browser and context creation. Instead of passing
    scattered keyword arguments, users can instantiate and modify this configuration object. The crawler
    code will then reference these settings to initialize the browser in a consistent, documented manner.

    This class only stores the values below (plus the small amount of derivation described under
    "Derived attributes"); how each setting is applied is decided by the crawler code that reads it.

    Attributes:
        browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
                            Default: "chromium".
        headless (bool): Whether to run the browser in headless mode (no visible GUI).
                         Default: True.
        use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
                                    advanced manipulation. Forced to True when use_persistent_context is True.
                                    Default: False.
        debugging_port (int): Port for the browser debugging protocol. Default: 9222.
        use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
                                       Automatically sets use_managed_browser=True. Default: False.
        user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
                                     temporary directory may be used. Default: None.
        chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if
                              browser_type is "chromium". A falsy value falls back to browser_type, then
                              to "chromium". Default: "chromium".
        channel (str): The channel to launch (e.g., "chromium", "chrome", "msedge"). Only applies if
                       browser_type is "chromium". A falsy value falls back to browser_type, then to
                       "chromium". Default: "chromium".
        proxy (str or None): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no
                             proxy is used. Default: None.
        proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
                                     If None, no additional proxy config. Default: None.
        viewport_width (int): Default viewport width for pages. Default: 1080.
        viewport_height (int): Default viewport height for pages. Default: 600.
        verbose (bool): Enable verbose logging.
                        Default: True.
        accept_downloads (bool): Whether to allow file downloads. Default: False.
        downloads_path (str or None): Directory to store downloaded files. Default: None.
        storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
                                             Default: None.
        ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
        java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
        sleep_on_close (bool): Stored as given and not used by this class.
                               NOTE(review): presumably makes the crawler pause when the browser is
                               closed - confirm against the crawler strategy. Default: False.
        cookies (list): List of cookies to add to the browser context. Each cookie is a dict with fields like
                        {"name": "...", "value": "...", "url": "..."}.
                        Default: [].
        headers (dict): Extra HTTP headers to apply to all requests in this context. The mapping passed in
                        is copied; a "sec-ch-ua" entry is added to the copy unless one is already present.
                        Default: {}.
        user_agent (str): Custom User-Agent string to use. The constructor default is
                          "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47".
                          `from_kwargs` falls back to a different string when the key is absent.
        user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use
                                       the provided user_agent as-is. Default: None.
        user_agent_generator_config (dict or None): Keyword arguments passed to the user agent generator
                                                    when user_agent_mode is "random". Default: None.
        text_mode (bool): If True, disables images and other rich content for potentially faster load times.
                          Default: False.
        light_mode (bool): Disables certain background features for performance gains. Default: False.
        extra_args (list): Additional command-line arguments passed to the browser.
                           Default: [].

    Derived attributes:
        browser_hint: Client-hints value produced by
                      `UserAgentGenerator.generate_client_hints(user_agent)`; also used as the default
                      for the "sec-ch-ua" header.
    """

    def __init__(
        self,
        browser_type: str = "chromium",
        headless: bool = True,
        use_managed_browser: bool = False,
        use_persistent_context: bool = False,
        user_data_dir: str = None,
        chrome_channel: str = "chromium",
        channel: str = "chromium",
        proxy: str = None,
        proxy_config: dict = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        accept_downloads: bool = False,
        downloads_path: str = None,
        storage_state=None,
        ignore_https_errors: bool = True,
        java_script_enabled: bool = True,
        sleep_on_close: bool = False,
        verbose: bool = True,
        cookies: list = None,
        headers: dict = None,
        user_agent: str = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
        ),
        user_agent_mode: str = None,
        user_agent_generator_config: dict = None,
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: list = None,
        debugging_port: int = 9222,
    ):
        """
        Store the browser settings and derive the user agent, client hints and headers.

        See the class docstring for the meaning and default of each parameter.
        """
        self.browser_type = browser_type
        self.headless = headless
        self.use_managed_browser = use_managed_browser
        self.use_persistent_context = use_persistent_context
        self.user_data_dir = user_data_dir
        # An empty/None channel falls back to the browser type, then to "chromium".
        self.chrome_channel = chrome_channel or self.browser_type or "chromium"
        self.channel = channel or self.browser_type or "chromium"
        self.proxy = proxy
        self.proxy_config = proxy_config
        self.viewport_width = viewport_width
        self.viewport_height = viewport_height
        self.accept_downloads = accept_downloads
        self.downloads_path = downloads_path
        self.storage_state = storage_state
        self.ignore_https_errors = ignore_https_errors
        self.java_script_enabled = java_script_enabled
        self.cookies = cookies if cookies is not None else []
        # Copy the caller's mapping: "sec-ch-ua" is injected below, and writing it
        # into a dict shared between several configs would leak the first config's
        # client hint into the others (setdefault never overwrites).
        self.headers = dict(headers) if headers is not None else {}
        self.user_agent = user_agent
        self.user_agent_mode = user_agent_mode
        self.user_agent_generator_config = user_agent_generator_config
        self.text_mode = text_mode
        self.light_mode = light_mode
        self.extra_args = extra_args if extra_args is not None else []
        self.sleep_on_close = sleep_on_close
        self.verbose = verbose
        self.debugging_port = debugging_port

        user_agent_generator = UserAgentGenerator()
        # Only "random" mode replaces the user agent; any other mode keeps the
        # provided string untouched. The generator config, when given, is
        # forwarded as keyword arguments.
        if self.user_agent_mode == "random":
            self.user_agent = user_agent_generator.generate(
                **(self.user_agent_generator_config or {})
            )

        # Client hints are derived from the final user agent and offered as the
        # default "sec-ch-ua" header; an explicit header from the caller wins.
        self.browser_hint = user_agent_generator.generate_client_hints(self.user_agent)
        self.headers.setdefault("sec-ch-ua", self.browser_hint)

        # If persistent context is requested, ensure managed browser is enabled
        if self.use_persistent_context:
            self.use_managed_browser = True

    @staticmethod
    def from_kwargs(kwargs: dict) -> "BrowserConfig":
        """
        Build a BrowserConfig from a plain dict of keyword arguments.

        Keys that are absent fall back to the defaults listed below; unknown keys
        are ignored.

        Args:
            kwargs (dict): Mapping of BrowserConfig parameter names to values.

        Returns:
            BrowserConfig: A new configuration object.
        """
        return BrowserConfig(
            browser_type=kwargs.get("browser_type", "chromium"),
            headless=kwargs.get("headless", True),
            use_managed_browser=kwargs.get("use_managed_browser", False),
            use_persistent_context=kwargs.get("use_persistent_context", False),
            user_data_dir=kwargs.get("user_data_dir"),
            chrome_channel=kwargs.get("chrome_channel", "chromium"),
            channel=kwargs.get("channel", "chromium"),
            proxy=kwargs.get("proxy"),
            proxy_config=kwargs.get("proxy_config"),
            viewport_width=kwargs.get("viewport_width", 1080),
            viewport_height=kwargs.get("viewport_height", 600),
            accept_downloads=kwargs.get("accept_downloads", False),
            downloads_path=kwargs.get("downloads_path"),
            storage_state=kwargs.get("storage_state"),
            ignore_https_errors=kwargs.get("ignore_https_errors", True),
            java_script_enabled=kwargs.get("java_script_enabled", True),
            sleep_on_close=kwargs.get("sleep_on_close", False),
            verbose=kwargs.get("verbose", True),
            cookies=kwargs.get("cookies", []),
            headers=kwargs.get("headers", {}),
            # NOTE(review): this fallback differs from the constructor's default
            # user agent; kept as-is because callers may depend on it.
            user_agent=kwargs.get(
                "user_agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
            ),
            user_agent_mode=kwargs.get("user_agent_mode"),
            user_agent_generator_config=kwargs.get("user_agent_generator_config"),
            text_mode=kwargs.get("text_mode", False),
            light_mode=kwargs.get("light_mode", False),
            extra_args=kwargs.get("extra_args", []),
            debugging_port=kwargs.get("debugging_port", 9222),
        )