Crawl4AI

Running

Crawl4AI / crawl4ai /markdown_generation_strategy.py

amaye15

test

03c0888 about 1 month ago

9.39 kB

	from abc import ABC, abstractmethod
	from typing import Optional, Dict, Any, Tuple
	from .models import MarkdownGenerationResult
	from .html2text import CustomHTML2Text
	from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
	import re
	from urllib.parse import urljoin

	# Pre-compiled pattern for inline markdown links and images, compiled once
	# at import time so the citation pass does not rebuild it per call.
	#   leading "!" (optional) -> the match is image syntax
	#   group 1 -> bracketed text: one or more characters other than "]"
	#   group 2 -> URL inside the parentheses: no ")" allowed, matched lazily
	#   group 3 -> optional double-quoted title after the URL (may be empty)
	# Because group 1 needs at least one character, links or images with empty
	# bracketed text (e.g. "![](x.png)") are not matched.
	LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')

	def fast_urljoin(base: str, url: str) -> str:
	    """Resolve ``url`` against ``base``, short-circuiting the common cases.

	    Args:
	        base: URL the link was found on; may be empty or lack a scheme.
	        url: Link target to resolve.

	    Returns:
	        ``url`` unchanged when it starts with ``http://``, ``https://``,
	        ``mailto:`` or ``//``; otherwise ``url`` resolved against ``base``.
	    """
	    if url.startswith(('http://', 'https://', 'mailto:', '//')):
	        return url
	    if url.startswith('/'):
	        # A root-relative path replaces the base's entire path, so it has
	        # to be resolved against the origin rather than appended to
	        # ``base`` ("https://host/docs/" + "/about" -> "https://host/about").
	        joined = urljoin(base, url)
	        if joined != url:
	            return joined
	        # urljoin could not make use of ``base`` (empty or scheme-less):
	        # keep the plain concatenation, avoiding a doubled slash.
	        if base.endswith('/'):
	            return base[:-1] + url
	        return base + url
	    return urljoin(base, url)

	class MarkdownGenerationStrategy(ABC):
	    """Interface that every markdown generation strategy implements.

	    The constructor keeps the optional content filter and an options
	    dictionary on the instance; a missing or empty ``options`` argument is
	    stored as a fresh empty dict.
	    """

	    def __init__(
	        self,
	        content_filter: Optional[RelevantContentFilter] = None,
	        options: Optional[Dict[str, Any]] = None
	    ):
	        self.options = options if options else {}
	        self.content_filter = content_filter

	    @abstractmethod
	    def generate_markdown(
	        self,
	        cleaned_html: str,
	        base_url: str = "",
	        html2text_options: Optional[Dict[str, Any]] = None,
	        content_filter: Optional[RelevantContentFilter] = None,
	        citations: bool = True,
	        **kwargs
	    ) -> MarkdownGenerationResult:
	        """Turn cleaned HTML into a MarkdownGenerationResult (implemented by subclasses)."""

	class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
	    """
	    Stock markdown generation strategy built on CustomHTML2Text.

	    Pipeline:
	    1. Convert the cleaned HTML to raw markdown.
	    2. Rewrite inline links as numbered citations plus a reference list.
	    3. When a content filter is available, also produce "fit" markdown
	       from the filtered HTML.
	    4. Bundle everything into a MarkdownGenerationResult.

	    Args:
	        content_filter (Optional[RelevantContentFilter]): Filter used to build the fit markdown.
	        options (Optional[Dict[str, Any]]): Instance-level conversion options. Defaults to None.
	    """

	    def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
	        super().__init__(content_filter, options)

	    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
	        """
	        Replace inline markdown links with numbered citation markers.

	        Every match of LINK_PATTERN is replaced by its text followed by
	        ``⟨n⟩`` (image matches become ``![text⟨n⟩]``). Each distinct URL
	        gets one number, assigned in order of first appearance; the
	        description shown in the reference list comes from that first
	        appearance (title, then text when it differs from the title).

	        Args:
	            markdown (str): Markdown text to process.
	            base_url (str): When non-empty, URLs not starting with
	                ``http://``, ``https://`` or ``mailto:`` are joined to it.

	        Returns:
	            Tuple[str, str]: The rewritten markdown and the references
	            section (the "## References" heading is emitted even when no
	            link was found).
	        """
	        citations = {}   # resolved URL -> (citation number, description suffix)
	        resolved = {}    # raw URL -> URL joined with base_url
	        chunks = []
	        cursor = 0

	        for found in LINK_PATTERN.finditer(markdown):
	            label, target, title = found.groups()

	            # Copy the untouched text between the previous match and this one.
	            chunks.append(markdown[cursor:found.start()])
	            cursor = found.end()

	            # Join non-absolute targets with the base URL, once per distinct raw URL.
	            if base_url and not target.startswith(('http://', 'https://', 'mailto:')):
	                if target not in resolved:
	                    resolved[target] = fast_urljoin(base_url, target)
	                target = resolved[target]

	            # First sighting of this URL: allocate the next number.
	            if target not in citations:
	                details = []
	                if title:
	                    details.append(title)
	                if label and label != title:
	                    details.append(label)
	                suffix = ": " + " - ".join(details) if details else ""
	                citations[target] = (len(citations) + 1, suffix)

	            num = citations[target][0]
	            if found.group(0).startswith('!'):
	                chunks.append(f"![{label}⟨{num}⟩]")
	            else:
	                chunks.append(f"{label}⟨{num}⟩")

	        chunks.append(markdown[cursor:])

	        # Reference list, ordered by citation number.
	        ordered = sorted(citations.items(), key=lambda entry: entry[1][0])
	        reference_lines = [f"⟨{num}⟩ {url}{desc}\n" for url, (num, desc) in ordered]

	        return ''.join(chunks), "\n\n## References\n\n" + ''.join(reference_lines)

	    def generate_markdown(self,
	                          cleaned_html: str,
	                          base_url: str = "",
	                          html2text_options: Optional[Dict[str, Any]] = None,
	                          options: Optional[Dict[str, Any]] = None,
	                          content_filter: Optional[RelevantContentFilter] = None,
	                          citations: bool = True,
	                          **kwargs) -> MarkdownGenerationResult:
	        """
	        Build raw, cited and (optionally) fit markdown from cleaned HTML.

	        Failures are reported inside the result rather than raised: each
	        stage that fails stores an error message in its own field, and any
	        other exception yields a result whose raw and cited markdown hold
	        the error message.

	        Args:
	            cleaned_html (str): Cleaned HTML content; falsy values become ""
	                and non-strings are passed through ``str()``.
	            base_url (str): Base URL given to the converter and used for URL joins.
	            html2text_options (Optional[Dict[str, Any]]): Converter overrides.
	            options (Optional[Dict[str, Any]]): Used only when ``html2text_options`` is empty.
	            content_filter (Optional[RelevantContentFilter]): Filter for the fit
	                markdown; falls back to the instance's filter.
	            citations (bool): Whether to convert links to citations.

	        Returns:
	            MarkdownGenerationResult: raw markdown, markdown with citations,
	            references markdown, fit markdown and fit HTML.
	        """
	        try:
	            converter = CustomHTML2Text(baseurl=base_url)
	            settings = {
	                'body_width': 0,  # Disable text wrapping
	                'ignore_emphasis': False,
	                'ignore_links': False,
	                'ignore_images': False,
	                'protect_links': True,
	                'single_line_break': True,
	                'mark_code': True,
	                'escape_snob': False
	            }

	            # Only the first non-empty source of overrides is applied;
	            # the three are never merged with each other.
	            overrides = html2text_options or options or self.options
	            if overrides:
	                settings.update(overrides)

	            converter.update_params(**settings)

	            # Normalise the input to a string.
	            if not cleaned_html:
	                cleaned_html = ""
	            if not isinstance(cleaned_html, str):
	                cleaned_html = str(cleaned_html)

	            # Stage 1: raw markdown.
	            try:
	                raw_markdown = converter.handle(cleaned_html)
	            except Exception as e:
	                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

	            raw_markdown = raw_markdown.replace(' ```', '```')

	            # Stage 2: citations (skipped entirely when disabled).
	            if citations:
	                try:
	                    cited = self.convert_links_to_citations(raw_markdown, base_url)
	                    markdown_with_citations, references_markdown = cited
	                except Exception as e:
	                    markdown_with_citations = raw_markdown
	                    references_markdown = f"Error generating citations: {str(e)}"
	            else:
	                markdown_with_citations = raw_markdown
	                references_markdown = ""

	            # Stage 3: fit markdown, only when some filter is available.
	            fit_markdown = ""
	            filtered_html = ""
	            active_filter = content_filter or self.content_filter
	            if active_filter:
	                try:
	                    fragments = active_filter.filter_content(cleaned_html)
	                    # Wrap each kept fragment in its own <div> before converting.
	                    filtered_html = '\n'.join(map('<div>{}</div>'.format, fragments))
	                    fit_markdown = converter.handle(filtered_html)
	                except Exception as e:
	                    fit_markdown = f"Error generating fit markdown: {str(e)}"
	                    filtered_html = ""

	            return MarkdownGenerationResult(
	                raw_markdown=raw_markdown or "",
	                markdown_with_citations=markdown_with_citations or "",
	                references_markdown=references_markdown or "",
	                fit_markdown=fit_markdown or "",
	                fit_html=filtered_html or "",
	            )
	        except Exception as e:
	            # Anything not handled by a stage above: report it in the result.
	            error_msg = f"Error in markdown generation: {str(e)}"
	            return MarkdownGenerationResult(
	                raw_markdown=error_msg,
	                markdown_with_citations=error_msg,
	                references_markdown="",
	                fit_markdown="",
	                fit_html="",
	            )