Crawl4AI / crawl4ai /markdown_generation_strategy.py
amaye15
test
03c0888
raw
history blame
9.39 kB
from abc import ABC, abstractmethod
from typing import Optional, Dict, Any, Tuple
from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter
import re
from urllib.parse import urljoin
# Pre-compile the regex pattern
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')
def fast_urljoin(base: str, url: str) -> str:
"""Fast URL joining for common cases."""
if url.startswith(('http://', 'https://', 'mailto:', '//')):
return url
if url.startswith('/'):
# Handle absolute paths
if base.endswith('/'):
return base[:-1] + url
return base + url
return urljoin(base, url)
class MarkdownGenerationStrategy(ABC):
"""Abstract base class for markdown generation strategies."""
def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
self.content_filter = content_filter
self.options = options or {}
@abstractmethod
def generate_markdown(self,
cleaned_html: str,
base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None,
content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True,
**kwargs) -> MarkdownGenerationResult:
"""Generate markdown from cleaned HTML."""
pass
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
"""
Default implementation of markdown generation strategy.
How it works:
1. Generate raw markdown from cleaned HTML.
2. Convert links to citations.
3. Generate fit markdown if content filter is provided.
4. Return MarkdownGenerationResult.
Args:
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.
Returns:
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
"""
def __init__(self, content_filter: Optional[RelevantContentFilter] = None, options: Optional[Dict[str, Any]] = None):
super().__init__(content_filter, options)
def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
"""
Convert links in markdown to citations.
How it works:
1. Find all links in the markdown.
2. Convert links to citations.
3. Return converted markdown and references markdown.
Note:
This function uses a regex pattern to find links in markdown.
Args:
markdown (str): Markdown text.
base_url (str): Base URL for URL joins.
Returns:
Tuple[str, str]: Converted markdown and references markdown.
"""
link_map = {}
url_cache = {} # Cache for URL joins
parts = []
last_end = 0
counter = 1
for match in LINK_PATTERN.finditer(markdown):
parts.append(markdown[last_end:match.start()])
text, url, title = match.groups()
# Use cached URL if available, otherwise compute and cache
if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
if url not in url_cache:
url_cache[url] = fast_urljoin(base_url, url)
url = url_cache[url]
if url not in link_map:
desc = []
if title: desc.append(title)
if text and text != title: desc.append(text)
link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
counter += 1
num = link_map[url][0]
parts.append(f"{text}{num}⟩" if not match.group(0).startswith('!') else f"![{text}{num}⟩]")
last_end = match.end()
parts.append(markdown[last_end:])
converted_text = ''.join(parts)
# Pre-build reference strings
references = ["\n\n## References\n\n"]
references.extend(
f"⟨{num}{url}{desc}\n"
for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
)
return converted_text, ''.join(references)
def generate_markdown(self,
cleaned_html: str,
base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None,
options: Optional[Dict[str, Any]] = None,
content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True,
**kwargs) -> MarkdownGenerationResult:
"""
Generate markdown with citations from cleaned HTML.
How it works:
1. Generate raw markdown from cleaned HTML.
2. Convert links to citations.
3. Generate fit markdown if content filter is provided.
4. Return MarkdownGenerationResult.
Args:
cleaned_html (str): Cleaned HTML content.
base_url (str): Base URL for URL joins.
html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
options (Optional[Dict[str, Any]]): Additional options for markdown generation.
content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
citations (bool): Whether to generate citations.
Returns:
MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
"""
try:
# Initialize HTML2Text with default options for better conversion
h = CustomHTML2Text(baseurl=base_url)
default_options = {
'body_width': 0, # Disable text wrapping
'ignore_emphasis': False,
'ignore_links': False,
'ignore_images': False,
'protect_links': True,
'single_line_break': True,
'mark_code': True,
'escape_snob': False
}
# Update with custom options if provided
if html2text_options:
default_options.update(html2text_options)
elif options:
default_options.update(options)
elif self.options:
default_options.update(self.options)
h.update_params(**default_options)
# Ensure we have valid input
if not cleaned_html:
cleaned_html = ""
elif not isinstance(cleaned_html, str):
cleaned_html = str(cleaned_html)
# Generate raw markdown
try:
raw_markdown = h.handle(cleaned_html)
except Exception as e:
raw_markdown = f"Error converting HTML to markdown: {str(e)}"
raw_markdown = raw_markdown.replace(' ```', '```')
# Convert links to citations
markdown_with_citations: str = raw_markdown
references_markdown: str = ""
if citations:
try:
markdown_with_citations, references_markdown = self.convert_links_to_citations(
raw_markdown, base_url
)
except Exception as e:
markdown_with_citations = raw_markdown
references_markdown = f"Error generating citations: {str(e)}"
# Generate fit markdown if content filter is provided
fit_markdown: Optional[str] = ""
filtered_html: Optional[str] = ""
if content_filter or self.content_filter:
try:
content_filter = content_filter or self.content_filter
filtered_html = content_filter.filter_content(cleaned_html)
filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
fit_markdown = h.handle(filtered_html)
except Exception as e:
fit_markdown = f"Error generating fit markdown: {str(e)}"
filtered_html = ""
return MarkdownGenerationResult(
raw_markdown=raw_markdown or "",
markdown_with_citations=markdown_with_citations or "",
references_markdown=references_markdown or "",
fit_markdown=fit_markdown or "",
fit_html=filtered_html or "",
)
except Exception as e:
# If anything fails, return empty strings with error message
error_msg = f"Error in markdown generation: {str(e)}"
return MarkdownGenerationResult(
raw_markdown=error_msg,
markdown_with_citations=error_msg,
references_markdown="",
fit_markdown="",
fit_html="",
)