from abc import ABC, abstractmethod
import re
from typing import Optional, Dict, Any, Tuple
from urllib.parse import urljoin

from .models import MarkdownGenerationResult
from .html2text import CustomHTML2Text
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter

# Pre-compiled pattern for markdown links and images:
#   group 1 = link text, group 2 = URL, group 3 = optional "title".
LINK_PATTERN = re.compile(r'!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)')


def fast_urljoin(base: str, url: str) -> str:
    """Resolve ``url`` against ``base``, short-circuiting already-absolute URLs.

    URLs starting with ``http://``, ``https://``, ``mailto:`` or ``//`` are
    returned unchanged. Everything else, including root-relative paths such
    as ``/about``, is resolved with ``urllib.parse.urljoin``.

    Root-relative paths used to be string-appended to ``base``, which
    produced wrong results whenever ``base`` carried a path or a query
    (``https://host/a/b`` + ``/c`` became ``https://host/a/b/c`` instead of
    ``https://host/c``).

    Args:
        base (str): URL of the document the link was found in.
        url (str): Link target, absolute or relative.

    Returns:
        str: The resolved URL.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    return urljoin(base, url)


class MarkdownGenerationStrategy(ABC):
    """Abstract base class for markdown generation strategies."""

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        """Store the default content filter and conversion options.

        Args:
            content_filter (Optional[RelevantContentFilter]): Filter kept on
                the instance as ``self.content_filter``.
            options (Optional[Dict[str, Any]]): Options kept on the instance
                as ``self.options``; a falsy value becomes an empty dict.
        """
        self.content_filter = content_filter
        self.options = options or {}

    @abstractmethod
    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """Generate markdown from cleaned HTML."""
        pass


class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Default implementation of markdown generation strategy.

    How it works:
    1. Generate raw markdown from cleaned HTML.
    2. Convert links to citations.
    3. Generate fit markdown if content filter is provided.
    4. Return MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Content filter for generating fit markdown.
        options (Optional[Dict[str, Any]]): Additional options for markdown generation. Defaults to None.

    Returns:
        MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Convert links in markdown to citations.

        How it works:
        1. Find all links (and images) in the markdown with ``LINK_PATTERN``.
        2. Replace each one with its text followed by a ``⟨n⟩`` marker, where
           ``n`` is assigned per distinct resolved URL in order of first
           appearance. Images become ``![text⟨n⟩]``.
        3. Return the converted markdown and a references section listing
           every URL once, in numbering order.

        Note:
            The references header is emitted even when no link was found.

        Args:
            markdown (str): Markdown text.
            base_url (str): Base URL for URL joins. When empty, URLs are left
                exactly as written.

        Returns:
            Tuple[str, str]: Converted markdown and references markdown.
        """
        link_map = {}   # resolved URL -> (citation number, description suffix)
        url_cache = {}  # raw URL -> resolved URL, so each join happens once
        parts = []
        last_end = 0
        counter = 1

        for match in LINK_PATTERN.finditer(markdown):
            # Keep the text between the previous link and this one untouched.
            parts.append(markdown[last_end:match.start()])
            text, url, title = match.groups()

            # Use cached URL if available, otherwise compute and cache
            if base_url and not url.startswith(('http://', 'https://', 'mailto:')):
                if url not in url_cache:
                    url_cache[url] = fast_urljoin(base_url, url)
                url = url_cache[url]

            if url not in link_map:
                # The description of a reference comes from the first
                # occurrence of its URL: title first, then the link text
                # unless it merely repeats the title.
                desc = []
                if title:
                    desc.append(title)
                if text and text != title:
                    desc.append(text)
                link_map[url] = (counter, ": " + " - ".join(desc) if desc else "")
                counter += 1

            num = link_map[url][0]
            if match.group(0).startswith('!'):
                parts.append(f"![{text}⟨{num}⟩]")
            else:
                parts.append(f"{text}⟨{num}⟩")
            last_end = match.end()

        parts.append(markdown[last_end:])
        converted_text = ''.join(parts)

        # Pre-build reference strings
        references = ["\n\n## References\n\n"]
        references.extend(
            f"⟨{num}⟩ {url}{desc}\n"
            for url, (num, desc) in sorted(link_map.items(), key=lambda x: x[1][0])
        )

        return converted_text, ''.join(references)

    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """
        Generate markdown with citations from cleaned HTML.

        How it works:
        1. Generate raw markdown from cleaned HTML.
        2. Convert links to citations.
        3. Generate fit markdown if content filter is provided.
        4. Return MarkdownGenerationResult.

        This method does not raise: every failure is reported as an error
        message placed in the corresponding field of the result.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values are treated
                as an empty string; other non-string values are passed
                through ``str()``.
            base_url (str): Base URL for URL joins.
            html2text_options (Optional[Dict[str, Any]]): HTML2Text options.
                Takes precedence over ``options`` and ``self.options``.
            options (Optional[Dict[str, Any]]): Additional options for markdown
                generation. Used only when ``html2text_options`` is falsy;
                ``self.options`` is used only when both are falsy. The three
                sources are not merged with each other.
            content_filter (Optional[RelevantContentFilter]): Content filter for
                generating fit markdown. Falls back to ``self.content_filter``.
            citations (bool): Whether to generate citations.

        Returns:
            MarkdownGenerationResult: Result containing raw markdown, fit markdown, fit HTML, and references markdown.
        """
        try:
            # Initialize HTML2Text with default options for better conversion
            h = CustomHTML2Text(baseurl=base_url)
            default_options = {
                'body_width': 0,  # Disable text wrapping
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False,
            }

            # Update with custom options if provided (first non-empty source wins)
            if html2text_options:
                default_options.update(html2text_options)
            elif options:
                default_options.update(options)
            elif self.options:
                default_options.update(self.options)

            h.update_params(**default_options)

            # Ensure we have valid input
            if not cleaned_html:
                cleaned_html = ""
            elif not isinstance(cleaned_html, str):
                cleaned_html = str(cleaned_html)

            # Generate raw markdown
            try:
                raw_markdown = h.handle(cleaned_html)
            except Exception as e:
                raw_markdown = f"Error converting HTML to markdown: {str(e)}"

            # NOTE(review): whitespace in the reviewed copy of this file was
            # collapsed; confirm the number of leading spaces in the first
            # literal against version control.
            raw_markdown = raw_markdown.replace(' ```', '```')

            # Convert links to citations
            markdown_with_citations: str = raw_markdown
            references_markdown: str = ""
            if citations:
                try:
                    markdown_with_citations, references_markdown = self.convert_links_to_citations(
                        raw_markdown, base_url
                    )
                except Exception as e:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(e)}"

            # Generate fit markdown if content filter is provided
            fit_markdown: Optional[str] = ""
            filtered_html: Optional[str] = ""
            if content_filter or self.content_filter:
                try:
                    content_filter = content_filter or self.content_filter
                    filtered_html = content_filter.filter_content(cleaned_html)
                    # NOTE(review): the wrapper literal was garbled in the
                    # reviewed copy (an unterminated string spanning several
                    # lines, apparently with markup stripped). A <div> wrapper
                    # per fragment is assumed here - confirm against version
                    # control.
                    filtered_html = '\n'.join('<div>{}</div>'.format(s) for s in filtered_html)
                    fit_markdown = h.handle(filtered_html)
                except Exception as e:
                    fit_markdown = f"Error generating fit markdown: {str(e)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as e:
            # If anything fails, return empty strings with error message
            error_msg = f"Error in markdown generation: {str(e)}"
            return MarkdownGenerationResult(
                raw_markdown=error_msg,
                markdown_with_citations=error_msg,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )