|
from abc import ABC, abstractmethod |
|
from typing import Optional, Dict, Any, Tuple |
|
from .models import MarkdownGenerationResult |
|
from .html2text import CustomHTML2Text |
|
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter |
|
import re |
|
from urllib.parse import urljoin |
|
|
|
|
|
# Inline markdown link or image: an optional leading "!", the bracketed text,
# then the parenthesised URL with an optional double-quoted title.
# Capture groups: 1 = link text, 2 = URL, 3 = title (None when absent).
LINK_PATTERN = re.compile(
    r'!?\[([^\]]+)\]'          # optional "!" followed by [text]
    r'\(([^)]+?)'              # opening "(" and the URL (lazy)
    r'(?:\s+"([^"]*)")?\)'     # optional "title", then closing ")"
)
|
|
|
def fast_urljoin(base: str, url: str) -> str:
    """Join ``url`` onto ``base``, short-circuiting the common cases.

    Args:
        base: URL the link is relative to.
        url: Link target, absolute or relative.

    Returns:
        ``url`` unchanged when it starts with ``http://``, ``https://``,
        ``mailto:`` or ``//``; otherwise the URL resolved against ``base``.
    """
    if url.startswith(('http://', 'https://', 'mailto:', '//')):
        return url
    if url.startswith('/'):
        # A root-relative path replaces the base's entire path, so it has to
        # be resolved against the base's origin rather than appended to the
        # base string ("https://host/a/b" + "/c" must give "https://host/c").
        resolved = urljoin(base, url)
        if resolved != url:
            return resolved
        # urljoin had nothing to resolve against (empty base, or a base
        # without scheme/host): keep the plain concatenation, dropping one
        # slash so the result does not contain "//".
        if base.endswith('/'):
            return base[:-1] + url
        return base + url
    return urljoin(base, url)
|
|
|
class MarkdownGenerationStrategy(ABC):
    """Interface implemented by every markdown generation strategy.

    The constructor stores two attributes on the instance:
        content_filter: the filter that was passed in (may be None).
        options: the options mapping that was passed in, or a new empty
            dict when none (or an empty one) was given.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        self.options = options if options else {}
        self.content_filter = content_filter

    @abstractmethod
    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """Turn cleaned HTML into a MarkdownGenerationResult.

        Concrete strategies must override this method.
        """
|
|
|
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    """
    Markdown generation strategy built on ``CustomHTML2Text``.

    Pipeline:
        1. Convert the cleaned HTML to raw markdown.
        2. Rewrite inline links as numbered citations and build a references section.
        3. Produce "fit" markdown from the content filter's output, when a filter is available.
        4. Return everything bundled in a MarkdownGenerationResult.

    Args:
        content_filter (Optional[RelevantContentFilter]): Filter used to produce fit markdown.
        options (Optional[Dict[str, Any]]): Converter options used when a call supplies none. Defaults to None.
    """

    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(content_filter, options)

    def convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]:
        """
        Replace inline markdown links with numbered citation markers.

        Every match of ``LINK_PATTERN`` is replaced by its text followed by
        ``⟨n⟩`` (image matches become ``![text⟨n⟩]``). The same URL always
        gets the same number, and numbers are handed out in order of first
        appearance.

        Args:
            markdown (str): Markdown text to scan.
            base_url (str): When non-empty, URLs that do not start with
                ``http://``, ``https://`` or ``mailto:`` are resolved
                against it with ``fast_urljoin``.

        Returns:
            Tuple[str, str]: The rewritten markdown, and a references section
            that starts with a "## References" heading followed by one
            ``⟨n⟩ url: description`` line per distinct URL.
        """
        absolute_prefixes = ('http://', 'https://', 'mailto:')
        citation_index = {}   # resolved URL -> (citation number, description suffix)
        resolved_urls = {}    # URL as written -> URL resolved against base_url
        chunks = []
        cursor = 0

        for found in LINK_PATTERN.finditer(markdown):
            # Copy the text between the previous link and this one verbatim.
            chunks.append(markdown[cursor:found.start()])
            cursor = found.end()
            label, target, title = found.groups()

            if base_url and not target.startswith(absolute_prefixes):
                if target not in resolved_urls:
                    resolved_urls[target] = fast_urljoin(base_url, target)
                target = resolved_urls[target]

            if target not in citation_index:
                # Description is "title - text", skipping the text when it
                # merely repeats the title.
                pieces = [title] if title else []
                if label and label != title:
                    pieces.append(label)
                suffix = ": " + " - ".join(pieces) if pieces else ""
                citation_index[target] = (len(citation_index) + 1, suffix)

            number = citation_index[target][0]
            if found.group(0).startswith('!'):
                chunks.append(f"![{label}⟨{number}⟩]")
            else:
                chunks.append(f"{label}⟨{number}⟩")

        # Whatever follows the final link.
        chunks.append(markdown[cursor:])

        reference_lines = ["\n\n## References\n\n"]
        ordered = sorted(citation_index.items(), key=lambda entry: entry[1][0])
        for target, (number, suffix) in ordered:
            reference_lines.append(f"⟨{number}⟩ {target}{suffix}\n")

        return ''.join(chunks), ''.join(reference_lines)

    def generate_markdown(
        self,
        cleaned_html: str,
        base_url: str = "",
        html2text_options: Optional[Dict[str, Any]] = None,
        options: Optional[Dict[str, Any]] = None,
        content_filter: Optional[RelevantContentFilter] = None,
        citations: bool = True,
        **kwargs,
    ) -> MarkdownGenerationResult:
        """
        Build raw, cited and fit markdown from cleaned HTML.

        Failures never propagate: each stage that raises has its output
        replaced by an error message string, and any other exception yields
        a result whose raw and cited markdown both carry the error message.

        Args:
            cleaned_html (str): Cleaned HTML content. Falsy values are treated
                as an empty string; non-string values are passed through ``str``.
            base_url (str): Base URL given to the converter and used for URL joins.
            html2text_options (Optional[Dict[str, Any]]): Converter options; take
                precedence over ``options`` and the instance options.
            options (Optional[Dict[str, Any]]): Converter options used when
                ``html2text_options`` is empty.
            content_filter (Optional[RelevantContentFilter]): Filter for fit
                markdown; falls back to the instance's filter.
            citations (bool): Whether to convert links to citations.

        Returns:
            MarkdownGenerationResult: Raw markdown, markdown with citations,
            references markdown, fit markdown and fit HTML.
        """
        try:
            converter = CustomHTML2Text(baseurl=base_url)
            settings = {
                'body_width': 0,
                'ignore_emphasis': False,
                'ignore_links': False,
                'ignore_images': False,
                'protect_links': True,
                'single_line_break': True,
                'mark_code': True,
                'escape_snob': False
            }

            # Only the first non-empty option source is applied on top of the
            # defaults: per-call html2text options, then per-call options,
            # then the options stored on the instance.
            overrides = html2text_options or options or self.options
            if overrides:
                settings.update(overrides)
            converter.update_params(**settings)

            # Normalise the input to a string.
            if not cleaned_html:
                html = ""
            elif isinstance(cleaned_html, str):
                html = cleaned_html
            else:
                html = str(cleaned_html)

            # Stage 1: raw markdown.
            try:
                raw_markdown = converter.handle(html)
            except Exception as exc:
                raw_markdown = f"Error converting HTML to markdown: {str(exc)}"
            raw_markdown = raw_markdown.replace(' ```', '```')

            # Stage 2: citations (optional).
            markdown_with_citations = raw_markdown
            references_markdown = ""
            if citations:
                try:
                    converted = self.convert_links_to_citations(raw_markdown, base_url)
                    markdown_with_citations, references_markdown = converted
                except Exception as exc:
                    markdown_with_citations = raw_markdown
                    references_markdown = f"Error generating citations: {str(exc)}"

            # Stage 3: fit markdown (only when some filter is available).
            fit_markdown = ""
            filtered_html = ""
            active_filter = content_filter or self.content_filter
            if active_filter:
                try:
                    fragments = active_filter.filter_content(html)
                    # Each item returned by the filter is wrapped in its own <div>.
                    filtered_html = '\n'.join(f'<div>{s}</div>' for s in fragments)
                    fit_markdown = converter.handle(filtered_html)
                except Exception as exc:
                    fit_markdown = f"Error generating fit markdown: {str(exc)}"
                    filtered_html = ""

            return MarkdownGenerationResult(
                raw_markdown=raw_markdown or "",
                markdown_with_citations=markdown_with_citations or "",
                references_markdown=references_markdown or "",
                fit_markdown=fit_markdown or "",
                fit_html=filtered_html or "",
            )
        except Exception as exc:
            # Anything not handled by a stage above is reported in the result.
            failure = f"Error in markdown generation: {str(exc)}"
            return MarkdownGenerationResult(
                raw_markdown=failure,
                markdown_with_citations=failure,
                references_markdown="",
                fit_markdown="",
                fit_html="",
            )
|
|