# markdown_utils.py
"""Small regex-based helpers for tidying Markdown text."""

import re

# A fence marker, an optional word-like language tag, then a newline. Because
# the tag is optional, this also matches a bare closing fence that is followed
# by a newline.
_FENCE_THEN_NEWLINE_RE = re.compile(r"```(?:\w+)?\n")

# A fence marker directly preceded by a newline (e.g. a closing fence at the
# very end of the text, which has no newline after it).
_NEWLINE_THEN_FENCE_RE = re.compile(r"\n```")

# A line starting with 1-6 '#' that is NOT followed by a seventh '#'.
# Only spaces/tabs are skipped after the hashes: using `\s*` here would also
# match newlines and pull the following line into an empty heading.
_HEADING_LINE_RE = re.compile(r"^(#{1,6})(?!#)[ \t]*(.*)$", re.MULTILINE)

# Two newlines separated by nothing but whitespace (one or more blank lines).
_BLANK_RUN_RE = re.compile(r"\n\s*\n")

# Heading marker: 1-6 '#' at the start of a line followed by one whitespace
# character (captured so a line terminator can be preserved) or end of text.
_HEADING_MARKER_RE = re.compile(r"^(#{1,6})(\s|\Z)", re.MULTILINE)


def robust_clean_markdown(text: str) -> str:
    """Clean Markdown text.

    Steps, applied in order:

    1. Remove code-fence markers (with an optional word-like language tag);
       the fenced content itself is kept.
    2. Normalize heading lines so exactly one space separates the leading
       '#' run from the title, trimming whitespace around the title. A
       heading with no title is emitted as the bare '#' run.
    3. Collapse runs of blank (whitespace-only) lines into one blank line.
    4. Strip leading and trailing whitespace from the whole text.

    Note that fence markers are removed before headings are processed, so a
    line starting with '#' inside a former code block is treated like any
    other heading line.

    Args:
        text: The Markdown source to clean.

    Returns:
        The cleaned text.
    """
    text = _FENCE_THEN_NEWLINE_RE.sub("", text)
    text = _NEWLINE_THEN_FENCE_RE.sub("", text)

    def fix_heading(match):
        hashes = match.group(1)
        title = match.group(2).strip()
        # Avoid a dangling trailing space on headings that have no title.
        return f"{hashes} {title}" if title else hashes

    text = _HEADING_LINE_RE.sub(fix_heading, text)
    text = _BLANK_RUN_RE.sub("\n\n", text)
    return text.strip()


def normalize_heading_levels(text: str) -> str:
    """Promote headings so the shallowest heading becomes level 1.

    For example, if the smallest heading in the document is '###', every
    heading is promoted by two levels. If the text has no headings, or a
    level-1 heading already exists, the text is returned unchanged.

    A heading marker is a run of 1-6 '#' at the start of a line that is
    followed by whitespace or by the end of the text. The separator after
    the hashes is normalized to a single space, except that a line
    terminator is preserved so an empty heading never absorbs the next line.

    Args:
        text: The Markdown source to adjust.

    Returns:
        The text with heading levels shifted.
    """
    levels = [len(m.group(1)) for m in _HEADING_MARKER_RE.finditer(text)]
    if not levels:
        return text

    shift = min(levels) - 1
    if shift == 0:
        return text

    def promote(match):
        hashes = "#" * (len(match.group(1)) - shift)
        separator = match.group(2)
        # Keep line terminators (and the empty end-of-text match) untouched;
        # replacing them with a space would merge two lines.
        if separator in ("", "\n", "\r"):
            return hashes + separator
        return hashes + " "

    return _HEADING_MARKER_RE.sub(promote, text)