Spaces:
Running
Running
# markdown_utils.py
import re | |
def robust_clean_markdown(text):
    """
    Clean markdown text: strip code fences, normalize headings, collapse blank lines.

    Processing order:
      1. Remove opening code-fence markers (three backticks, an optional
         word-character language tag, then a newline) and closing markers
         (a newline followed by three backticks). Fenced content is kept.
      2. Rewrite every line starting with 1-6 '#' characters so the hashes are
         followed by exactly one space and the stripped title. A heading with
         no title is left as bare hashes.
      3. Collapse any run of blank / whitespace-only lines into one blank line.
      4. Strip leading and trailing whitespace from the result.

    Args:
        text: Markdown source as a string.

    Returns:
        The cleaned markdown string.
    """
    # Remove code fences with optional language specifiers.
    text = re.sub(r"```(?:\w+)?\n", "", text)
    text = re.sub(r"\n```", "", text)

    # Normalize heading formats: ensure exactly one space after '#' symbols.
    def fix_heading(match):
        hashes = match.group(1)
        title = match.group(2).strip()
        # An empty heading stays as bare hashes (no dangling trailing space).
        return f"{hashes} {title}" if title else hashes

    # Only spaces/tabs may separate the hashes from the title. Using \s here
    # would also match newlines, so an empty heading would swallow the blank
    # lines after it and absorb the next line of text as its title.
    text = re.sub(r"^(#{1,6})[ \t]*(.*)$", fix_heading, text, flags=re.MULTILINE)

    # Remove extra blank lines.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return text.strip()
def normalize_heading_levels(text):
    """
    Promote all headings so the shallowest one becomes level 1.

    A heading is a line starting with 1-6 '#' characters followed by a
    whitespace character. If the smallest heading in the document is '###',
    every heading is promoted by 2 levels. Text with no headings, or whose
    shallowest heading is already level 1, is returned unchanged.

    Args:
        text: Markdown source as a string.

    Returns:
        The markdown string with adjusted heading levels.
    """
    # Find all heading levels in the text.
    heading_levels = [
        len(match.group(1))
        for match in re.finditer(r"^(#{1,6})\s", text, flags=re.MULTILINE)
    ]

    # Number of levels to promote by; 0 when there is nothing to adjust.
    shift = min(heading_levels, default=1) - 1
    if shift <= 0:
        return text

    def adjust_heading(match):
        promoted = "#" * (len(match.group(1)) - shift)
        separator = match.group(2)
        # The separator after the hashes may be a line break (empty heading).
        # Keep it in that case; replacing it with a space would merge the
        # following line into the heading.
        return promoted + (separator if separator in "\r\n" else " ")

    return re.sub(r"^(#{1,6})(\s)", adjust_heading, text, flags=re.MULTILINE)