Spaces:

RJuro
/

pdf-digest

Running

File size: 1,613 Bytes

d3fdae9

# markdown_utils.py

import re

def robust_clean_markdown(text: str) -> str:
    """Clean markdown by stripping code fences, tidying headings and blank lines.

    Three passes run in order:

    1. Code-fence markers are removed: an opening fence (three backticks, an
       optional word-character language tag, then a newline) and a closing
       fence (a newline followed by three backticks). The fenced content
       itself is kept.
    2. Lines starting with 1-6 ``#`` characters are rewritten as
       ``<hashes> <title>`` with exactly one space and a stripped title.
       A heading with no title becomes ``<hashes> `` and stays on its own
       line. Runs of seven or more ``#`` are left untouched.
    3. Runs of blank or whitespace-only lines collapse to one blank line.

    Args:
        text: Markdown source to clean.

    Returns:
        The cleaned markdown with leading and trailing whitespace removed.
    """
    # Remove code fences with optional language specifiers.
    text = re.sub(r"```(?:\w+)?\n", "", text)
    text = re.sub(r"\n```", "", text)

    # Normalize heading formats: ensure exactly one space after '#' symbols.
    def fix_heading(match):
        hashes = match.group(1)
        title = match.group(2).strip()
        return f"{hashes} {title}"

    # The separator is limited to spaces/tabs on purpose: ``\s*`` would also
    # match newlines, so a heading line with no title would swallow the line
    # break and absorb the next non-blank line as its title.
    # ``(?!#)`` keeps a run of 7+ hashes from being split into a level-6
    # heading whose title starts with '#'.
    text = re.sub(
        r"^(#{1,6})(?!#)[ \t]*(.*)$", fix_heading, text, flags=re.MULTILINE
    )

    # Remove extra blank lines.
    text = re.sub(r'\n\s*\n', '\n\n', text)
    return text.strip()

def normalize_heading_levels(text: str) -> str:
    """Promote all headings so the shallowest one becomes level 1.

    A heading is a line that starts with 1-6 ``#`` characters followed by a
    whitespace character. If the smallest heading level found is ``k > 1``,
    every heading is promoted by ``k - 1`` levels; for example, when the
    shallowest heading is ``###``, all headings lose two ``#`` characters.
    Text without headings, or whose shallowest heading is already level 1,
    is returned unchanged.

    Args:
        text: Markdown source whose heading levels should be normalized.

    Returns:
        The markdown with heading levels shifted; non-heading text is not
        modified.
    """
    # The whitespace character after the hashes is captured separately so the
    # replacement can tell a real separator from a line break.
    heading_pattern = re.compile(r"^(#{1,6})(\s)", flags=re.MULTILINE)

    heading_levels = [len(m.group(1)) for m in heading_pattern.finditer(text)]
    if not heading_levels:
        return text

    shift = min(heading_levels) - 1
    # Only adjust if the minimum level is greater than 1.
    if shift == 0:
        return text

    def adjust_heading(match):
        separator = match.group(2)
        # A title-less heading is followed directly by its line break; keep
        # it, otherwise the heading would merge with the next line. Any other
        # whitespace is normalized to a single space, as before.
        if separator not in ("\n", "\r"):
            separator = " "
        return "#" * (len(match.group(1)) - shift) + separator

    return heading_pattern.sub(adjust_heading, text)