pdf-digest / utils /markdown_utils.py
RJuro's picture
Reinitialize repository without offending large file
d3fdae9
raw
history blame
1.61 kB
# markdown_utils.py
import re
def robust_clean_markdown(text):
    """
    Clean markdown text by removing code fences, normalizing headings, and collapsing blank lines.

    Steps, in order:
      1. Strip opening code fences (three backticks, an optional language word, and the
         following newline) and closing fences (a newline followed by three backticks).
         The content between the fences is kept.
      2. Rewrite every line that starts with 1-6 '#' characters so that exactly one space
         separates the hashes from the (stripped) title.
      3. Collapse any run of blank or whitespace-only lines into a single blank line.

    Args:
        text: The markdown source as a string.

    Returns:
        The cleaned markdown with leading and trailing whitespace removed.
    """
    # Remove code fences with optional language specifiers.
    text = re.sub(r"```(?:\w+)?\n", "", text)
    text = re.sub(r"\n```", "", text)

    # Normalize heading formats: ensure exactly one space after '#' symbols.
    def fix_heading(match):
        hashes = match.group(1)
        title = match.group(2).strip()
        # The separator space is emitted even for an empty title so the line
        # still matches heading patterns that require whitespace after the hashes.
        return f"{hashes} {title}"

    # `[ \t]*` (rather than `\s*`) keeps the match on a single line, so an empty
    # heading cannot swallow the blank line and text that follow it.
    # `(?!#)` leaves runs of seven or more hashes alone instead of splitting them
    # into a level-6 heading followed by a stray '#'.
    text = re.sub(
        r"^(#{1,6})(?!#)[ \t]*(.*)$",
        fix_heading,
        text,
        flags=re.MULTILINE,
    )

    # Remove extra blank lines.
    text = re.sub(r"\n\s*\n", "\n\n", text)
    return text.strip()
def normalize_heading_levels(text):
    """
    Promote all headings so that the highest-level heading in the text becomes level 1.

    The highest-level heading is the one with the fewest '#' characters. For example,
    if the smallest heading in the document is '###', every heading is promoted by
    2 levels ('###' -> '#', '####' -> '##', ...). Relative nesting is preserved.

    A heading is a line starting with 1-6 '#' characters followed by whitespace or the
    end of the line. Lines such as '#hashtag' or runs of seven or more hashes are not
    treated as headings.

    Args:
        text: The markdown source as a string.

    Returns:
        The text with heading levels shifted, or the text unchanged when it contains
        no headings or already has a level-1 heading.
    """
    # The lookahead checks the heading boundary without consuming it, so a newline
    # right after the hashes (an empty heading) is never eaten by the substitution.
    # The optional second group captures a same-line separator (space or tab) only.
    heading_pattern = re.compile(r"^(#{1,6})(?=\s|$)([ \t]?)", flags=re.MULTILINE)

    # Find all heading levels in the text.
    heading_levels = [len(match.group(1)) for match in heading_pattern.finditer(text)]
    if not heading_levels:
        return text

    # Only adjust if the minimum level is greater than 1.
    shift = min(heading_levels) - 1
    if shift == 0:
        return text

    def adjust_heading(match):
        new_level = len(match.group(1)) - shift
        # A space or tab separator is normalized to a single space; an empty
        # heading (no separator on its line) gets none.
        separator = " " if match.group(2) else ""
        return "#" * new_level + separator

    return heading_pattern.sub(adjust_heading, text)