Spaces:

RJuro
/

pdf-digest

Running

pdf-digest / utils /markdown_utils.py

Reinitialize repository without offending large file

d3fdae9 21 days ago

1.61 kB

	# markdown_utils.py

	import re

	def robust_clean_markdown(text: str) -> str:
	    """
	    Clean markdown text.

	    Three passes are applied in order:

	    1. Code-fence markers are removed: an opening fence (three backticks,
	       an optional language word, then a newline) and a closing fence (a
	       newline followed by three backticks). The fenced content is kept.
	    2. Headings (1-6 '#' characters followed by spaces/tabs and a non-empty
	       title on the same line) are rewritten as ``<hashes> <title>`` with
	       exactly one space and no surrounding whitespace on the title.
	       Lines such as ``#hashtag`` (no whitespace after the hashes) and
	       hash-only lines are left untouched.
	    3. Any run of blank / whitespace-only lines collapses to one blank line.

	    Args:
	        text: The markdown source to clean.

	    Returns:
	        The cleaned text with leading and trailing whitespace stripped.
	    """
	    # Remove code fences with optional language specifiers.
	    text = re.sub(r"```(?:\w+)?\n", "", text)
	    text = re.sub(r"\n```", "", text)

	    # Normalize heading formats: ensure exactly one space after '#' symbols.
	    def fix_heading(match):
	        hashes = match.group(1)
	        title = match.group(2).strip()
	        return f"{hashes} {title}"

	    # [ \t]+ (not \s) keeps the match on a single line, so a hash-only line
	    # is never merged with the line that follows it; (\S.*) captures the
	    # whole title rather than a single character.
	    text = re.sub(
	        r"^(#{1,6})[ \t]+(\S.*)$", fix_heading, text, flags=re.MULTILINE
	    )

	    # Remove extra blank lines.
	    text = re.sub(r"\n\s*\n", "\n\n", text)
	    return text.strip()

	def normalize_heading_levels(text: str) -> str:
	    """
	    Promote headings so the shallowest heading in the text becomes level 1.

	    A heading is any line that starts with 1-6 '#' characters followed by a
	    whitespace character. If the smallest number of '#' characters found is
	    ``k > 1``, every heading loses ``k - 1`` hashes (e.g. when the smallest
	    heading is '###', all headings are promoted by 2 levels). Text with no
	    headings, or whose shallowest heading is already level 1, is returned
	    unchanged.

	    The whitespace character following the hashes is rewritten as a single
	    space, except when it is a line break: an empty heading such as
	    ``"##"`` on its own line keeps its newline so the next line is not
	    pulled up into the heading.

	    Note: lines inside fenced code blocks are not treated specially; a line
	    there that starts with '# ' is counted as a heading.

	    Args:
	        text: The markdown source to adjust.

	    Returns:
	        The text with heading levels shifted as described.
	    """
	    heading_pattern = re.compile(r"^(#{1,6})(\s)", flags=re.MULTILINE)

	    # Find all heading levels in the text.
	    heading_levels = [len(m.group(1)) for m in heading_pattern.finditer(text)]
	    if not heading_levels:
	        return text

	    # Only adjust if the minimum level is greater than 1.
	    shift = min(heading_levels) - 1
	    if shift == 0:
	        return text

	    def adjust_heading(match):
	        hashes, separator = match.groups()
	        # Preserve line breaks; normalize any other whitespace to a space.
	        if separator not in "\r\n":
	            separator = " "
	        return "#" * (len(hashes) - shift) + separator

	    return heading_pattern.sub(adjust_heading, text)