# RAG-Scraper / app.py
# (Hugging Face Spaces page header preserved as comments: author CultriX,
#  commit d70a98e "Added recursion", raw / history / blame links, 2.22 kB)
import gradio as gr
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
def scrape_and_convert(url, depth):
    """Fetch a page, recursively follow its internal links, and return Markdown.

    Args:
        url: Starting URL to scrape.
        depth: Maximum recursion depth; 0 converts only the main page.
            Gradio sliders deliver floats, so the value is coerced to int
            before recursion.

    Returns:
        The combined Markdown for all visited pages, or an error-message
        string if scraping fails.
    """
    try:
        # Tracks every URL already processed so cyclic links and duplicate
        # references are crawled at most once.
        visited_urls = set()

        def recursive_scrape(page_url, current_depth):
            """Scrape one page and, while depth remains, its internal links."""
            if page_url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(page_url)

            # Fetch HTML; a per-page failure is reported inline instead of
            # aborting the whole crawl.
            try:
                html_content = Scraper.fetch_html(page_url)
            except Exception as e:
                return f"Error fetching {page_url}: {str(e)}\n"

            # Convert this page's HTML to Markdown.
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=page_url,
                parser_features='html.parser',
                ignore_links=True
            )

            # While depth remains, append each internal link's content.
            if current_depth > 0:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                for link in links:
                    markdown_content += f"\n\n## Extracted from: {link}\n"
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # gr.Slider passes a float; the recursion math expects an int.
        return recursive_scrape(url, int(depth))
    except Exception as e:
        return f"Error: {str(e)}"
# Assemble the Gradio UI: a URL textbox and a depth slider feed the scraper,
# and the resulting Markdown is shown in a syntax-highlighted code pane.
url_input = gr.Textbox(label="Enter URL")
depth_input = gr.Slider(
    minimum=0,
    maximum=3,
    step=1,
    label="Search Depth (0 = Only main page)",
)
markdown_output = gr.Code(label="Markdown Output", language="markdown")

iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[url_input, depth_input],
    outputs=markdown_output,
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown.",
)
# Launch the Gradio app
# Only start the web server when this file is executed directly
# (not when imported as a module, e.g. by the Spaces runtime).
if __name__ == "__main__":
    iface.launch()