# RAG-Scraper / app.py
# (Hugging Face Spaces page header preserved as comments: author CultriX,
#  commit d70a98e "Added recursion", raw / history / blame links, 2.22 kB)
import gradio as gr
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
def scrape_and_convert(url, depth):
    """Fetch a page, recursively follow its internal links, and return Markdown.

    Args:
        url: Starting URL to scrape.
        depth: Maximum recursion depth; 0 converts only the main page.
            Gradio sliders deliver floats, so the value is coerced to int
            before recursion.

    Returns:
        The combined Markdown for all visited pages, or an error-message
        string if scraping fails.
    """
    try:
        # Tracks every URL already processed so cyclic links and duplicate
        # references are crawled at most once.
        visited_urls = set()

        def recursive_scrape(page_url, current_depth):
            """Scrape one page and, while depth remains, its internal links."""
            if page_url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(page_url)

            # Fetch HTML; a per-page failure is reported inline instead of
            # aborting the whole crawl.
            try:
                html_content = Scraper.fetch_html(page_url)
            except Exception as e:
                return f"Error fetching {page_url}: {str(e)}\n"

            # Convert this page's HTML to Markdown.
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=page_url,
                parser_features='html.parser',
                ignore_links=True
            )

            # While depth remains, append each internal link's content.
            if current_depth > 0:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                for link in links:
                    markdown_content += f"\n\n## Extracted from: {link}\n"
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # gr.Slider passes a float; the recursion math expects an int.
        return recursive_scrape(url, int(depth))
    except Exception as e:
        return f"Error: {str(e)}"
# Assemble the Gradio UI: a URL textbox and a depth slider feed the scraper,
# and the resulting Markdown is shown in a syntax-highlighted code pane.
url_input = gr.Textbox(label="Enter URL")
depth_input = gr.Slider(
    minimum=0,
    maximum=3,
    step=1,
    label="Search Depth (0 = Only main page)",
)
markdown_output = gr.Code(label="Markdown Output", language="markdown")

iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[url_input, depth_input],
    outputs=markdown_output,
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown.",
)
# Launch the Gradio app
# Only start the web server when this file is executed directly
# (not when imported as a module, e.g. by the Spaces runtime).
if __name__ == "__main__":
    iface.launch()