import gradio as gr

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils
def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to the given depth), and convert it to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            # Skip pages already visited and stop once the depth budget is spent.
            if url in visited_urls or current_depth < 0:
                return ""
            visited_urls.add(url)

            # Fetch the HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"

            # Convert the HTML to Markdown
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth remains, extract internal links and process them recursively
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    markdown_content += f"\n\n## Extracted from: {link}\n"
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # Start the recursive scraping process at the requested depth
        return recursive_scrape(url, depth)
    except Exception as e:
        return f"Error: {str(e)}"
# Define the Gradio interface
iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[
        gr.Textbox(label="Enter URL"),
        gr.Slider(minimum=0, maximum=3, step=1, value=0, label="Search Depth (0 = main page only)")
    ],
    outputs=gr.Code(label="Markdown Output", language="markdown"),
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app fetches the page, extracts internal links, and converts the HTML to Markdown."
)
# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()
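
# On a Hugging Face Space the default launch() settings are fine; when running
# locally, options such as iface.launch(share=True) (temporary public link) or
# iface.launch(server_port=7860) can be passed instead (7860 is Gradio's
# default port).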