import gradio as gr

from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils


def scrape_and_convert(url, depth):
    """Fetch a page, recursively follow internal links up to *depth*, and
    return everything converted to one Markdown string.

    Args:
        url: Starting URL to scrape.
        depth: Maximum recursion depth; 0 means only the main page.
            Gradio's Slider delivers a float even with ``step=1``, so the
            value is coerced to ``int`` before use.

    Returns:
        The combined Markdown output, or an ``"Error: ..."`` /
        ``"Error fetching ..."`` message string on failure.
    """
    try:
        depth = int(depth)  # Slider values arrive as float; normalize once.
        visited_urls = set()  # cycle guard: never scrape the same URL twice

        def recursive_scrape(page_url, remaining_depth):
            """Scrape one page and (if budget remains) its internal links."""
            # Stop on revisits and when the depth budget is exhausted.
            if page_url in visited_urls or remaining_depth < 0:
                return ""
            visited_urls.add(page_url)

            # Fetch HTML; a per-page failure is reported inline rather than
            # aborting the whole crawl.
            try:
                html_content = Scraper.fetch_html(page_url)
            except Exception as e:
                return f"Error fetching {page_url}: {str(e)}\n"

            # Convert this page to Markdown.
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=page_url,
                parser_features='html.parser',
                ignore_links=True
            )

            # Recurse into internal links while depth remains. Only emit the
            # section header when the child actually produced content —
            # otherwise already-visited links leave empty headings behind.
            if remaining_depth > 0:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                for link in links:
                    child_markdown = recursive_scrape(link, remaining_depth - 1)
                    if child_markdown:
                        markdown_content += f"\n\n## Extracted from: {link}\n"
                        markdown_content += child_markdown

            return markdown_content

        # Kick off the crawl from the root URL.
        return recursive_scrape(url, depth)
    except Exception as e:
        # Top-level boundary: surface any unexpected failure to the UI
        # instead of crashing the Gradio worker.
        return f"Error: {str(e)}"


# Define Gradio interface
iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[
        gr.Textbox(label="Enter URL"),
        gr.Slider(minimum=0, maximum=3, step=1, label="Search Depth (0 = Only main page)")
    ],
    outputs=gr.Code(label="Markdown Output", language="markdown"),
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown."
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()