File size: 2,221 Bytes
1151f26
 
 
d70a98e
 
1151f26
d70a98e
 
1151f26
d70a98e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1151f26
 
 
 
 
 
d70a98e
 
 
 
1151f26
d70a98e
 
1151f26
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import gradio as gr
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

def scrape_and_convert(url, depth):
    """Fetch a page, recursively follow its internal links, and return Markdown.

    Args:
        url: Starting URL to scrape.
        depth: Maximum recursion depth; 0 means only the main page.
            Coerced to ``int`` because Gradio sliders deliver floats.

    Returns:
        The combined Markdown for all visited pages, or an error-message
        string if something fails at the top level.
    """
    try:
        visited_urls = set()

        def recursive_scrape(page_url, current_depth):
            """Scrape one page and, while depth remains, its internal links."""
            if page_url in visited_urls or current_depth < 0:
                return ""

            visited_urls.add(page_url)

            # Fetch HTML content; report per-page failures inline instead of
            # aborting the whole crawl.
            try:
                html_content = Scraper.fetch_html(page_url)
            except Exception as e:
                return f"Error fetching {page_url}: {str(e)}\n"

            # Convert the fetched HTML to Markdown.
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=page_url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth remains, extract internal links and recurse into them.
            if current_depth > 0:
                links = LinkExtractor.scrape_url(page_url, link_type=LinkType.INTERNAL)
                for link in links:
                    # Fix: skip already-visited links up front; previously the
                    # "## Extracted from:" header was emitted even when the
                    # recursive call returned "" for a visited page, leaving
                    # empty dangling sections in the output.
                    if link in visited_urls:
                        continue
                    markdown_content += f"\n\n## Extracted from: {link}\n"  
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # Start the recursive scraping process. int() normalises the float
        # value that Gradio sliders produce.
        result = recursive_scrape(url, int(depth))
        return result

    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, so any failure
        # is surfaced to the user as text rather than a traceback.
        return f"Error: {str(e)}"

# --- Gradio UI wiring -------------------------------------------------------

# Build the widgets up front so the Interface call stays readable.
url_input = gr.Textbox(label="Enter URL")
depth_input = gr.Slider(
    minimum=0,
    maximum=3,
    step=1,
    label="Search Depth (0 = Only main page)",
)
markdown_output = gr.Code(label="Markdown Output", language="markdown")

iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[url_input, depth_input],
    outputs=markdown_output,
    title="RAGScraper with Recursive Depth",
    description=(
        "Enter a URL and specify the search depth. The app will fetch, "
        "extract links, and convert HTML to Markdown."
    ),
)

# Launch the app only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()