Added recursion

Files changed:
- app.py (+46, -16)
- rag_scraper/cli.py (+12, -36)
- rag_scraper/link_extractor.py (+34, -42)

app.py
CHANGED
@@ -1,31 +1,61 @@

import gradio as gr
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""

            visited_urls.add(url)

            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"

            # Convert to Markdown
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth > 0, extract links and process them
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    markdown_content += f"\n\n## Extracted from: {link}\n"
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # Start the recursive scraping process
        result = recursive_scrape(url, depth)
        return result

    except Exception as e:
        return f"Error: {str(e)}"

# Define Gradio interface
iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[
        gr.Textbox(label="Enter URL"),
        gr.Slider(minimum=0, maximum=3, step=1, label="Search Depth (0 = Only main page)")
    ],
    outputs=gr.Code(label="Markdown Output", language="markdown"),
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown."
)

# Launch the Gradio app

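The recursion lives in the nested recursive_scrape closure, which shares a single visited_urls set so each page is fetched and converted at most once. For a quick sanity check of the new depth parameter outside the Gradio UI, the handler can be called directly; a minimal sketch, assuming scrape_and_convert from the app.py above is in scope, with an illustrative URL:

# Sketch only: exercise the recursive handler directly; URL and depth are illustrative.
# depth=0 converts just the given page; depth=1 also follows internal links found on it.
markdown = scrape_and_convert("https://example.com", depth=1)
print(markdown[:500])  # preview the first 500 characters of the combined Markdown
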
rag_scraper/cli.py
CHANGED
@@ -1,62 +1,38 @@

import argparse
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.scraper import Scraper
from rag_scraper.utils import URLUtils

def main():
    parser = argparse.ArgumentParser(
        description="RAGScraper: Scrape webpages, extract links, and convert to markdown."
    )
    parser.add_argument("url", help="The URL of the webpage to scrape.")
    parser.add_argument("--element_id", help="The ID of the element to search for links.", default=None)
    parser.add_argument("--element_type", help='The type of element to search for links. Default is "nav".', default="nav")
    parser.add_argument("--convert", help="Convert the webpage to markdown.", action="store_true")
    parser.add_argument("--extract", help="Extract links from the specified element.", action="store_true")
    parser.add_argument("--depth", type=int, default=0, help="Set search depth for link extraction.")

    args = parser.parse_args()

    base_url = URLUtils.get_base_url(args.url)

    if args.extract:
        # Extract links recursively
        links = LinkExtractor.scrape_url(args.url, link_type=LinkType.INTERNAL, depth=args.depth)
        print(f"Unique links for {args.url}:")
        for link in links:
            print(link)
    elif args.convert:
        # Convert to markdown
        html_content = Scraper.fetch_html(args.url)
        markdown_content = Converter.html_to_markdown(html_content, base_url)
        print(markdown_content)
    else:
        print("Please specify an action: --convert for markdown conversion or --extract for link extraction.")

if __name__ == "__main__":
    main()

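The new --depth flag is forwarded to LinkExtractor.scrape_url when --extract is given; --element_id and --element_type are still accepted but no longer passed through. A small sketch of driving the CLI programmatically, roughly equivalent to running the module from a shell; the URL and argument values are illustrative and assume the rag_scraper package is importable:

# Sketch only: invoke the CLI entry point without a shell.
# Roughly equivalent to: python -m rag_scraper.cli https://example.com --extract --depth 2
import sys
from rag_scraper import cli

sys.argv = ["rag-scraper", "https://example.com", "--extract", "--depth", "2"]  # illustrative arguments
cli.main()  # parses sys.argv[1:] via argparse and prints the extracted links
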
rag_scraper/link_extractor.py
CHANGED
@@ -1,70 +1,62 @@

from enum import Enum, auto
from typing import Set
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class LinkType(Enum):
    ALL = auto()
    INTERNAL = auto()
    EXTERNAL = auto()


class LinkExtractor:
    @staticmethod
    def scrape_url(url: str, link_type: LinkType = LinkType.ALL, depth: int = 0, visited_urls: Set[str] = None) -> Set[str]:
        """
        Scrape a given URL for unique links within a specified element,
        with recursive depth support.

        :param url: The URL of the website to scrape.
        :param link_type: The type of links to scrape (ALL, INTERNAL, EXTERNAL).
        :param depth: The recursion depth for extracting links.
        :param visited_urls: A set to keep track of visited URLs.
        :return: A set of unique link URLs found.
        """
        if visited_urls is None:
            visited_urls = set()

        if url in visited_urls or depth < 0:
            return set()

        visited_urls.add(url)

        base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
        extracted_links = set()

        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                absolute_url = urljoin(url, href)
                domain = urlparse(absolute_url).netloc

                if link_type == LinkType.INTERNAL and domain == urlparse(base_url).netloc:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.EXTERNAL and domain != urlparse(base_url).netloc:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.ALL:
                    extracted_links.add(absolute_url)

            # Recursive scraping if depth > 0
            if depth > 0:
                for link in extracted_links.copy():
                    extracted_links.update(LinkExtractor.scrape_url(link, link_type, depth - 1, visited_urls))

            return extracted_links
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            return set()

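Since scrape_url now owns the visited_urls bookkeeping and the depth cutoff itself, it can be called directly for shallow recursive crawls; a minimal sketch using only the API shown above, with an illustrative URL (note that it performs real HTTP requests, so point it at a site you control):

# Sketch only: collect internal links one level deep; the URL is illustrative.
from rag_scraper.link_extractor import LinkExtractor, LinkType

links = LinkExtractor.scrape_url("https://example.com", link_type=LinkType.INTERNAL, depth=1)
for link in sorted(links):
    print(link)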