CultriX committed
Commit d70a98e · 1 Parent(s): f74a355

Added recursion

Files changed (3)
  1. app.py +46 -16
  2. rag_scraper/cli.py +12 -36
  3. rag_scraper/link_extractor.py +34 -42
app.py CHANGED
```diff
@@ -1,31 +1,61 @@
 import gradio as gr
 from rag_scraper.scraper import Scraper
 from rag_scraper.converter import Converter
+from rag_scraper.link_extractor import LinkExtractor, LinkType
+from rag_scraper.utils import URLUtils
 
-def scrape_and_convert(url):
-    """Fetch HTML content and convert it to Markdown."""
+def scrape_and_convert(url, depth):
+    """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
     try:
-        # Fetch HTML content
-        html_content = Scraper.fetch_html(url)
-
-        # Convert to Markdown
-        markdown_content = Converter.html_to_markdown(
-            html=html_content,
-            base_url=url,
-            parser_features='html.parser',
-            ignore_links=True
-        )
-        return markdown_content
+        visited_urls = set()
+
+        def recursive_scrape(url, current_depth):
+            """Recursively scrape and convert pages up to the given depth."""
+            if url in visited_urls or current_depth < 0:
+                return ""
+
+            visited_urls.add(url)
+
+            # Fetch HTML content
+            try:
+                html_content = Scraper.fetch_html(url)
+            except Exception as e:
+                return f"Error fetching {url}: {str(e)}\n"
+
+            # Convert to Markdown
+            markdown_content = Converter.html_to_markdown(
+                html=html_content,
+                base_url=url,
+                parser_features='html.parser',
+                ignore_links=True
+            )
+
+            # If depth > 0, extract links and process them
+            if current_depth > 0:
+                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
+                for link in links:
+                    markdown_content += f"\n\n## Extracted from: {link}\n"
+                    markdown_content += recursive_scrape(link, current_depth - 1)
+
+            return markdown_content
+
+        # Start the recursive scraping process
+        result = recursive_scrape(url, depth)
+        return result
+
     except Exception as e:
         return f"Error: {str(e)}"
 
 # Define Gradio interface
 iface = gr.Interface(
     fn=scrape_and_convert,
-    inputs=gr.Textbox(label="Enter URL"),
+    inputs=[
+        gr.Textbox(label="Enter URL"),
+        gr.Slider(minimum=0, maximum=3, step=1, label="Search Depth (0 = Only main page)")
+    ],
     outputs=gr.Code(label="Markdown Output", language="markdown"),
-    title="RAGScraper",
-    description="Enter a URL to scrape and convert its content into Markdown format."
+    title="RAGScraper with Recursive Depth",
+    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown."
 )
 
 # Launch the Gradio app
```
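For reference, here is a minimal sketch of what one recursion step in the new `scrape_and_convert` handler does, exercised outside the Gradio UI. It reuses the same calls the handler makes; `https://example.com` is a placeholder URL, and a working network connection plus an installed `rag_scraper` package are assumed.

```python
# Sketch of one recursion step, using the same calls the new handler makes.
# Placeholder URL; requires network access and the rag_scraper package.
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType

url = "https://example.com"

# Depth 0: fetch and convert only the given page.
html = Scraper.fetch_html(url)
markdown = Converter.html_to_markdown(
    html=html, base_url=url, parser_features="html.parser", ignore_links=True
)

# Depth 1: the handler additionally follows each internal link once,
# appending a heading and that page's converted Markdown.
for link in LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL):
    markdown += f"\n\n## Extracted from: {link}\n"
    markdown += Converter.html_to_markdown(
        html=Scraper.fetch_html(link), base_url=link,
        parser_features="html.parser", ignore_links=True,
    )

print(markdown[:300])  # preview the combined Markdown output
```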
rag_scraper/cli.py CHANGED
```diff
@@ -1,62 +1,38 @@
 import argparse
-
 from rag_scraper.converter import Converter
-from rag_scraper.link_extractor import LinkExtractor
+from rag_scraper.link_extractor import LinkExtractor, LinkType
 from rag_scraper.scraper import Scraper
 from rag_scraper.utils import URLUtils
 
-
 def main():
     parser = argparse.ArgumentParser(
-        description="RAGScraper: A tool to scrape, extract links, and convert webpages to markdown."
+        description="RAGScraper: Scrape webpages, extract links, and convert to markdown."
     )
-
     parser.add_argument("url", help="The URL of the webpage to scrape.")
-    parser.add_argument(
-        "--element_id",
-        help="The ID of the element to search for links.",
-        default=None,
-    )
-    parser.add_argument(
-        "--element_type",
-        help='The type of the element to search for links. Default is "nav".',
-        default="nav",
-    )
-    parser.add_argument(
-        "--convert",
-        help="Convert the webpage to markdown.",
-        action="store_true",
-    )
-    parser.add_argument(
-        "--extract",
-        help="Extract links from the specified element.",
-        action="store_true",
-    )
+    parser.add_argument("--element_id", help="The ID of the element to search for links.", default=None)
+    parser.add_argument("--element_type", help='The type of element to search for links. Default is "nav".', default="nav")
+    parser.add_argument("--convert", help="Convert the webpage to markdown.", action="store_true")
+    parser.add_argument("--extract", help="Extract links from the specified element.", action="store_true")
+    parser.add_argument("--depth", type=int, default=0, help="Set search depth for link extraction.")
 
     args = parser.parse_args()
 
     base_url = URLUtils.get_base_url(args.url)
 
     if args.extract:
-        # Extract links if the flag is set
-        links = LinkExtractor.scrape_url(
-            args.url,
-            element_id=args.element_id,
-            element_type=args.element_type,
-        )
+        # Extract links recursively
+        links = LinkExtractor.scrape_url(args.url, link_type=LinkType.INTERNAL, depth=args.depth)
         print(f"Unique links for {args.url}:")
         for link in links:
             print(link)
     elif args.convert:
-        # Convert to markdown if the flag is set
+        # Convert to markdown
         html_content = Scraper.fetch_html(args.url)
         markdown_content = Converter.html_to_markdown(html_content, base_url)
         print(markdown_content)
     else:
-        print(
-            "Please specify an action: --convert for markdown conversion or --extract for link extraction."
-        )
-
+        print("Please specify an action: --convert for markdown conversion or --extract for link extraction.")
 
 if __name__ == "__main__":
     main()
+
```
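A hedged usage sketch for the extended CLI: the snippet below exercises the new `--depth` flag in-process by patching `sys.argv` instead of shelling out. The exact console invocation (e.g. `python -m rag_scraper.cli ...`) is an assumption based on the module path, and `https://example.com` is a placeholder URL.

```python
# Exercise the CLI's new --depth flag in-process by patching sys.argv.
# Assumes rag_scraper is importable; the URL is a placeholder.
import sys
from rag_scraper import cli

# Roughly equivalent to: python -m rag_scraper.cli https://example.com --extract --depth 1
sys.argv = ["rag-scraper", "https://example.com", "--extract", "--depth", "1"]
cli.main()  # prints the unique internal links found up to depth 1
```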
rag_scraper/link_extractor.py CHANGED
```diff
@@ -1,70 +1,62 @@
 from enum import Enum, auto
 from typing import Set
 from urllib.parse import urljoin, urlparse
-
 import requests
 from bs4 import BeautifulSoup
 
-
 class LinkType(Enum):
     ALL = auto()
     INTERNAL = auto()
     EXTERNAL = auto()
 
-
 class LinkExtractor:
     @staticmethod
-    def scrape_url(
-        url: str, link_type: LinkType = LinkType.ALL, **kwargs
-    ) -> Set[str]:
+    def scrape_url(url: str, link_type: LinkType = LinkType.ALL, depth: int = 0, visited_urls: Set[str] = None) -> Set[str]:
         """
-        Scrape a given URL for unique links within a specified element, with an option to choose between internal, external, or all links.
-        Converts relative URLs to absolute URLs.
+        Scrape a given URL for unique links within a specified element,
+        with recursive depth support.
+
         :param url: The URL of the website to scrape.
-        :param link_type: The type of links to scrape (LinkType.ALL, LinkType.INTERNAL, LinkType.EXTERNAL).
-        :param kwargs: Keyword arguments to specify element id and element type.
-        :return: A set of unique link URLs found within the specified element.
+        :param link_type: The type of links to scrape (ALL, INTERNAL, EXTERNAL).
+        :param depth: The recursion depth for extracting links.
+        :param visited_urls: A set to keep track of visited URLs.
+        :return: A set of unique link URLs found.
         """
-        element_id = kwargs.get("element_id")
-        element_type = kwargs.get("element_type", "nav")
+        if visited_urls is None:
+            visited_urls = set()
+
+        if url in visited_urls or depth < 0:
+            return set()
+
+        visited_urls.add(url)
+
         base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
+        extracted_links = set()
 
         try:
-            response = requests.get(url)
+            response = requests.get(url, timeout=5)
             response.raise_for_status()
             soup = BeautifulSoup(response.text, "html.parser")
 
-            if element_id:
-                fetched_element = soup.find_all(element_type, id=element_id)
-            else:
-                fetched_element = soup.find_all(element_type)
-
-            links = set()
+            for a_tag in soup.find_all("a", href=True):
+                href = a_tag["href"]
+                absolute_url = urljoin(url, href)
+                domain = urlparse(absolute_url).netloc
 
-            # Iterate over all found elements and extract links
-            for element in fetched_element:
-                for a_tag in element.find_all("a", href=True):
-                    href = a_tag["href"]
-                    absolute_url = urljoin(url, href)
-                    domain = urlparse(absolute_url).netloc
+                if link_type == LinkType.INTERNAL and domain == urlparse(base_url).netloc:
+                    extracted_links.add(absolute_url)
+                elif link_type == LinkType.EXTERNAL and domain != urlparse(base_url).netloc:
+                    extracted_links.add(absolute_url)
+                elif link_type == LinkType.ALL:
+                    extracted_links.add(absolute_url)
 
-                    if (
-                        link_type == LinkType.INTERNAL
-                        and domain == urlparse(base_url).netloc
-                    ):
-                        links.add(absolute_url)
-                    elif (
-                        link_type == LinkType.EXTERNAL
-                        and domain != urlparse(base_url).netloc
-                    ):
-                        links.add(absolute_url)
-                    elif link_type == LinkType.ALL:
-                        links.add(absolute_url)
+            # Recursive scraping if depth > 0
+            if depth > 0:
+                for link in extracted_links.copy():
+                    extracted_links.update(LinkExtractor.scrape_url(link, link_type, depth - 1, visited_urls))
 
-            return links
+            return extracted_links
         except requests.RequestException as e:
             print(f"Request failed for {url}: {e}")
             return set()
-        except Exception as e:
-            print(f"An error occurred: {e}")
-            return set()
+
```
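A small usage sketch of the new depth-aware `scrape_url` signature. The shared `visited_urls` set threaded through the recursion is what prevents two mutually linking pages from looping forever. The URL below is a placeholder and network access is assumed.

```python
# Collect internal links, following each discovered page one level deep.
from rag_scraper.link_extractor import LinkExtractor, LinkType

links = LinkExtractor.scrape_url(
    "https://example.com",        # placeholder URL
    link_type=LinkType.INTERNAL,
    depth=1,                      # 0 = only the given page; 1 = also scan its internal links
)
for link in sorted(links):
    print(link)
```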