Added recursion

Files changed:
- app.py (+46, -16)
- rag_scraper/cli.py (+12, -36)
- rag_scraper/link_extractor.py (+34, -42)

app.py
CHANGED
@@ -1,31 +1,61 @@

import gradio as gr
from rag_scraper.scraper import Scraper
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.utils import URLUtils

def scrape_and_convert(url, depth):
    """Fetch HTML content, extract links recursively (up to given depth), and convert to Markdown."""
    try:
        visited_urls = set()

        def recursive_scrape(url, current_depth):
            """Recursively scrape and convert pages up to the given depth."""
            if url in visited_urls or current_depth < 0:
                return ""

            visited_urls.add(url)

            # Fetch HTML content
            try:
                html_content = Scraper.fetch_html(url)
            except Exception as e:
                return f"Error fetching {url}: {str(e)}\n"

            # Convert to Markdown
            markdown_content = Converter.html_to_markdown(
                html=html_content,
                base_url=url,
                parser_features='html.parser',
                ignore_links=True
            )

            # If depth > 0, extract links and process them
            if current_depth > 0:
                links = LinkExtractor.scrape_url(url, link_type=LinkType.INTERNAL)
                for link in links:
                    markdown_content += f"\n\n## Extracted from: {link}\n"
                    markdown_content += recursive_scrape(link, current_depth - 1)

            return markdown_content

        # Start the recursive scraping process
        result = recursive_scrape(url, depth)
        return result

    except Exception as e:
        return f"Error: {str(e)}"

# Define Gradio interface
iface = gr.Interface(
    fn=scrape_and_convert,
    inputs=[
        gr.Textbox(label="Enter URL"),
        gr.Slider(minimum=0, maximum=3, step=1, label="Search Depth (0 = Only main page)")
    ],
    outputs=gr.Code(label="Markdown Output", language="markdown"),
    title="RAGScraper with Recursive Depth",
    description="Enter a URL and specify the search depth. The app will fetch, extract links, and convert HTML to Markdown."
)

# Launch the Gradio app

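The recursion lives in the nested recursive_scrape closure, which shares a single visited_urls set so each page is fetched and converted at most once. For a quick sanity check of the new depth parameter outside the Gradio UI, the handler can be called directly; a minimal sketch, assuming scrape_and_convert from the app.py above is in scope, with an illustrative URL:

# Sketch only: exercise the recursive handler directly; URL and depth are illustrative.
# depth=0 converts just the given page; depth=1 also follows internal links found on it.
markdown = scrape_and_convert("https://example.com", depth=1)
print(markdown[:500])  # preview the first 500 characters of the combined Markdown
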
rag_scraper/cli.py
CHANGED
@@ -1,62 +1,38 @@

import argparse
from rag_scraper.converter import Converter
from rag_scraper.link_extractor import LinkExtractor, LinkType
from rag_scraper.scraper import Scraper
from rag_scraper.utils import URLUtils

def main():
    parser = argparse.ArgumentParser(
        description="RAGScraper: Scrape webpages, extract links, and convert to markdown."
    )
    parser.add_argument("url", help="The URL of the webpage to scrape.")
    parser.add_argument("--element_id", help="The ID of the element to search for links.", default=None)
    parser.add_argument("--element_type", help='The type of element to search for links. Default is "nav".', default="nav")
    parser.add_argument("--convert", help="Convert the webpage to markdown.", action="store_true")
    parser.add_argument("--extract", help="Extract links from the specified element.", action="store_true")
    parser.add_argument("--depth", type=int, default=0, help="Set search depth for link extraction.")

    args = parser.parse_args()

    base_url = URLUtils.get_base_url(args.url)

    if args.extract:
        # Extract links recursively
        links = LinkExtractor.scrape_url(args.url, link_type=LinkType.INTERNAL, depth=args.depth)
        print(f"Unique links for {args.url}:")
        for link in links:
            print(link)
    elif args.convert:
        # Convert to markdown
        html_content = Scraper.fetch_html(args.url)
        markdown_content = Converter.html_to_markdown(html_content, base_url)
        print(markdown_content)
    else:
        print("Please specify an action: --convert for markdown conversion or --extract for link extraction.")

if __name__ == "__main__":
    main()

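The new --depth flag is forwarded to LinkExtractor.scrape_url when --extract is given; --element_id and --element_type are still accepted but no longer passed through. A small sketch of driving the CLI programmatically, roughly equivalent to running the module from a shell; the URL and argument values are illustrative and assume the rag_scraper package is importable:

# Sketch only: invoke the CLI entry point without a shell.
# Roughly equivalent to: python -m rag_scraper.cli https://example.com --extract --depth 2
import sys
from rag_scraper import cli

sys.argv = ["rag-scraper", "https://example.com", "--extract", "--depth", "2"]  # illustrative arguments
cli.main()  # parses sys.argv[1:] via argparse and prints the extracted links
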
rag_scraper/link_extractor.py
CHANGED
@@ -1,70 +1,62 @@

from enum import Enum, auto
from typing import Set
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup


class LinkType(Enum):
    ALL = auto()
    INTERNAL = auto()
    EXTERNAL = auto()


class LinkExtractor:
    @staticmethod
    def scrape_url(url: str, link_type: LinkType = LinkType.ALL, depth: int = 0, visited_urls: Set[str] = None) -> Set[str]:
        """
        Scrape a given URL for unique links within a specified element,
        with recursive depth support.

        :param url: The URL of the website to scrape.
        :param link_type: The type of links to scrape (ALL, INTERNAL, EXTERNAL).
        :param depth: The recursion depth for extracting links.
        :param visited_urls: A set to keep track of visited URLs.
        :return: A set of unique link URLs found.
        """
        if visited_urls is None:
            visited_urls = set()

        if url in visited_urls or depth < 0:
            return set()

        visited_urls.add(url)

        base_url = "{uri.scheme}://{uri.netloc}".format(uri=urlparse(url))
        extracted_links = set()

        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            for a_tag in soup.find_all("a", href=True):
                href = a_tag["href"]
                absolute_url = urljoin(url, href)
                domain = urlparse(absolute_url).netloc

                if link_type == LinkType.INTERNAL and domain == urlparse(base_url).netloc:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.EXTERNAL and domain != urlparse(base_url).netloc:
                    extracted_links.add(absolute_url)
                elif link_type == LinkType.ALL:
                    extracted_links.add(absolute_url)

            # Recursive scraping if depth > 0
            if depth > 0:
                for link in extracted_links.copy():
                    extracted_links.update(LinkExtractor.scrape_url(link, link_type, depth - 1, visited_urls))

            return extracted_links
        except requests.RequestException as e:
            print(f"Request failed for {url}: {e}")
            return set()

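Since scrape_url now owns the visited_urls bookkeeping and the depth cutoff itself, it can be called directly for shallow recursive crawls; a minimal sketch using only the API shown above, with an illustrative URL (note that it performs real HTTP requests, so point it at a site you control):

# Sketch only: collect internal links one level deep; the URL is illustrative.
from rag_scraper.link_extractor import LinkExtractor, LinkType

links = LinkExtractor.scrape_url("https://example.com", link_type=LinkType.INTERNAL, depth=1)
for link in sorted(links):
    print(link)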