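"""Webpage to ZIP Downloader.

A Gradio app that fetches a webpage, mirrors its directly linked assets
(images, stylesheets, scripts) into a temporary directory, and serves the
result as a downloadable ZIP archive.
"""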
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile
from io import BytesIO
import gradio as gr


def download_file(url, session):
    """Download a file and return its content."""
    try:
        response = session.get(url)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None


def save_webpage_as_zip(url):
    """Save a webpage and its assets as an in-memory ZIP file."""
    session = requests.Session()
    response = session.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    temp_dir = 'temp_webpage'
    os.makedirs(temp_dir, exist_ok=True)

    # Save the main HTML document.
    main_html_path = os.path.join(temp_dir, 'index.html')
    with open(main_html_path, 'wb') as f:
        f.write(response.content)

    # Collect asset URLs referenced by <img src>, <link href>, and <script src>.
    assets = []
    for tag in soup.find_all(['img', 'link', 'script']):
        if tag.name == 'img' and tag.get('src'):
            assets.append(tag['src'])
        elif tag.name == 'link' and tag.get('href'):
            assets.append(tag['href'])
        elif tag.name == 'script' and tag.get('src'):
            assets.append(tag['src'])

    # Mirror each asset into temp_dir, preserving its URL path.
    for asset in assets:
        asset_url = urljoin(url, asset)
        asset_path = urlparse(asset_url).path.lstrip('/')
        asset_full_path = os.path.join(temp_dir, asset_path)
        if asset_path.endswith('/'):
            print(f"Skipping directory {asset_full_path}")
            continue
        os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)
        content = download_file(asset_url, session)
        # None signals a failed download; empty responses are still written.
        if content is not None:
            if os.path.isdir(asset_full_path):
                print(f"Skipping directory {asset_full_path}")
                continue
            with open(asset_full_path, 'wb') as f:
                f.write(content)

    # Zip the mirrored directory tree into an in-memory buffer.
    zip_buffer = BytesIO()
    with ZipFile(zip_buffer, 'w') as zipf:
        for root, _, files in os.walk(temp_dir):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, temp_dir))

    # Clean up the temporary tree: files first, then directories bottom-up.
    for root, _, files in os.walk(temp_dir, topdown=False):
        for file in files:
            os.remove(os.path.join(root, file))
        os.rmdir(root)

    zip_buffer.seek(0)
    return zip_buffer
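
# Note: only assets referenced directly via <img src>, <link href>, and
# <script src> are mirrored; resources referenced from inside CSS (url(...))
# or srcset attributes are not followed.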

def generate_zip_file(url):
    """Generate a ZIP file from a webpage URL and return its local path."""
    zip_buffer = save_webpage_as_zip(url)
    # Note: a fixed filename means concurrent requests share the same path.
    temp_zip_path = "webpage.zip"
    with open(temp_zip_path, 'wb') as f:
        f.write(zip_buffer.read())
    return temp_zip_path

examples = [
    "https://www.bmw.com/en/index.html",
    "https://www.ferrari.com/en-EN",
    "https://streamlit.io/"
]

DESCRIPTION = """
## Webpage to ZIP Downloader 🔗
"""

with gr.Blocks(theme="bethecloud/storj_theme") as demo:
    gr.Markdown(DESCRIPTION)
    gr.Markdown("Enter a URL to download the webpage and its assets as a ZIP file.")
    url_input = gr.Textbox(label="Website URL", placeholder="Enter a URL (e.g., https://www.example.com)")
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")

    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

    # Example URLs that populate the input textbox when clicked.
    gr.Examples(
        examples=examples,
        inputs=url_input,
        outputs=output_file,
        fn=generate_zip_file
    )

demo.launch()
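
# A minimal sketch of exercising the download logic without the UI
# (hypothetical snippet; "example.zip" is an illustrative filename):
#
#   buf = save_webpage_as_zip("https://www.example.com")
#   with open("example.zip", "wb") as f:
#       f.write(buf.read())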