# NOTE: The lines below are residue from the Hugging Face Spaces file viewer,
# kept as comments so they do not break the script:
#   Spaces: Running
#   File size: 3,394 Bytes
#   Blame gutter commits: 42d0d85 1dfda18 c611d5d 7b9c6c2 4d8ba20 0978851
#   Gutter line numbers: 1-103
import os
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup
def download_file(url, session, timeout=30):
    """Download a file and return its content.

    Args:
        url: Absolute URL of the resource to fetch.
        session: A ``requests.Session`` (or any object with a compatible
            ``get`` method) used to perform the request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``session.get``; without it a stalled server would
            block the caller indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (connection error, timeout, or an HTTP error status). Failures are
        reported on stdout rather than raised, so one broken asset does not
        abort the caller's download loop.
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
def _archive_name(asset_url):
    """Return a safe relative ZIP member name for *asset_url*, or None to skip it."""
    parsed = urlparse(asset_url)
    # Only http(s) resources can be fetched; data:, javascript:, mailto: etc.
    # references are not downloadable files.
    if parsed.scheme not in ('http', 'https'):
        return None
    # An empty path or a trailing slash denotes a directory, not a file.
    if not parsed.path or parsed.path.endswith('/'):
        return None
    segments = []
    for segment in parsed.path.split('/'):
        if segment in ('', '.'):
            continue
        if segment == '..':
            # Refuse paths that would climb above the archive root.
            if not segments:
                return None
            segments.pop()
            continue
        segments.append(segment)
    return '/'.join(segments) or None


def save_webpage_as_zip(url, timeout=30):
    """Save a webpage and its assets as a ZIP file.

    The page at *url* is stored as ``index.html``. Every ``<img src>``,
    ``<link href>`` and ``<script src>`` reference is resolved against *url*,
    downloaded with ``download_file`` and stored under the path component of
    its URL. The archive is assembled entirely in memory, so nothing is
    written to disk and concurrent calls cannot interfere with each other.

    Args:
        url: Address of the page to archive.
        timeout: Seconds to wait for the main page request.

    Returns:
        A ``BytesIO`` positioned at offset 0 that contains the ZIP archive.

    Raises:
        requests.exceptions.RequestException: If the main page cannot be
            fetched or answers with an HTTP error status. Failures of
            individual assets are only reported by ``download_file`` and
            that asset is left out of the archive.
    """
    # Attribute that carries the asset URL for each tag we care about.
    url_attribute = {'img': 'src', 'link': 'href', 'script': 'src'}

    with requests.Session() as session:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Member name -> file content. Seeding it with index.html means an
        # asset that maps to the same name can never replace the main page.
        members = {'index.html': response.content}

        for tag in soup.find_all(list(url_attribute)):
            reference = (tag.get(url_attribute[tag.name]) or '').strip()
            if not reference:
                continue
            try:
                asset_url = urljoin(url, reference)
                name = _archive_name(asset_url)
            except ValueError as e:
                # urljoin/urlparse reject some malformed URLs; one bad
                # reference should not abort the whole archive.
                print(f"Skipping invalid asset reference {reference!r}: {e}")
                continue
            if name is None:
                print(f"Skipping asset {asset_url}")
                continue
            # First reference wins; also avoids downloading duplicates.
            if name in members:
                continue
            content = download_file(asset_url, session)
            # None signals a failed download; an empty body is a valid file.
            if content is None:
                continue
            members[name] = content

    # Create the ZIP file in memory
    zip_buffer = BytesIO()
    with ZipFile(zip_buffer, 'w') as zipf:
        for name, content in members.items():
            zipf.writestr(name, content)
    zip_buffer.seek(0)
    return zip_buffer
def generate_zip_file(url):
    """Generate ZIP file from a webpage URL.

    Args:
        url: Address of the page to archive; passed to
            ``save_webpage_as_zip`` unchanged.

    Returns:
        Filesystem path of the written archive. The file is always named
        ``webpage.zip`` but lives in a directory created for this call.

    Raises:
        Whatever ``save_webpage_as_zip`` raises when the page cannot be
        fetched.
    """
    zip_buffer = save_webpage_as_zip(url)
    # A fresh directory per call keeps concurrent requests from overwriting
    # each other's archive while preserving the "webpage.zip" download name.
    # NOTE: the directory is not removed here because the caller still needs
    # to read the file after this function returns.
    output_dir = tempfile.mkdtemp(prefix="webpage_zip_")
    temp_zip_path = os.path.join(output_dir, "webpage.zip")
    with open(temp_zip_path, 'wb') as f:
        f.write(zip_buffer.getvalue())
    return temp_zip_path
# Gradio Interface
# One text box for the page URL, a button that triggers the download, and a
# file component through which the resulting ZIP is offered to the user.
with gr.Blocks() as demo:
    url_input = gr.Textbox(label="Website URL")
    download_button = gr.Button("Download as ZIP")
    output_file = gr.File(label="Download")
    # Event listeners have to be registered inside the Blocks context.
    download_button.click(fn=generate_zip_file, inputs=url_input, outputs=output_file)

demo.launch()