File size: 3,394 Bytes
42d0d85
 
 
 
 
 
1dfda18
42d0d85
 
 
c611d5d
 
 
 
 
 
 
42d0d85
1dfda18
42d0d85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7b9c6c2
4d8ba20
0978851
 
 
42d0d85
 
 
 
 
c611d5d
7b9c6c2
 
 
c611d5d
 
42d0d85
1dfda18
 
 
42d0d85
 
 
 
 
 
 
 
 
 
 
1dfda18
 
 
 
 
 
4d8ba20
 
 
 
1dfda18
 
 
 
 
 
 
 
 
0978851
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os
import shutil
import tempfile
from io import BytesIO
from urllib.parse import urljoin, urlparse
from zipfile import ZipFile

import gradio as gr
import requests
from bs4 import BeautifulSoup

def download_file(url, session, timeout=30):
    """Download a single resource and return its raw bytes.

    Args:
        url: Absolute URL of the resource to fetch.
        session: A ``requests.Session`` used for connection reuse.
        timeout: Seconds to wait for the server (connect + read). Without
            this, a stalled server would hang the whole app indefinitely.

    Returns:
        The response body as ``bytes``, or ``None`` if the request failed
        (errors are logged to stdout rather than raised, so one broken
        asset does not abort the whole page download).
    """
    try:
        response = session.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

def save_webpage_as_zip(url):
    """Download a webpage plus its img/link/script assets into a ZIP.

    Args:
        url: The page URL to archive.

    Returns:
        A ``BytesIO`` positioned at offset 0 containing the ZIP archive,
        with the page saved as ``index.html`` and assets under their
        URL paths.

    Raises:
        requests.exceptions.RequestException: if the main page itself
            cannot be fetched (individual asset failures are skipped).
    """
    session = requests.Session()
    response = session.get(url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')

    # Unique per-call directory: a fixed name ('temp_webpage') would collide
    # across concurrent requests and leak stale files into later archives.
    temp_dir = tempfile.mkdtemp(prefix='webpage_')
    try:
        # Save the main HTML document.
        with open(os.path.join(temp_dir, 'index.html'), 'wb') as f:
            f.write(response.content)

        # Collect asset references: <link> uses href, <img>/<script> use src.
        assets = []
        for tag in soup.find_all(['img', 'link', 'script']):
            attr = 'href' if tag.name == 'link' else 'src'
            if tag.get(attr):
                assets.append(tag[attr])

        base = os.path.abspath(temp_dir)
        for asset in assets:
            asset_url = urljoin(url, asset)
            asset_path = urlparse(asset_url).path.lstrip('/')

            # Skip directory-style or empty paths (nothing to save).
            if not asset_path or asset_path.endswith('/'):
                continue

            asset_full_path = os.path.normpath(os.path.join(base, asset_path))
            # Path-traversal guard: asset paths come from untrusted HTML and
            # could contain '..' segments escaping the temp directory.
            if not asset_full_path.startswith(base + os.sep):
                print(f"Skipping unsafe path {asset_path}")
                continue

            os.makedirs(os.path.dirname(asset_full_path), exist_ok=True)

            content = download_file(asset_url, session)
            if content and not os.path.isdir(asset_full_path):
                with open(asset_full_path, 'wb') as f:
                    f.write(content)

        # Pack everything into an in-memory ZIP, paths relative to temp_dir.
        zip_buffer = BytesIO()
        with ZipFile(zip_buffer, 'w') as zipf:
            for root, _, files in os.walk(temp_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    zipf.write(file_path, os.path.relpath(file_path, temp_dir))
    finally:
        # Exception-safe cleanup: the original manual walk leaked the
        # directory whenever any step above raised.
        shutil.rmtree(temp_dir, ignore_errors=True)

    zip_buffer.seek(0)
    return zip_buffer

def generate_zip_file(url):
    """Archive *url* as a ZIP on disk and return the file's path.

    Gradio's File output wants a filesystem path, so the in-memory
    archive produced by ``save_webpage_as_zip`` is flushed to
    ``webpage.zip`` in the working directory.
    """
    output_path = "webpage.zip"
    archive = save_webpage_as_zip(url)
    with open(output_path, 'wb') as out:
        out.write(archive.getvalue())
    return output_path

# Gradio UI: a URL box, a trigger button, and a file widget for the result.
with gr.Blocks() as demo:
    site_url = gr.Textbox(label="Website URL")
    grab_button = gr.Button("Download as ZIP")
    archive_out = gr.File(label="Download")

    # Clicking the button runs the scraper and hands the ZIP path to Gradio.
    grab_button.click(fn=generate_zip_file, inputs=site_url, outputs=archive_out)

demo.launch()