Spaces:

VinayHajare
/

Text-To-Image-EfficientCLIP-GAN

Runtime error

File size: 3,765 Bytes

import os
import sys
import io
import torch
import torchvision
import clip
import numpy as np
from huggingface_hub import hf_hub_download
from PIL import Image
from torchvision.transforms.functional import to_pil_image

from utils import load_model_weights
from model import NetG, CLIP_TXT_ENCODER

# Select the compute device: the first CUDA GPU when one is available, otherwise the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Hugging Face access token read from the environment (None when HF_TOKEN is unset);
# it is passed to hf_hub_download below
HF_TOKEN = os.getenv("HF_TOKEN")

# Repository of the model and the checkpoint file to fetch from it
repo_id = "VinayHajare/EfficientCLIP-GAN"
file_name = "saved_models/state_epoch_1480.pth"

# CLIP model (put in eval mode) wrapped with the custom text encoder
# NOTE(review): `preprocessor` is not used anywhere in the visible code
clip_text = "ViT-B/32"
clip_model, preprocessor = clip.load(clip_text, device=device)
clip_model = clip_model.eval()
text_encoder = CLIP_TXT_ENCODER(clip_model).to(device)

# Download the checkpoint from the repository, load it onto `device`, and load the
# generator weights stored under checkpoint['model']['netG'] into a fresh NetG
# NOTE(review): the meaning of the positional NetG arguments is defined in model.py;
# the 100 presumably is the noise dimension, since the generate functions sample
# noise of shape (batch_size, 100) -- confirm against NetG's signature
model_path = hf_hub_download(repo_id = repo_id, filename = file_name, token = HF_TOKEN)
checkpoint = torch.load(model_path, map_location=torch.device(device))
netG = NetG(64, 100, 512, 256, 3, False, clip_model).to(device)
generator = load_model_weights(netG, checkpoint['model']['netG'], multi_gpus=False)

# Function to generate in-memory PIL images from text
def generate_image_from_text(caption, batch_size=4):
    """Generate images for a caption and return them as PIL images.

    Args:
        caption: Text prompt; it is tokenized with ``clip.tokenize`` and
            encoded by the module-level ``text_encoder``.
        batch_size: Number of images to generate. Every image uses its own
            random noise vector and the same sentence embedding.

    Returns:
        A list with one ``PIL.Image.Image`` per generated image.
    """
    # Create one 100-dimensional noise vector per requested image
    noise = torch.randn((batch_size, 100)).to(device)
    with torch.no_grad():
        # Tokenize caption
        tokenized_text = clip.tokenize([caption]).to(device)
        # Only the sentence embedding is needed; the word embedding is discarded
        sent_emb, _ = text_encoder(tokenized_text)
        # Repeat the sentence embedding to match the batch size
        sent_emb = sent_emb.repeat(batch_size, 1)
        # Generate the images
        generated_images = generator(noise, sent_emb, eval=True).float()

    # Clamp to [-1, 1] and rescale to [0, 1] for the whole batch at once
    scaled = generated_images.clamp(-1, 1).add(1.0).div(2.0)

    # Move channels last, (N, C, H, W) -> (N, H, W, C), and hand over to numpy
    arrays = scaled.permute(0, 2, 3, 1).cpu().numpy()

    # Round to the nearest 8-bit level instead of truncating. Truncation
    # darkens every pixel by up to one level and disagrees with the PNGs
    # written by generate_image_from_text_with_persistent_storage.
    # The maximum value is 255.5, so the cast cannot overflow.
    pixels = (arrays * 255 + 0.5).astype(np.uint8)

    return [Image.fromarray(image_array) for image_array in pixels]

# Function to generate images from text and save them as PNG files on disk
def generate_image_from_text_with_persistent_storage(caption, batch_size=4):
    """Generate images for a caption, save them as PNG files, and return the paths.

    The files are written to the ``generated_images`` directory (relative to
    the current working directory) as ``image_0.png``, ``image_1.png``, ...

    NOTE: the file names depend only on the index within the batch, so every
    call overwrites the files written by the previous call.

    Args:
        caption: Text prompt; it is tokenized with ``clip.tokenize`` and
            encoded by the module-level ``text_encoder``.
        batch_size: Number of images to generate. Every image uses its own
            random noise vector and the same sentence embedding.

    Returns:
        A list with the path of each saved image, in batch order.
    """
    # Create one 100-dimensional noise vector per requested image
    noise = torch.randn((batch_size, 100)).to(device)
    with torch.no_grad():
        # Tokenize caption
        tokenized_text = clip.tokenize([caption]).to(device)
        # Only the sentence embedding is needed; the word embedding is discarded
        sent_emb, _ = text_encoder(tokenized_text)
        # Repeat the sentence embedding to match the batch size
        sent_emb = sent_emb.repeat(batch_size, 1)
        # Generate the images
        generated_images = generator(noise, sent_emb, eval=True).float()

    # Create the output directory; exist_ok avoids the check-then-create race
    # when two requests arrive at the same time
    permanent_dir = "generated_images"
    os.makedirs(permanent_dir, exist_ok=True)

    image_paths = []
    for idx, image_tensor in enumerate(generated_images.unbind(0)):
        # normalize=True with value_range=(-1, 1) clamps the tensor to
        # [-1, 1] and rescales it to [0, 1] before it is written as a PNG
        image_path = os.path.join(permanent_dir, f"image_{idx}.png")
        torchvision.utils.save_image(image_tensor, image_path, value_range=(-1, 1), normalize=True)
        image_paths.append(image_path)

    return image_paths