import gradio as gr from text_to_video import model_t2v_fun, setup_seed from omegaconf import OmegaConf import torch import imageio import os import cv2 import pandas as pd import torchvision import random from models import get_models from pipelines.pipeline_videogen import VideoGenPipeline from download import find_model from diffusers.schedulers import DDIMScheduler, DDPMScheduler, PNDMScheduler, EulerDiscreteScheduler from diffusers.models import AutoencoderKL from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextModelWithProjection config_path = "./base/configs/sample.yaml" args = OmegaConf.load("./base/configs/sample.yaml") device = "cpu" # Force CPU usage css = """ h1 { text-align: center; } #component-0 { max-width: 730px; margin: auto; } """ sd_path = args.pretrained_path unet = get_models(args, sd_path).to(device, dtype=torch.float32) # Use float32 for CPU state_dict = find_model("./pretrained_models/lavie_base.pt") unet.load_state_dict(state_dict) vae = AutoencoderKL.from_pretrained(sd_path, subfolder="vae", torch_dtype=torch.float32).to(device) # Use float32 for CPU tokenizer_one = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer") text_encoder_one = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder", torch_dtype=torch.float32).to(device) # Use float32 for CPU unet.eval() vae.eval() text_encoder_one.eval() def infer(prompt, seed_inp, ddim_steps, cfg, infer_type): if seed_inp != -1: setup_seed(seed_inp) else: seed_inp = random.choice(range(10000000)) setup_seed(seed_inp) if infer_type == 'ddim': scheduler = DDIMScheduler.from_pretrained(sd_path, subfolder="scheduler", beta_start=args.beta_start, beta_end=args.beta_end, beta_schedule=args.beta_schedule) elif infer_type == 'eulerdiscrete': scheduler = EulerDiscreteScheduler.from_pretrained(sd_path, subfolder="scheduler", beta_start=args.beta_start, beta_end=args.beta_end, beta_schedule=args.beta_schedule) elif infer_type == 'ddpm': scheduler = DDPMScheduler.from_pretrained(sd_path, subfolder="scheduler", beta_start=args.beta_start, beta_end=args.beta_end, beta_schedule=args.beta_schedule) model = VideoGenPipeline(vae=vae, text_encoder=text_encoder_one, tokenizer=tokenizer_one, scheduler=scheduler, unet=unet) model.to(device) # Disable xformers for CPU # if device == "cuda": # model.enable_xformers_memory_efficient_attention() videos = model(prompt, video_length=8, height=160, width=256, num_inference_steps=ddim_steps, guidance_scale=cfg).video # Reduced resolution and length if not os.path.exists(args.output_folder): os.mkdir(args.output_folder) torchvision.io.write_video(args.output_folder + prompt[0:30].replace(' ', '_') + '-' + str(seed_inp) + '-' + str(ddim_steps) + '-' + str(cfg) + '-.mp4', videos[0], fps=4) # Reduced FPS return args.output_folder + prompt[0:30].replace(' ', '_') + '-' + str(seed_inp) + '-' + str(ddim_steps) + '-' + str(cfg) + '-.mp4' title = """
Apply Intern·Vchitect to generate a video