GPU memory
How much GPU memory is needed to run SD 3.5 Large? I constantly get torch.OutOfMemoryError: CUDA out of memory, even on an NVIDIA 4090 with 24 GB.
Edit: I used
pipe.enable_model_cpu_offload()
instead of
pipe = pipe.to("cuda")
and it ran properly.
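For reference, a minimal sketch of that working setup (the prompt and generation parameters are only placeholders, not tuned values):
import torch
from diffusers import StableDiffusion3Pipeline

pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    torch_dtype=torch.bfloat16,
)
# Offload submodules to the CPU and move each to the GPU only when needed,
# instead of pipe.to("cuda"), which keeps the whole pipeline in VRAM.
pipe.enable_model_cpu_offload()

image = pipe("a photo of a cat", num_inference_steps=28, guidance_scale=4.5).images[0]
image.save("cat.png")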
Strangely, I find this model to be more GPU-intensive than FLUX dev. That said, you can do the following to minimize VRAM usage if you're using the bfloat16 model:
Quantize the T5 encoder and the transformer. You can save them to whatever folder you like and load them later. Note that you can use FP8, or qint8 as I have, or something smaller.
I don't need to use pipe.enable_model_cpu_offload() when quantizing. I can usually run this pipe six or more times in succession (moving the pipeline to 'cpu' and back to 'cuda' between runs) before running out of memory and having to restart. It's a hungry beast.
################## SD35 QUANTIZE #########################
import json
from pathlib import Path

import torch
from huggingface_hub import login
from diffusers import SD3Transformer2DModel
from transformers import T5EncoderModel
from optimum.quanto import freeze, qint8, quantize, quantization_map  # qint8, qfloat8, etc.

login(token="your token")

base_model = "stabilityai/stable-diffusion-3.5-large"
dtype = torch.bfloat16

# Quantize, freeze, and save the transformer
transformer = SD3Transformer2DModel.from_pretrained(
    base_model,
    subfolder="transformer",
    torch_dtype=dtype,
)
quantize(transformer, weights=qint8)
freeze(transformer)

save_directory = "./SD35/sd3transformer2dmodel_qint8"
transformer.save_pretrained(save_directory)
qmap_name = Path(save_directory, "quanto_qmap.json")
qmap = quantization_map(transformer)
with open(qmap_name, "w", encoding="utf8") as f:
    json.dump(qmap, f, indent=4)
print('Transformer done')

# Quantize, freeze, and save the T5 text encoder
text_encoder_3 = T5EncoderModel.from_pretrained(
    base_model,
    subfolder="text_encoder_3",
    torch_dtype=dtype,
)
quantize(text_encoder_3, weights=qint8)
freeze(text_encoder_3)

save_directory = "./SD35/t5encodermodel_qint8"
text_encoder_3.save_pretrained(save_directory)
qmap_name = Path(save_directory, "quanto_qmap.json")
qmap = quantization_map(text_encoder_3)
with open(qmap_name, "w", encoding="utf8") as f:
    json.dump(qmap, f, indent=4)
print('T5 encoder done')
After quantizing and saving, do the following at inference time, before creating the pipeline:
import torch
from diffusers import SD3Transformer2DModel, StableDiffusion3Pipeline
from transformers import T5EncoderModel
from optimum.quanto import QuantizedDiffusersModel, QuantizedTransformersModel

dtype = torch.bfloat16
base_model = "stabilityai/stable-diffusion-3.5-large"
token = "your token"  # Hugging Face access token
device = "cuda"  # or "cpu"

# Load the pre-quantized transformer
print('Loading quantized transformer')
class QuantizedSD3Transformer2DModel(QuantizedDiffusersModel):
    base_class = SD3Transformer2DModel

transformer = QuantizedSD3Transformer2DModel.from_pretrained(
    "./SD35/sd3transformer2dmodel_qint8"
).to(dtype=dtype)

# Load the pre-quantized T5 text encoder
print('Loading quantized text_encoder_3')
class QuantizedT5EncoderModelForCausalLM(QuantizedTransformersModel):
    auto_class = T5EncoderModel
    auto_class.from_config = auto_class._from_config

text_encoder_3 = QuantizedT5EncoderModelForCausalLM.from_pretrained(
    "./SD35/t5encodermodel_qint8"
).to(dtype=dtype)

pipe = StableDiffusion3Pipeline.from_pretrained(
    base_model,
    token=token,
    transformer=transformer,
    text_encoder_3=text_encoder_3,
    torch_dtype=dtype,
    use_safetensors=True,
)
if device == 'cpu':
    pipe.enable_model_cpu_offload()
else:
    pipe = pipe.to(device)
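Once the pipeline is built, a sketch of the repeated-run pattern described above; the prompt and parameters are placeholders, and the cache clear is an extra precaution not mentioned in the original steps:
# Generate an image; parameters are illustrative, not tuned values.
image = pipe(
    "a photo of an astronaut riding a horse",
    num_inference_steps=28,
    guidance_scale=4.5,
).images[0]
image.save("sd35_output.png")

# Between runs, park the pipeline on the CPU and free cached VRAM,
# then move it back to the GPU for the next generation.
pipe = pipe.to("cpu")
torch.cuda.empty_cache()
pipe = pipe.to("cuda")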