k4d3 commited on
Commit
eb8cf2c
1 Parent(s): ca498bf

notice of move

Browse files
Files changed (1) hide show
  1. README.md +1 -951
README.md CHANGED
@@ -14,957 +14,7 @@ tags:
14
 
15
  ---
16
 
17
-
18
- ---
19
-
20
- ## Auto Taggers
21
-
22
- ### [eva02-vit-large-448-8046](https://huggingface.co/Thouph/eva02-vit-large-448-8046)
23
-
24
- Besides torch, you only need to install one dependency:
25
-
26
- ```bash
27
- pip install timm
28
- ```
29
-
30
- The following inference script for the tagger takes a folder as input. Be warned that it also converts WebP images to PNG and deletes the original WebP files. You can also specify tags to be ignored, along with some other settings. I recommend reading through it and changing whatever you need.
31
-
32
- [Colab Notebook](https://colab.research.google.com/drive/1gIB2fGjLAuh6s_hrNlIPCkw_3jodoFP0?usp=sharing)
33
-
34
- <div style="background-color: lightyellow; padding: 10px;">
35
- <details>
36
- <summary>Click to reveal inference script</summary>
37
-
38
- ```python
39
- import os
40
- import torch
41
- from torchvision import transforms
42
- from PIL import Image
43
- import json
44
- import re
45
-
46
- # Set the threshold for tag selection
47
- THRESHOLD = 0.3
48
-
49
- # Define the directory containing the images and the path to the model
50
- image_dir = r"./images"
51
- model_path = r"./model.pth"
52
-
53
- # Define the set of ignored tags
54
- ignored_tags = {"grandfathered content"}
55
-
56
- # Check if CUDA is available, else use CPU
57
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
58
-
59
- # Load the model and set it to evaluation mode
60
- model = torch.load(model_path, map_location=device)
61
- model = model.to(device)
62
- model.eval()
63
-
64
- # Define the image transformations
65
- transform = transforms.Compose(
66
- [
67
- # Resize the images to 448x448
68
- transforms.Resize((448, 448)),
69
- # Convert the images to PyTorch tensors
70
- transforms.ToTensor(),
71
- # Normalize the images with the given mean and standard deviation
72
- transforms.Normalize(
73
- mean=[0.48145466, 0.4578275, 0.40821073],
74
- std=[0.26862954, 0.26130258, 0.27577711],
75
- ),
76
- ]
77
- )
78
-
79
- # Load the tags from the JSON file
80
- with open("tags_8041.json", "r", encoding="utf8") as file:
81
- tags = json.load(file)
82
- allowed_tags = sorted(tags)
83
-
84
- # Add placeholders and explicitness tags to the list of allowed tags
85
- allowed_tags.insert(0, "placeholder0")
86
- allowed_tags.append("placeholder1")
87
- allowed_tags.append("explicit")
88
- allowed_tags.append("questionable")
89
- allowed_tags.append("safe")
90
-
91
- # Define the allowed image extensions
92
- image_exts = [".jpg", ".jpeg", ".png"]
93
-
94
- for filename in os.listdir(image_dir):
95
- # Check if the file is a WebP image
96
- if filename.endswith(".webp"):
97
- # Construct the input and output file paths
98
- input_path = os.path.join(image_dir, filename)
99
- output_path = os.path.join(image_dir, os.path.splitext(filename)[0] + ".png")
100
-
101
- # Open the WebP image and save it as a PNG
102
- image = Image.open(input_path)
103
- image.save(output_path, "PNG")
104
- print(f"Converted {filename} to {os.path.basename(output_path)}")
105
-
106
- # Delete the original WebP image
107
- os.remove(input_path)
108
- print(f"Deleted {filename}")
109
-
110
- # Get the list of image files in the directory
111
- image_files = [
112
- file
113
- for file in os.listdir(image_dir)
114
- if os.path.splitext(file)[1].lower() in image_exts
115
- ]
116
-
117
- for image_filename in image_files:
118
- image_path = os.path.join(image_dir, image_filename)
119
-
120
- # Open the image
121
- img = Image.open(image_path)
122
-
123
- # If the image has an alpha channel, replace it with black
124
- if img.mode in ("RGBA", "LA") or (img.mode == "P" and "transparency" in img.info):
125
- alpha = Image.new(
126
- "L", img.size, 0
127
- ) # Create alpha image with mode 'L' (8-bit grayscale)
128
- alpha = alpha.convert(img.mode) # Convert alpha image to same mode as img
129
- img = Image.alpha_composite(alpha, img)
130
-
131
- # Convert the image to RGB
132
- img = img.convert("RGB")
133
-
134
- # Apply the transformations and move the tensor to the device
135
- tensor = transform(img).unsqueeze(0).to(device)
136
-
137
- # Make a forward pass through the model and get the output
138
- with torch.no_grad():
139
- out = model(tensor)
140
-
141
- # Apply the sigmoid function to the output to get probabilities
142
- probabilities = torch.sigmoid(out[0])
143
-
144
- # Get the indices of the tags with probabilities above the threshold
145
- indices = torch.where(probabilities > THRESHOLD)[0]
146
- values = probabilities[indices]
147
-
148
- # Sort the indices by the corresponding probabilities in descending order
149
- sorted_indices = torch.argsort(values, descending=True)
150
-
151
- # Get the tags corresponding to the sorted indices, excluding ignored tags and replacing underscores with spaces
152
- tags_to_write = [
153
- allowed_tags[indices[i]].replace("_", " ")
154
- for i in sorted_indices
155
- if allowed_tags[indices[i]] not in ignored_tags
156
- and allowed_tags[indices[i]] not in ("placeholder0", "placeholder1")
157
- ]
158
-
159
- # Replace 'safe', 'explicit', and 'questionable' with their 'rating_' counterparts
160
- tags_to_write = [
161
- tag.replace("safe", "rating_safe")
162
- .replace("explicit", "rating_explicit")
163
- .replace("questionable", "rating_questionable")
164
- for tag in tags_to_write
165
- ]
166
-
167
- # Escape unescaped parentheses in the tags
168
- tags_to_write_escaped = [
169
- re.sub(r"(?<!\\)(\(|\))", r"\\\1", tag) for tag in tags_to_write
170
- ]
171
-
172
- # Create a text file for each image with the filtered and escaped tags
173
- text_filename = os.path.splitext(image_filename)[0] + ".txt"
174
- text_path = os.path.join(image_dir, text_filename)
175
- with open(text_path, "w", encoding="utf8") as text_file:
176
- text_file.write(", ".join(tags_to_write_escaped))
177
- ```
178
-
179
- </details>
180
- </div>
181
-
182
- ## LoRA Training Guide
183
-
184
- ### Installation Tips
185
-
186
- ---
187
-
188
- Firstly, download kohya_ss' [sd-scripts](https://github.com/kohya-ss/sd-scripts), you need to set up your environment either like [this](https://github.com/kohya-ss/sd-scripts?tab=readme-ov-file#windows-installation) tells you for Windows, or if you are using Linux or Miniconda on Windows, you are probably smart enough to figure out the installation for it. I recommend always installing the latest [PyTorch](https://pytorch.org/get-started/locally/) in the virtual environment you are going to use, which at the time of writing is `2.2.2`. I hope future me has faster PyTorch!
189
-
190
- Ok, just in case you can't figure out how to install sd-scripts under Miniconda for Windows, I actually "guided" someone through it recently, just so I can tell you about it:
191
-
192
- ```bash
193
- # Installing sd-scripts
194
- git clone https://github.com/kohya-ss/sd-scripts
195
- cd sd-scripts
196
-
197
- # Creating the conda environment and installing requirements
198
- conda create -n sdscripts python=3.10.14
199
- conda activate sdscripts
200
- conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia
201
- python -m pip install --use-pep517 --upgrade -r requirements.txt
202
- python -m pip install --use-pep517 lycoris_lora
203
- accelerate config
204
- ```
205
-
206
- `accelerate config` will ask you a bunch of questions, you need to actually read each one and reply with the truth. In most cases the truth looks like this: `This machine, No distributed training, no, no, no, all, fp16`.
207
-
208
- You might also want to install `xformers` or `bitsandbytes`.
209
-
210
- ```bash
211
- # Installing xformers
212
- # Use the same command just replace 'xformers' with any other package you may need.
213
- python -m pip install --use-pep517 xformers
214
-
215
- # Installing bitsandbytes for windows
216
- python -m pip install --use-pep517 bitsandbytes --index-url=https://jllllll.github.io/bitsandbytes-windows-webui
217
- ```
218
-
219
- ---
220
-
221
- ### Pony Training
222
-
223
- ---
224
-
225
- I'm not going to lie, it is a bit complicated to explain everything. But here is my best attempt going through some "basic" stuff and almost all lines in order.
226
-
227
- #### Download Pony in Diffusers Format
228
-
229
- For training, I'm using a diffusers version of the model that I converted myself; you can download it using `git`.
230
-
231
- ```bash
232
- git clone https://huggingface.co/k4d3/ponydiffusers
233
- ```
234
-
235
- ---
236
-
237
- #### Sample Prompt File
238
-
239
- A sample prompt file is used during training to sample images. A sample prompt for example might look like this for Pony:
240
-
241
- ```py
242
- # anthro female kindred
243
- score_9, score_8_up, score_7_up, score_6_up, rating_explicit, source_furry, solo, female anthro kindred, mask, presenting, white pillow, bedroom, looking at viewer, detailed background, amazing_background, scenery porn, realistic, photo --n low quality, worst quality, blurred background, blurry, simple background --w 1024 --h 1024 --d 1 --l 6.0 --s 40
244
- # anthro female wolf
245
- score_9, score_8_up, score_7_up, score_6_up, rating_explicit, source_furry, solo, anthro female wolf, sexy pose, standing, gray fur, brown fur, canine pussy, black nose, blue eyes, pink areola, pink nipples, detailed background, amazing_background, realistic, photo --n low quality, worst quality, blurred background, blurry, simple background --w 1024 --h 1024 --d 1 --l 6.0 --s 40
246
- ```
247
-
248
- Please note that sample prompts should not exceed 77 tokens, you can use [Count Tokens in Sample Prompts](https://huggingface.co/k4d3/yiff_toolkit/blob/main/dataset_tools/Count%20Tokens%20in%20Sample%20Prompts.ipynb) from [/dataset_tools](https://huggingface.co/k4d3/yiff_toolkit/tree/main/dataset_tools) to analyze your prompts.
249
-
250
- If you are training with multiple GPUs, ensure that the total number of prompts is divisible by the number of GPUs without any remainder or a card will idle.
251
-
252
- ---
253
-
254
- #### Training Commands
255
-
256
- ---
257
-
258
- ##### `accelerate launch`
259
-
260
- For two GPUs:
261
-
262
- ```python
263
- accelerate launch --num_processes=2 --multi_gpu --num_machines=1 --gpu_ids=0,1 --num_cpu_threads_per_process=2 "./sdxl_train_network.py"
264
- ```
265
-
266
- Single GPU:
267
-
268
- ```python
269
- accelerate launch --num_cpu_threads_per_process=2 "./sdxl_train_network.py"
270
- ```
271
-
272
- ---
273
-
274
- &nbsp;
275
-
276
- And now let's break down a bunch of arguments we can pass to `sd-scripts`.
277
-
278
- &nbsp;
279
-
280
- ##### `--lowram`
281
-
282
- If you are running out of system memory like I do with 2 GPUs and a really fat model that gets loaded into it per GPU, this option will help you save a bit of it and might get you out of OOM hell.
283
-
284
- ---
285
-
286
- ##### `--pretrained_model_name_or_path`
287
-
288
- The directory containing the checkpoint you just downloaded. If you are using a local diffusers model, I recommend ending the path with a `/`. You can also specify a `.safetensors` or `.ckpt` file if that is what you have!
289
-
290
- ```python
291
- --pretrained_model_name_or_path="/ponydiffusers/"
292
- ```
293
-
294
- ---
295
-
296
- ##### `--output_dir`
297
-
298
- This is where all the saved epochs or steps will be written, including the last one.
299
-
300
- ```python
301
- --output_dir="/output_dir"
302
- ```
303
-
304
- ---
305
-
306
- ##### `--train_data_dir`
307
-
308
- The directory containing the dataset. We prepared this earlier together.
309
-
310
- ```python
311
- --train_data_dir="/training_dir"
312
- ```
313
-
314
- ---
315
-
316
- ##### `--resolution`
317
-
318
- Always set this to match the model's resolution, which in Pony's case is 1024x1024. If you can't fit into the VRAM, you can decrease it to `512,512` as a last resort.
319
-
320
- ```python
321
- --resolution="1024,1024"
322
- ```
323
-
324
- ---
325
-
326
- ##### `--enable_bucket`
327
-
328
- Creates different buckets by pre-categorizing images with different aspect ratios into different buckets. This technique helps to avoid issues like unnatural crops that are common when models are trained to produce square images. This allows the creation of batches where every item has the same size, but the image size of batches may differ.
329
-
330
- ---
331
-
332
- ##### `--min_bucket_reso` and `--max_bucket_reso`
333
-
334
- Specifies the minimum and maximum resolutions used by the buckets. These values are ignored if `--bucket_no_upscale` is set.
335
-
336
- ```python
337
- --min_bucket_reso=256 --max_bucket_reso=1024
338
- ```
339
-
340
- ---
341
-
342
- ##### `--network_alpha`
343
-
344
- Specifies how many of the trained Network Ranks are allowed to alter the base model.
345
-
346
- ```python
347
- --network_alpha=4
348
- ```
349
-
350
- ---
351
-
352
- ##### `--save_model_as`
353
-
354
- You can use this to specify either `ckpt` or `safetensors` for the file format.
355
-
356
- ```python
357
- --save_model_as="safetensors"
358
- ```
359
-
360
- ---
361
-
362
- ##### `--network_module`
363
-
364
- Specifies which network module you are going to train.
365
-
366
- ```python
367
- --network_module="lycoris.kohya"
368
- ```
369
-
370
- ---
371
-
372
- ##### `--network_args`
373
-
374
- The arguments passed down to the network.
375
-
376
- ```python
377
- --network_args \
378
- "use_reentrant=False" \
379
- "preset=full" \
380
- "conv_dim=256" \
381
- "conv_alpha=4" \
382
- "use_tucker=False" \
383
- "use_scalar=False" \
384
- "rank_dropout_scale=False" \
385
- "algo=locon" \
386
- "train_norm=False" \
387
- "block_dims=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8" \
388
- "block_alphas=0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625" \
389
- ```
390
-
391
- **Let's break it down!**
392
-
393
- ---
394
-
395
- ###### `preset`
396
-
397
- The [Preset](https://github.com/KohakuBlueleaf/LyCORIS/blob/HEAD/docs/Preset.md)/config system added to LyCORIS for more fine-grained control.
398
-
399
- - `full`
400
- - default preset, train all the layers in the UNet and CLIP.
401
- - `full-lin`
402
- - `full` but skip convolutional layers.
403
- - `attn-mlp`
404
- - "kohya preset", train all the transformer block.
405
- - `attn-only`
406
- - only attention layer will be trained, lot of papers only do training on attn layer.
407
- - `unet-transformer-only`
408
- - as same as kohya_ss/sd_scripts with disabled TE, or, attn-mlp preset with train_unet_only enabled.
409
- - `unet-convblock-only`
410
- - only ResBlock, UpSample, DownSample will be trained.
411
-
412
- ---
413
-
414
- ###### `conv_dim` and `conv_alpha`
415
-
416
- The convolution dimensions are related to the rank of the convolution in the model. Adjusting this value can have a [significant impact](https://ashejunius.com/alpha-and-dimensions-two-wild-settings-of-training-lora-in-stable-diffusion-d7ad3e3a3b0a), and lowering it affected the aesthetic differences between different LoRA samples. An alpha value of `128` was used for training a specific character's face, while Kohaku recommended setting this to `1` for both LoCon and LoHa.
417
-
418
- ```python
419
- conv_block_dims = [conv_dim] * num_total_blocks
420
- conv_block_alphas = [conv_alpha] * num_total_blocks
421
- ```
422
-
423
- ---
424
-
425
- ###### `module_dropout` and `dropout` and `rank_dropout`
426
-
427
- [![An AI generated image.](https://huggingface.co/k4d3/yiff_toolkit/resolve/main/static/tutorial/dropout1.png)](https://huggingface.co/k4d3/yiff_toolkit/resolve/main/static/tutorial/dropout1.png)
428
-
429
- `rank_dropout` is a form of dropout, which is a regularization technique used in neural networks to prevent overfitting and improve generalization. However, unlike traditional dropout which randomly sets a proportion of inputs to zero, `rank_dropout` operates on the rank of the input tensor `lx`. First a binary mask is created with the same rank as `lx` with each element set to `True` with probability `1 - rank_dropout` and `False` otherwise. Then the `mask` is applied to `lx` to randomly set some of its elements to zero. After applying the dropout, a scaling factor is applied to `lx` to compensate for the dropped out elements. This is done to ensure that the expected sum of `lx` remains the same before and after dropout. The scaling factor is `1.0 / (1.0 - self.rank_dropout)`.
430
-
431
- It’s called “rank” dropout because it operates on the rank of the input tensor, rather than its individual elements. This can be particularly useful in tasks where the rank of the input is important.
432
-
433
- If `rank_dropout` is set to `0`, no dropout is applied to the rank of the input tensor `lx`. All elements of the mask would be set to `True`, so when the mask is applied to `lx` all of its elements are retained, and the scaling factor applied after dropout would just equal `self.scale` because `1.0 / (1.0 - 0)` is `1`. Basically, setting this to `0` effectively disables the dropout mechanism, but it will still do some meaningless calculations, and you can't set it to `None`, so if you really want to disable dropouts, simply don't specify them! 😇
434
-
435
- ```python
436
- def forward(self, x):
437
- org_forwarded = self.org_forward(x)
438
-
439
- # module dropout
440
- if self.module_dropout is not None and self.training:
441
- if torch.rand(1) < self.module_dropout:
442
- return org_forwarded
443
-
444
- lx = self.lora_down(x)
445
-
446
- # normal dropout
447
- if self.dropout is not None and self.training:
448
- lx = torch.nn.functional.dropout(lx, p=self.dropout)
449
-
450
- # rank dropout
451
- if self.rank_dropout is not None and self.training:
452
- mask = torch.rand((lx.size(0), self.lora_dim), device=lx.device) > self.rank_dropout
453
- if len(lx.size()) == 3:
454
- mask = mask.unsqueeze(1)
455
- elif len(lx.size()) == 4:
456
- mask = mask.unsqueeze(-1).unsqueeze(-1)
457
- lx = lx * mask
458
-
459
- scale = self.scale * (1.0 / (1.0 - self.rank_dropout))
460
- else:
461
- scale = self.scale
462
-
463
- lx = self.lora_up(lx)
464
-
465
- return org_forwarded + lx * self.multiplier * scale
466
- ```
467
-
468
- The network you are training needs to support it though! See [PR#545](https://github.com/kohya-ss/sd-scripts/pull/545) for more details.
469
-
470
- ---
471
-
472
- ###### `use_tucker`
473
-
474
- Can be used for all but `(IA)^3` and native fine-tuning.
475
-
476
- Tucker decomposition is a method in mathematics that decomposes a tensor into a set of matrices and one small core tensor reducing the computational complexity and memory requirements of the model. It is used in various LyCORIS modules on various blocks. In LoCon for example, if `use_tucker` is `True` and the kernel size `k_size` is not `(1, 1)`, then the convolution operation is decomposed into three separate operations.
477
-
478
- 1. A 1x1 convolution that reduces the number of channels from `in_dim` to `lora_dim`.
479
- 2. A convolution with the original kernel size `k_size`, stride `stride`, and padding `padding`, but with a reduced number of channels `lora_dim`.
480
- 3. A 1x1 convolution that increases the number of channels back from `lora_dim` to `out_dim`.
481
-
482
- If `use_tucker` is `False` or not set, or if the kernel size `k_size` is `(1, 1)`, then a standard convolution operation is performed with the original kernel size, stride, and padding, and the number of channels is reduced from `in_dim` to `lora_dim`.
483
-
484
- ---
485
-
486
- ###### `use_scalar`
487
-
488
- An additional learned parameter that scales the contribution of the low-rank weights before they are added to the original weights. This scalar can control the extent to which the low-rank adaptation modifies the original weights. By training this scalar, the model can learn the optimal balance between preserving the original pre-trained weights and allowing for low-rank adaptation.
489
-
490
- ```python
491
- # Check if the 'use_scalar' flag is set to True
492
- if use_scalar:
493
- # If True, initialize a learnable parameter 'scalar' with a starting value of 0.0.
494
- # This parameter will be optimized during the training process.
495
- self.scalar = nn.Parameter(torch.tensor(0.0))
496
- else:
497
- # If the 'use_scalar' flag is False, set 'scalar' to a fixed value of 1.0.
498
- # This means the low-rank weights will be added to the original weights without scaling.
499
- self.scalar = torch.tensor(1.0)
500
- ```
501
-
502
- The `use_scalar` flag allows the model to determine how much influence the low-rank weights should have on the final weights. If `use_scalar` is `True`, the model can learn the optimal value for `self.scalar` during training, which multiplies the low-rank weights before they are added to the original weights. This provides a way to balance between the original pre-trained weights and the new low-rank adaptations, potentially leading to better performance and more efficient training. The initial value of `0.0` for `self.scalar` suggests that the model starts with no contribution from the low-rank weights and learns the appropriate scale during training.
503
-
504
- ---
505
-
506
- ###### `rank_dropout_scale`
507
-
508
- A boolean flag that determines whether to scale the dropout mask to have an average value of `1` or not. This is particularly useful when you want to maintain the original scale of the tensor values after applying dropout, which can be important for the stability of the training process.
509
-
510
- ```python
511
- def forward(self, orig_weight, org_bias, new_weight, new_bias, *args, **kwargs):
512
- # Retrieve the device that the 'oft_blocks' tensor is on. This ensures that any new tensors created are on the same device.
513
- device = self.oft_blocks.device
514
-
515
- # Check if rank dropout is enabled and the model is in training mode.
516
- if self.rank_dropout and self.training:
517
- # Create a random tensor the same shape as 'oft_blocks', with values drawn from a uniform distribution.
518
- # Then create a dropout mask by checking if each value is less than 'self.rank_dropout' probability.
519
- drop = (torch.rand(self.oft_blocks, device=device) < self.rank_dropout).to(
520
- self.oft_blocks.dtype
521
- )
522
-
523
- # If 'rank_dropout_scale' is True, scale the dropout mask to have an average value of 1.
524
- # This helps maintain the scale of the tensor's values after dropout is applied.
525
- if self.rank_dropout_scale:
526
- drop /= drop.mean()
527
- else:
528
- # If rank dropout is not enabled or the model is not in training mode, set 'drop' to 1 (no dropout).
529
- drop = 1
530
- ```
531
-
532
- ---
533
-
534
- ###### `algo`
535
-
536
- The LyCORIS algorithm used, you can find a [list](https://github.com/KohakuBlueleaf/LyCORIS/blob/HEAD/docs/Algo-List.md) of the implemented algorithms and an [explanation](https://github.com/KohakuBlueleaf/LyCORIS/blob/HEAD/docs/Algo-Details.md) of them, with a [demo](https://github.com/KohakuBlueleaf/LyCORIS/blob/HEAD/docs/Demo.md) you can also dig into the [research paper](https://arxiv.org/pdf/2309.14859.pdf).
537
-
538
- ---
539
-
540
- ###### `train_norm`
541
-
542
- Controls whether to train normalization layers used by all algorithms except `(IA)^3` or not.
543
-
544
- ---
545
-
546
- ###### `block_dims`
547
-
548
- Specifies the rank of each block. It takes exactly 25 numbers, which is why this line is so long.
549
-
550
- ---
551
-
552
- ###### `block_alphas`
553
-
554
- Specifies the alpha of each block. This also takes 25 numbers; if you don't specify it, `network_alpha` will be used for the value instead.
555
-
556
- ---
557
-
558
- That concludes the `network_args`.
559
-
560
- ---
561
-
562
- ##### `--network_dropout`
563
-
564
- This float controls the drop of neurons out of training every step, `0` or `None` is default behavior (no dropout), 1 would drop all neurons. Using `weight_decompose=True` will ignore `network_dropout` and only rank and module dropout will be applied.
565
-
566
- ```python
567
- --network_dropout=0 \
568
- ```
569
-
570
- ---
571
-
572
- ##### `--lr_scheduler`
573
-
574
- A learning rate scheduler in PyTorch is a tool that adjusts the learning rate during the training process. It’s used to modulate the learning rate in response to how the model is performing, which can lead to increased performance and reduced training time.
575
-
576
- Possible values: `linear`, `cosine`, `cosine_with_restarts`, `polynomial`, `constant` (default), `constant_with_warmup`, `adafactor`
577
-
578
- Note, `adafactor` scheduler can only be used with the `adafactor` optimizer!
579
-
580
- ```python
581
- --lr_scheduler="cosine" \
582
- ```
583
-
584
- ---
585
-
586
- ##### `--lr_scheduler_num_cycles`
587
-
588
- Number of restarts for cosine scheduler with restarts. It isn't used by any other scheduler.
589
-
590
- ```py
591
- --lr_scheduler_num_cycles=1 \
592
- ```
593
-
594
- ---
595
-
596
- ##### `--learning_rate` and `--unet_lr` and `--text_encoder_lr`
597
-
598
- The learning rate determines how much the weights of the network are updated in response to the estimated error each time the weights are updated. If the learning rate is too large, the weights may overshoot the optimal solution. If it’s too small, the weights may get stuck in a suboptimal solution.
599
-
600
- For AdamW the optimal LR seems to be `0.0001` or `1e-4` if you want to impress your friends.
601
-
602
- ```py
603
- --learning_rate=0.0001 --unet_lr=0.0001 --text_encoder_lr=0.0001
604
- ```
605
-
606
- ---
607
-
608
- ##### `--network_dim`
609
-
610
- The Network Rank (Dimension) is responsible for how many features your LoRA will be training. It is in a close relation with Network Alpha and the Unet + TE learning rates and of course the quality of your dataset. Personal experimentation with these values is strongly recommended.
611
-
612
- ```py
613
- --network_dim=8
614
- ```
615
-
616
- ---
617
-
618
- ##### `--output_name`
619
-
620
- Specify the output name excluding the file extension.
621
-
622
- **WARNING**: If for some reason this is ever left empty your last epoch won't be saved!
623
-
624
- ```py
625
- --output_name="last"
626
- ```
627
-
628
- ---
629
-
630
- ##### `--scale_weight_norms`
631
-
632
- Max-norm regularization is a technique that constrains the norm of the incoming weight vector at each hidden unit to be upper bounded by a fixed constant. It prevents the weights from growing too large and helps improve the performance of stochastic gradient descent training of deep neural nets.
633
-
634
- Dropout affects the network architecture without changing the weights, while Max-Norm Regularization directly modifies the weights of the network. Both techniques are used to prevent overfitting and improve the generalization of the model. You can learn more about both in this [research paper](https://www.cs.toronto.edu/~rsalakhu/papers/srivastava14a.pdf).
635
-
636
- ```py
637
- --scale_weight_norms=1.0
638
- ```
639
-
640
- ---
641
-
642
- ##### `--max_grad_norm`
643
-
644
- Also known as Gradient Clipping, if you notice that gradients are exploding during training (loss becomes NaN or very large), consider adjusting the `--max_grad_norm` parameter, it operates on the gradients during the backpropagation process, while `--scale_weight_norms` operates on the weights of the neural network. This allows them to complement each other and provide a more robust approach to stabilizing the learning process and improving model performance.
645
-
646
- ```py
647
- --max_grad_norm=1.0
648
- ```
649
-
650
- ---
651
-
652
- ##### `--no_half_vae`
653
-
654
- Disables mixed precision for the SDXL VAE and sets it to `float32`. Very useful if you don't like NaNs.
655
-
656
- ---
657
-
658
- ##### `--save_every_n_epochs` and `--save_last_n_epochs` or `--save_every_n_steps` and `--save_last_n_steps`
659
-
660
- - `--save_every_n_steps` and `--save_every_n_epochs`: A LoRA file will be created at each n-th step or epoch specified here.
661
- - `--save_last_n_steps` and `--save_last_n_epochs`: Discards every saved file except for the last `n` you specify here.
662
-
663
- Learning will always end with what you specify in `--max_train_epochs` or `--max_train_steps`.
664
-
665
- ```py
666
- --save_every_n_epochs=50
667
- ```
668
-
669
- ---
670
-
671
- ##### `--mixed_precision`
672
-
673
- ⚠️
674
-
675
- ```py
676
- --mixed_precision="fp16"
677
- ```
678
-
679
- ---
680
-
681
- ##### `--save_precision`
682
-
683
- ⚠️
684
-
685
- ```py
686
- --save_precision="fp16"
687
- ```
688
-
689
- ---
690
-
691
- ##### `--caption_extension`
692
-
693
- The file extension for caption files. Default is `.caption`. These caption files contain text descriptions that are associated with the training images. When you run the training script, it will look for files with this specified extension in the training data folder. The script uses the content of these files as captions to provide context for the images during the training process.
694
-
695
- For example, if your images are named `image1.jpg`, `image2.jpg`, and so on, and you use the default .caption extension, the script will expect the caption files to be named `image1.caption`, `image2.caption`, etc. If you want to use a different extension, like `.txt`, you would set the caption_extension parameter to `.txt`, and the script would then look for `image1.txt`, `image2.txt`, and so on.
696
-
697
- ```py
698
- --caption_extension=".txt"
699
- ```
700
-
701
- ##### `--cache_latents` and `--cache_latents_to_disk`
702
-
703
- ⚠️
704
-
705
- ```py
706
- --cache_latents --cache_latents_to_disk
707
- ```
708
-
709
- ---
710
-
711
- ##### `--optimizer_type`
712
-
713
- The default optimizer is `AdamW` and there are a bunch of them added every month or so, therefore I'm not listing them all, you can find the list if you really want, but `AdamW` is the best as of this writing so we use that!
714
-
715
- ```py
716
- --optimizer_type="AdamW"
717
- ```
718
-
719
- ---
720
-
721
- ##### `--dataset_repeats`
722
-
723
- Repeats the dataset when training with captions, by default it is set to `1` so we'll set this to `0` with:
724
-
725
- ```py
726
- --dataset_repeats=0
727
- ```
728
-
729
- ---
730
-
731
- ##### `--max_train_steps`
732
-
733
- Specify the number of steps or epochs to train. If both `--max_train_steps` and `--max_train_epochs` are specified, the number of epochs takes precedence.
734
-
735
- ```py
736
- --max_train_steps=400
737
- ```
738
-
739
- ---
740
-
741
- ##### `--shuffle_caption`
742
-
743
- Shuffles the captions set by `--caption_separator`, it is a comma `,` by default which will work perfectly for our case since our captions look like this:
744
-
745
- > rating_questionable, 5 fingers, anthro, bent over, big breasts, blue eyes, blue hair, breasts, butt, claws, curved horn, female, finger claws, fingers, fur, hair, huge breasts, looking at viewer, looking back, looking back at viewer, nipples, nude, pink body, pink hair, pink nipples, rear view, solo, tail, tail tuft, tuft, by lunarii, by x-leon-x, mythology, krystal \(darkmaster781\), dragon, scalie, wickerbeast, The image showcases a pink-scaled wickerbeast a furred dragon creature with blue eyes., She has large breasts and a thick tail., Her blue and pink horns are curved and pointy and she has a slight smiling expression on her face., Her scales are shiny and she has a blue and pink pattern on her body., Her hair is a mix of pink and blue., She is looking back at the viewer with a curious expression., She has a slight blush.,
746
-
747
- As you can tell, I have separated the caption part not just the tags with a `,` to make sure everything gets shuffled. I'm at this point pretty certain this is beneficial especially when your caption file contains more than 77 tokens.
748
-
749
- NOTE: `--cache_text_encoder_outputs` and `--cache_text_encoder_outputs_to_disk` can't be used together with `--shuffle_caption`. Both of these aim to reduce VRAM usage, you will need to decide between these yourself!
750
-
751
- ---
752
-
753
- ##### `--sdpa` or `--xformers` or `--mem_eff_attn`
754
-
755
- Each of these options modifies the attention mechanism used in the model, which can have a significant impact on the model's performance and memory usage. The choice between `--xformers`, `--mem_eff_attn` and `--sdpa` will depend on your GPU. You can benchmark them by repeating a training run with each one!
756
-
757
- - `--xformers`: This flag enables the use of XFormers in the model. XFormers is a library developed by Facebook Research that provides a collection of transformer models optimized for different hardware and use-cases. These models are designed to be highly efficient, flexible, and customizable. They offer various types of attention mechanisms and other features that can be beneficial in scenarios where you have limited GPU memory or need to handle large-scale data.
758
- - `--mem_eff_attn`: This flag enables the use of memory-efficient attention mechanisms in the model. The memory-efficient attention is designed to reduce the memory footprint during the training of transformer models, which can be particularly beneficial when working with large models or datasets.
759
- - `--sdpa`: This option enables the use of Scaled Dot-Product Attention (SDPA) within the model. SDPA is a fundamental component of transformer models that calculates the attention scores between queries and keys. It scales the dot products by the dimensionality of the keys to stabilize gradients during training. This mechanism is particularly useful for handling long sequences and can potentially improve the model’s ability to capture long-range dependencies.
760
-
761
- ```python
762
- --sdpa
763
- ```
764
-
765
- ---
766
-
767
- ##### `--multires_noise_iterations` and `--multires_noise_discount`
768
-
769
- Multi-resolution noise is a new approach that adds noise at multiple resolutions to an image or latent image during the training of diffusion models. A model trained with this technique can generate visually striking images with a distinct aesthetic compared to the usual outputs of diffusion models.
770
-
771
- A model trained with multi-resolution noise can generate a more diverse range of images than regular stable diffusion, including extremely light or dark images. These have historically been challenging to achieve without resorting to using a large number of sampling steps.
772
-
773
- This technique is particularly beneficial when working with small datasets, but I don't think you should ever not use it.
774
-
775
- The `--multires_noise_discount` parameter controls the extent to which the noise amount at each resolution is weakened. A value of 0.1 is recommended. The `--multires_noise_iterations` parameter determines the number of iterations for adding multi-resolution noise, with a recommended range of 6 to 10.
776
-
777
- Please note that `--multires_noise_discount` has no effect without `--multires_noise_iterations`.
778
-
779
- ###### Implementation Details
780
-
781
- The `get_noise_noisy_latents_and_timesteps` function samples noise that will be added to the latents. If `args.noise_offset` is true, it applies a noise offset. If `args.multires_noise_iterations` is true, it applies multi-resolution noise to the sampled noise.
782
-
783
- The function then samples a random timestep for each image and adds noise to the latents according to the noise magnitude at each timestep. This is the forward diffusion process.
784
-
785
- The `pyramid_noise_like` function generates noise with a pyramid structure. It starts with the original noise and adds upscaled noise at decreasing resolutions. The noise at each level is scaled by a discount factor raised to the power of the level. The noise is then scaled back to roughly unit variance. This function is used to implement the multi-resolution noise.
786
-
787
- ```python
788
- --multires_noise_iterations=10 --multires_noise_discount=0.1
789
- ```
790
-
791
- ---
792
-
793
- ##### `--sample_prompts` and `--sample_sampler` and `--sample_every_n_steps`
794
-
795
- You have the option of generating images during training so you can check the progress. The `--sample_sampler` argument lets you pick between different samplers; by default it is set to `ddim`, so you better change it!
796
-
797
- You can also use `--sample_every_n_epochs` instead which will take precedence over steps. The `k_` prefix means karras and the `_a` suffix means ancestral.
798
-
799
- ```py
800
- --sample_prompts=/training_dir/sample-prompts.txt --sample_sampler="euler_a" --sample_every_n_steps=100
801
- ```
802
-
803
- My recommendation for Pony is to use `euler_a` for toony and for realistic `k_dpm_2`.
804
-
805
- Your sampler options include the following:
806
-
807
- ```bash
808
- ddim, pndm, lms, euler, euler_a, heun, dpm_2, dpm_2_a, dpmsolver, dpmsolver++, dpmsingle, k_lms, k_euler, k_euler_a, k_dpm_2, k_dpm_2_a
809
- ```
810
-
811
- ---
812
-
813
- So, the whole thing would look something like this:
814
-
815
- ```bash
816
- accelerate launch --num_cpu_threads_per_process=2 "./sdxl_train_network.py" \
817
- --lowram \
818
- --pretrained_model_name_or_path="/ponydiffusers/" \
819
- --train_data_dir="/training_dir" \
820
- --resolution="1024,1024" \
821
- --output_dir="/output_dir" \
822
- --enable_bucket \
823
- --min_bucket_reso=256 \
824
- --max_bucket_reso=1024 \
825
- --network_alpha=4 \
826
- --save_model_as="safetensors" \
827
- --network_module="lycoris.kohya" \
828
- --network_args \
829
- "preset=full" \
830
- "conv_dim=256" \
831
- "conv_alpha=4" \
832
- "use_tucker=False" \
833
- "use_scalar=False" \
834
- "rank_dropout_scale=False" \
835
- "algo=locon" \
836
- "train_norm=False" \
837
- "block_dims=8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8" \
838
- "block_alphas=0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625,0.0625" \
839
- --network_dropout=0 \
840
- --lr_scheduler="cosine" \
841
- --learning_rate=0.0001 \
842
- --unet_lr=0.0001 \
843
- --text_encoder_lr=0.0001 \
844
- --network_dim=8 \
845
- --output_name="yifftoolkit" \
846
- --scale_weight_norms=1 \
847
- --no_half_vae \
848
- --save_every_n_epochs=50 \
849
- --mixed_precision="fp16" \
850
- --save_precision="fp16" \
851
- --caption_extension=".txt" \
852
- --cache_latents \
853
- --cache_latents_to_disk \
854
- --optimizer_type="AdamW" \
855
- --max_grad_norm=1 \
856
- --keep_tokens=1 \
857
- --max_data_loader_n_workers=8 \
858
- --bucket_reso_steps=32 \
859
- --multires_noise_iterations=10 \
860
- --multires_noise_discount=0.1 \
861
- --log_prefix=xl-locon \
862
- --gradient_accumulation_steps=12 \
863
- --gradient_checkpointing \
864
- --train_batch_size=8 \
865
- --dataset_repeats=0 \
866
- --max_train_steps=400 \
867
- --shuffle_caption \
868
- --sdpa \
869
- --sample_prompts=/training_dir/sample-prompts.txt \
870
- --sample_sampler="euler_a" \
871
- --sample_every_n_steps=100
872
- ```
873
-
874
- ---
875
-
876
- ## Embeddings for 1.5 and SDXL
877
-
878
- Embeddings in Stable Diffusion are high-dimensional representations of input data, such as images or text, that capture their essential features and relationships. These embeddings are used to guide the diffusion process, enabling the model to generate outputs that closely match the desired characteristics specified in the input.
879
-
880
- You can find in the [`/embeddings`](https://huggingface.co/k4d3/yiff_toolkit/tree/main/embeddings) folder a whole bunch of them I collected for SD 1.5 that I later converted with [this](https://huggingface.co/spaces/FoodDesert/Embedding_Converter) tool for SDXL.
881
-
882
- ## ComfyUI Walkthrough any%
883
-
884
- ⚠️ Coming next year! ⚠️
885
-
886
- ---
887
-
888
- ## AnimateDiff for Masochists
889
-
890
- ⚠️ Coming in 2026! ⚠️
891
-
892
- ---
893
-
894
- ## Stable Cascade Furry Bible
895
-
896
- ### Resonance Cascade
897
-
898
- 🍆
899
-
900
- ---
901
-
902
- ## SDXL Furry Bible
903
-
904
- ### Some Common Knowledge Stuff
905
-
906
- [Resolution Lora](https://huggingface.co/jiaxiangc/res-adapter/resolve/main/sdxl-i/resolution_lora.safetensors?download=true) is a nice thing to have, it will help with consistency. For SDXL it is just a LoRA you can load in and it will do its magic. No need for a custom node or extension in this case.
907
-
908
- ### SeaArt Furry
909
-
910
- ---
911
-
912
- SeaArt's furry model sadly has its cons not just pros, yes it might come with artist knowledge bundled, but it seems to have trouble doing more than one character or everyone is bad at prompting, oh and it uses raw e621 tags, which just means you have to use underscores `_` instead of spaces&nbsp;` `&nbsp; inside the tags.
913
-
914
- ⚠️ TODO: Prompting tips.
915
-
916
- ### Pony Diffusion V6
917
-
918
- ---
919
-
920
- #### Requirements
921
-
922
- Download the [model](https://civitai.com/models/257749/pony-diffusion-v6-xl) and load it into whatever you use to generate images.
923
-
924
- #### Positive Prompt Stuff
925
-
926
- ```python
927
- score_9, score_8_up, score_7_up, score_6_up, rating_explicit, source_furry,
928
- ```
929
-
930
- I just assumed you wanted _explicit_ and _furry_, you can also set the rating to `rating_safe` or `rating_questionable` and the source to `source_anime`, `source_cartoon`, `source_pony`, `source_rule34` and optionally mix them however you'd like. It's your life! `score_9` is an interesting tag, the model seems to have put all its "_artsy_" knowledge into it. You might want to check if it is to your taste. The other interesting tag is `score_5_up` which seems to have learned a little bit of everything regarding quality while `score_4_up` seems to be at the bottom of the autism spectrum regarding art, I do not recommend using it, but you can do whatever you want!
931
-
932
- You can talk to Pony in three ways, use tags only, tags are neat, but you can also just type in
933
- `The background is of full white marble towers in greek architecture style and a castle.` and use natural language to the fullest extent, but the best way is to mix both; it's actually recommended, since the score tags are by definition tags and you need to use them! There are also artist styles that seeped into some random tokens during training; there is a community effort by some weebs to sort them [here](https://lite.framacalc.org/4ttgzvd0rx-a6jf).
934
-
935
- Other nice words to have in the box depending on your mood:
936
-
937
- ```python
938
- detailed background, amazing_background, scenery porn
939
- ```
940
-
941
- Other types of backgrounds include:
942
-
943
- ```python
944
- simple background, abstract background, spiral background, geometric background, heart background, gradient background, monotone background, pattern background, dotted background, striped background, textured background, blurred background
945
- ```
946
-
947
- After `simple background` you can also define a color for the background like `white background` to get a simple white background.
948
-
949
- For the character portrayal you can set many different types:
950
-
951
- ```python
952
- three-quarter view, full-length portrait, headshot portrait, bust portrait, half-length portrait, torso shot
953
- ```
954
-
955
- It's a good idea to describe your subject or subjects first: start with `solo` or `duo` or maybe `trio, group`, and then finally start describing your character in an interesting situation.
956
-
957
- #### Negative Prompt Stuff
958
-
959
- ⚠️
960
-
961
- #### How to Prompt Female Anthro Lions
962
-
963
- ```python
964
- anthro ⚠️?
965
- ```
966
-
967
- ---
968
 
969
  ## Pony Diffusion V6 LoRAs
970
 
 
14
 
15
  ---
16
 
17
+ The README has moved to [https://cringe.live/docs/yiff_toolkit/](https://cringe.live/docs/yiff_toolkit/)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  ## Pony Diffusion V6 LoRAs
20