.gitattributes CHANGED
@@ -18,7 +18,6 @@
18
  *.ot filter=lfs diff=lfs merge=lfs -text
19
  *.parquet filter=lfs diff=lfs merge=lfs -text
20
  *.pb filter=lfs diff=lfs merge=lfs -text
21
- *.pdf filter=lfs diff=lfs merge=lfs -text
22
  *.pickle filter=lfs diff=lfs merge=lfs -text
23
  *.pkl filter=lfs diff=lfs merge=lfs -text
24
  *.png filter=lfs diff=lfs merge=lfs -text
 
18
  *.ot filter=lfs diff=lfs merge=lfs -text
19
  *.parquet filter=lfs diff=lfs merge=lfs -text
20
  *.pb filter=lfs diff=lfs merge=lfs -text
 
21
  *.pickle filter=lfs diff=lfs merge=lfs -text
22
  *.pkl filter=lfs diff=lfs merge=lfs -text
23
  *.png filter=lfs diff=lfs merge=lfs -text
The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:274a19a2577ed220cd3a102b4469c44310e4a7c8e8f8ebc36842d907cb51e127
3
- size 14059172
 
 
 
 
assets/images/256px-PDF.png DELETED

Git LFS Details

  • SHA256: 48b7ab9362d78d22ca0d66b2943406759e85cffb86b585176990035d12ac2c7d
  • Pointer size: 129 Bytes
  • Size of remote file: 5.46 kB
dist/assets/.DS_Store DELETED
Binary file (6.15 kB)
 
dist/assets/images/256px-PDF.png DELETED

Git LFS Details

  • SHA256: 38ba9d71a429465ee0b469b43dd0969790e9cfe72c02857f31a75412b3c9e81e
  • Pointer size: 129 Bytes
  • Size of remote file: 1.25 kB
dist/bibliography.bib CHANGED
@@ -488,7 +488,7 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
- url = {https://github.com/pytorch/ao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
 
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
+ url = {https://github.com/pytorch/torchao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
dist/distill.bundle.js CHANGED
@@ -2146,7 +2146,7 @@ function _arrayWithHoles(r) { if (Array.isArray(r)) return r; }
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
- }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">Hugging Face</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n <div class=\"side pdf-download\">\n <a href=\"https://huggingface.co/spaces/nanotron/ultrascale-playbook/resolve/main/The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf\">Download PDF\n <br>\n <img style=\"width: 32px;\" src=\"../assets/images/256px-PDF.png\" alt=\"PDF\"></a>\n \n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
 
2146
  function bylineTemplate(frontMatter) {
2147
  return "\n <div class=\"byline grid\">\n <div>\n <h3>Authors</h3>\n <div>\n ".concat(frontMatter.authors.map(function (author, i) {
2148
  return "\n <span class=\"author\">\n ".concat(author.personalURL ? "\n <a class=\"name\" href=\"".concat(author.personalURL, "\">").concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</a>" : "\n <span class=\"name\">".concat(author.name) + (i + 1 < frontMatter.authors.length ? "," : "") + "</span>", "\n </span>\n ");
2149
+ }).join(''), "\n </div>\n </div>\n <div >\n <h3>Affiliation</h3>\n <div><a href=\"https://huggingface.co/\">Hugging Face</a>\n </div>\n </div>\n <div >\n <h3>Published</h3>\n <div>Feb 19, 2025</div>\n </div>\n </div>\n");
2150
  }
2151
  var Byline = /*#__PURE__*/function (_HTMLElement4) {
2152
  function Byline() {
dist/distill.bundle.js.map CHANGED
The diff for this file is too large to render. See raw diff
 
dist/index.html CHANGED
@@ -75,7 +75,7 @@
75
  <p>
76
  Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
- <aside>Reading time: 2-4 days. <br>For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
  This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
@@ -361,7 +361,7 @@
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
- <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
@@ -374,7 +374,7 @@
374
  <p class="note-box-title">📝 Note</p>
375
  <div class="note-box-content">
376
  <p>
377
- You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
@@ -389,7 +389,7 @@
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
- <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
@@ -403,7 +403,7 @@
403
 
404
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
405
 
406
- <p>Why does the first step look different: the activations increase quickly and then plateau for a while. In this first step the torch cache allocator does a lot of preparation preparing memory allocations to speed up the subsequent steps so that they don’t require searching for free memory blocks afterwards (see <a href="https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html">Zach’s blog</a>). After the first step we also see the optimizer states appearing which generally offset the memory usage for further training steps.</p>
407
 
408
  <aside>Ever noticed how sometimes the training succeeds in the first step but then OOMs during the following training steps? This can be explained by the build-up of the optimizer state after the first step.
409
  </aside>
@@ -611,7 +611,7 @@
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
- <p>Before that, let's quickly see how we can vizualise computation and communication with a short tour of one of the most useful tool in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
@@ -1320,6 +1320,7 @@
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
 
1323
  </ul>
1324
 
1325
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
@@ -1363,7 +1364,7 @@
1363
 
1364
  <p>There is one important exception though as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
1365
 
1366
- <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1367
 
1368
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast! Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1369
 
@@ -1745,7 +1746,7 @@
1745
  </table>
1746
  </div>
1747
 
1748
- <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches and the choice between both will depend whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible un-necessary communication overhead.</p>
1749
 
1750
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1751
 
@@ -1793,7 +1794,7 @@
1793
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1794
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1795
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1796
- <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism, the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1797
  </ul>
1798
 
1799
  <table>
@@ -1912,7 +1913,7 @@
1912
 
1913
  <h2>Finding the Best Training Configuration</h2>
1914
 
1915
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remain a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1916
 
1917
  <p>We touched this a little bit in the previous section but let's now walk in details through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1918
 
@@ -2272,7 +2273,7 @@
2272
 
2273
  </ol>
2274
 
2275
- <p>Let’s talk about one of the most frequent technique we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2276
 
2277
  <h4>Memory Coalescing</h4>
2278
 
@@ -2410,7 +2411,7 @@
2410
 
2411
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2412
 
2413
- <p>Let's briefly go through a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2414
 
2415
  <h4>Minimizing Control Divergence</h4>
2416
 
@@ -2571,7 +2572,7 @@
2571
 
2572
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this with the cost of sacrificing more precision. In case of float8 the situation is even more dire as e4m3 can represent 7 and e5m2 only 3 number on the interval 1-2.</p>
2573
 
2574
- <p>A common metric to measure a formats resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19^{-7}</d-math>). For float16 it is ~ <d-math>10^{-3}</d-math> and for bfloat 10x higher still.</p>
2575
 
2576
  <p>The idea of mixed precision training is to use some of these lower precisions formats while maintaining the performance of full precision training. </p>
2577
 
@@ -3379,7 +3380,7 @@
3379
 
3380
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3381
 
3382
- <p>Now that we covered the fundamental operations for distributed training and you should now be ready to follow the blog post easily.</p>
3383
 
3384
  <h3>A1: Distributed Training Profiling</h3>
3385
 
@@ -3828,8 +3829,7 @@
3828
  }
3829
  if (level === 0)
3830
  ToC += '<div>' + link + '</div>';
3831
- else
3832
- // else if (level === 1)
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
 
75
  <p>
76
  Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
+ <aside>Reading time: 2-4 days. For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
  This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
 
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
+ <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
 
374
  <p class="note-box-title">📝 Note</p>
375
  <div class="note-box-content">
376
  <p>
377
+ You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
 
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
+ <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
 
403
 
404
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
405
 
406
+ <p>Why does the first step look different: the activations increase quickly and then plateau for a while. In this first step the torch cache allocator does a lot of preparation preparing memory allocations to speed up the subsequent steps so that they don’t require searching for free memory blocks afterwards (see <a href="https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html">Zach’s blog</a>). After the first step we also see the optimizer states appearing which generally offset the memory usage for further training steps.</p>
407
 
408
  <aside>Ever noticed how sometimes the training succeeds in the first step but then OOMs during the following training steps? This can be explained by the build-up of the optimizer state after the first step.
409
  </aside>
 
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
+ <p>Before that, let's quickly see how we can visualize computation and communication with a short tour of one of the most useful tools in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
 
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1323
+ <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1324
  </ul>
1325
 
1326
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
 
1364
 
1365
  <p>There is one important exception though as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or in the case of causal attention at least attends to each previous token.</p>
1366
 
1367
+ <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1368
 
1369
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast! Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1370
 
 
1746
  </table>
1747
  </div>
1748
 
1749
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches and the choice between both will depend on whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible un-necessary communication overhead.</p>
1750
 
1751
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1752
 
 
1794
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1795
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1796
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1797
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism, the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1798
  </ul>
1799
 
1800
  <table>
 
1913
 
1914
  <h2>Finding the Best Training Configuration</h2>
1915
 
1916
+ <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1917
 
1918
  <p>We touched this a little bit in the previous section but let's now walk in details through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1919
 
 
2273
 
2274
  </ol>
2275
 
2276
+ <p>Let’s talk about one of the most frequent techniques we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2277
 
2278
  <h4>Memory Coalescing</h4>
2279
 
 
2411
 
2412
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2413
 
2414
+ <p>Let's briefly mention a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2415
 
2416
  <h4>Minimizing Control Divergence</h4>
2417
 
 
2572
 
2573
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this with the cost of sacrificing more precision. In case of float8 the situation is even more dire as e4m3 can represent 7 and e5m2 only 3 number on the interval 1-2.</p>
2574
 
2575
+ <p>A common metric to measure a format's resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19 \times 10^{-7}</d-math>). For float16 it is <d-math>\sim 10^{-3}</d-math> and for bfloat 10x higher still.</p>
2576
 
2577
  <p>The idea of mixed precision training is to use some of these lower precisions formats while maintaining the performance of full precision training. </p>
2578
 
 
3380
 
3381
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3382
 
3383
+ <p>Now that we have covered the fundamental operations for distributed training, you should be ready to follow the blog post easily.</p>
3384
 
3385
  <h3>A1: Distributed Training Profiling</h3>
3386
 
 
3829
  }
3830
  if (level === 0)
3831
  ToC += '<div>' + link + '</div>';
3832
+ else if (level === 1)
 
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
src/bibliography.bib CHANGED
@@ -488,7 +488,7 @@ url = {https://github.com/meta-llama/llama3/blob/main/MODEL_CARD.md}
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
- url = {https://github.com/pytorch/ao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
 
488
  @software{torchao,
489
  title = {torchao: PyTorch native quantization and sparsity for training and inference},
490
  author = {torchao maintainers and contributors},
491
+ url = {https://github.com/pytorch/torchao},
492
  license = {BSD-3-Clause},
493
  month = oct,
494
  year = {2024}
src/distill.js CHANGED
@@ -2105,12 +2105,6 @@ d-appendix > distill-appendix {
2105
  <div>Feb 19, 2025</div>
2106
  </div>
2107
  </div>
2108
- <div class="side pdf-download">
2109
- <a href="https://huggingface.co/spaces/nanotron/ultrascale-playbook/resolve/main/The_Ultra-Scale_Playbook_Training_LLMs_on_GPU_Clusters.pdf">Download PDF
2110
- <br>
2111
- <img style="width: 32px;" src="../assets/images/256px-PDF.png" alt="PDF"></a>
2112
-
2113
- </div>
2114
  `;
2115
  }
2116
 
 
2105
  <div>Feb 19, 2025</div>
2106
  </div>
2107
  </div>
 
 
 
 
 
 
2108
  `;
2109
  }
2110
 
src/index.html CHANGED
@@ -75,7 +75,7 @@
75
  <p>
76
  Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
- <aside>Reading time: 2-4 days. <br>For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
  This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
@@ -361,7 +361,7 @@
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
- <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
@@ -374,7 +374,7 @@
374
  <p class="note-box-title">📝 Note</p>
375
  <div class="note-box-content">
376
  <p>
377
- You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
@@ -389,7 +389,7 @@
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
- <p>Using the Pytorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough how to profile your model.</aside>
395
 
@@ -403,7 +403,7 @@
403
 
404
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
405
 
406
- <p>Why does the first step look different: the activations increase quickly and then plateau for a while. In this first step the torch cache allocator does a lot of preparation preparing memory allocations to speed up the subsequent steps so that they don’t require searching for free memory blocks afterwards (see <a href="https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html">Zach’s blog</a>). After the first step we also see the optimizer states appearing which generally offset the memory usage for further training steps.</p>
407
 
408
  <aside>Ever noticed how sometimes the training succeeds in the first step but then OOMs during the following training steps? This can be explained by the build-up of the optimizer state after the first step.
409
  </aside>
@@ -611,7 +611,7 @@
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
- <p>Before that, let's quickly see how we can vizualise computation and communication with a short tour of one of the most useful tool in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
@@ -1320,6 +1320,7 @@
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
 
1323
  </ul>
1324
 
1325
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
@@ -1363,7 +1364,7 @@
1363
 
1364
  <p>There is one important exception though, as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or, in the case of causal attention, at least attend to each previous token.</p>
1365
 
1366
- <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1367
 
1368
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast? Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1369
 
@@ -1745,7 +1746,7 @@
1745
  </table>
1746
  </div>
1747
 
1748
- <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches and the choice between both will depend whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible un-necessary communication overhead.</p>
1749
 
1750
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1751
 
@@ -1793,7 +1794,7 @@
1793
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1794
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1795
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1796
- <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component with the exception that modules and layers need to be balanced in Pipeline Parallelism, the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1797
  </ul>
1798
 
1799
  <table>
@@ -1912,7 +1913,7 @@
1912
 
1913
  <h2>Finding the Best Training Configuration</h2>
1914
 
1915
- <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remain a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1916
 
1917
  <p>We touched this a little bit in the previous section but let's now walk in details through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1918
 
@@ -2272,7 +2273,7 @@
2272
 
2273
  </ol>
2274
 
2275
- <p>Let’s talk about one of the most frequent technique we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2276
 
2277
  <h4>Memory Coalescing</h4>
2278
 
@@ -2410,7 +2411,7 @@
2410
 
2411
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2412
 
2413
- <p>Let's briefly go through a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2414
 
2415
  <h4>Minimizing Control Divergence</h4>
2416
 
@@ -2571,7 +2572,7 @@
2571
 
2572
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this with the cost of sacrificing more precision. In case of float8 the situation is even more dire as e4m3 can represent 7 and e5m2 only 3 number on the interval 1-2.</p>
2573
 
2574
- <p>A common metric to measure a formats resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19^{-7}</d-math>). For float16 it is ~ <d-math>10^{-3}</d-math> and for bfloat 10x higher still.</p>
2575
 
2576
  <p>The idea of mixed precision training is to use some of these lower precisions formats while maintaining the performance of full precision training. </p>
2577
 
@@ -3379,7 +3380,7 @@
3379
 
3380
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3381
 
3382
- <p>Now that we covered the fundamental operations for distributed training and you should now be ready to follow the blog post easily.</p>
3383
 
3384
  <h3>A1: Distributed Training Profiling</h3>
3385
 
@@ -3828,8 +3829,7 @@
3828
  }
3829
  if (level === 0)
3830
  ToC += '<div>' + link + '</div>';
3831
- else
3832
- // else if (level === 1)
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835
 
 
75
  <p>
76
  Thousands of GPUs humming in perfect harmony. That's what it takes to train today's most powerful AI models – a symphony of computing power that until recently was the exclusive domain of elite research labs. Open source has transformed this landscape, but not completely. Yes, you can download the latest <a href="https://huggingface.co/meta-llama">Llama</a> or <a href="https://huggingface.co/deepseek-ai">DeepSeek</a> models. Yes, you can read their <a href="https://ai.meta.com/research/publications/the-llama-3-herd-of-models/">technical</a> and <a href="https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf">experiment</a> reports. But the most challenging part – the training code, the knowledge and techniques necessary to coordinate GPUs to train these massive systems – remains shrouded in complexity and spread around a series of disconnected papers and often private codebases.
77
  </p>
78
+ <aside>Reading time: 2-4 days. For the best reading experience, we recommend not using a mobile phone.</aside>
79
  <p>
80
  This open-source book is here to change that. Starting from the basics, we'll walk you through the knowledge necessary to scale the training of large language models from one GPU to tens, hundreds and even thousands of GPUs, illustrating theory with practical code examples and reproducible benchmarks.
81
  </p>
 
361
 
362
  <h3>Memory usage in Transformers</h3>
363
 
364
+ <p>When training a neural network model, one stores several items in memory:</p>
365
 
366
  <ul>
367
  <li>Model weights</li>
 
374
  <p class="note-box-title">📝 Note</p>
375
  <div class="note-box-content">
376
  <p>
377
+ You would think for a model you could compute the memory requirements exactly but there are a few additional memory occupants that make it hard to be exact:
378
  <ul>
379
  <li>CUDA Kernels typically require 1-2 GB of GPU memory, which you can quickly verify by running <code>import torch; torch.ones((1, 1)).to("cuda")</code> and then checking the GPU memory with <code>nvidia-smi</code>.</li>
380
  <li>Some rest memory usage from buffers, intermediate results and some memory that can’t be used due to fragmentation</li>
 
389
 
390
  <h4>Profiling the memory usage</h4>
391
 
392
+ <p>Using the PyTorch profiler we can understand how memory is allocated throughout training. We can see that memory utilization is not a static thing but varies a lot during training and during a training step:</p>
393
 
394
  <aside>Check out <a target="_self" href="#a1%3A_distributed_training_profiling" class="">A1: Distributed Training Profiling</a> for a walkthrough of how to profile your model.</aside>
395
 
 
403
 
404
  <p>Clearly the first step looks very different from the subsequent ones, but let’s first have a look at the general anatomy of a step: first the activations increase quickly as we do the forward pass, then during the backward pass the gradients build up and as the backward pass propagates, the stored activations used to compute the gradients are progressively cleared. Finally, we perform the optimization step during which we need all the gradients and then update the optimizer states before we start the next forward pass. </p>
405
 
406
+ <p>Why does the first step look different? The activations increase quickly and then plateau for a while. In this first step the torch cache allocator does a lot of preparation, setting up memory allocations to speed up the subsequent steps so that they don’t require searching for free memory blocks afterwards (see <a href="https://zdevito.github.io/2022/08/04/cuda-caching-allocator.html">Zach’s blog</a>). After the first step we also see the optimizer states appearing which generally offset the memory usage for further training steps.</p>
407
 
408
  <aside>Ever noticed how sometimes the training succeeds in the first step but then OOMs during the following training steps? This can be explained by the build-up of the optimizer state after the first step.
409
  </aside>
 
611
 
612
  <p>But if you’ve carefully followed, you probably noticed that the forward/backward passes for each micro-batch can actually be run in parallel. Forward/backward passes are independent from each other, with independent input samples being the only difference. Seems like it’s time to start extending our training to more than one GPU! </p>
613
 
614
+ <p>Before that, let's quickly see how we can visualize computation and communication with a short tour of one of the most useful tools in the distributed training toolbox: the <strong>profiler</strong>. This tool will be extremely useful to understand and validate how communications between GPUs and compute are happening and where bottlenecks are.</p>
615
 
616
  <h4>Profiling GPU compute and communication</h4>
617
 
 
1320
  <ul>
1321
  <li>for both methods we notice the biggest performance drop when we move from TP=8 to TP=16, because that’s when we move from only communicating within a single node (NVLink), to communicating inter-nodes (EFA)</li>
1322
  <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1323
+ <li>the memory savings in activations when using TP with SP helps us fit far bigger batches than TP alone</li>
1324
  </ul>
1325
 
1326
  <p><strong>We have seen how TP helps us shard activations across several GPUs by splitting the attention and feedforward operations along the hidden dimension and how SP is a natural complement for the remaining operations by splitting along the sequence dimension.</strong></p>
 
1364
 
1365
  <p>There is one important exception though, as we need to pay particular attention to the <strong>Attention blocks</strong> (haha.. pun intended :D). In the attention module each token needs to access key/value pairs from <strong>all</strong> other sequence tokens or, in the case of causal attention, at least attend to each previous token.</p>
1366
 
1367
+ <p>Because Context Parallelism splits the inputs along the sequence dimension across GPUs, the attention module will require full communication between GPUs to exchange the necessary key/value data.</p>
1368
 
1369
  <p>That sounds very expensive if we do it naively. Is there a way to do this rather efficiently and fast? Thankfully there is: a core technique to handle this communication of key/value pairs efficiently is called <em>Ring Attention</em>.</p>
1370
 
 
1746
  </table>
1747
  </div>
1748
 
1749
+ <p>As you can see, ZeRO-3 and PP solve the same challenge but involve different approaches and the choice between both will depend on whether you decide to focus communication either on weights or on activations. While they can be combined, it's not often done in practice as doing so requires increasing the global batch size significantly to amortize the communication costs, creating a tradeoff between global batch size, model size, network bandwidth, and training efficiency. If you decide to combine them, ZeRO-3 should be configured to keep the weights in memory during the series of PP micro-batches to minimize as much as possible unnecessary communication overhead.</p>
1750
 
1751
  <p>On the other hand, ZeRO-1 and ZeRO-2, which focus on optimizer states and gradients, can be easily combined with Pipeline Parallelism and are complementary to it. Combining them doesn't raise any particular new challenge. For instance, the training of DeepSeek-v3 used PP combined with ZeRO-1 (sic).</p>
1752
 
 
1794
  <li>Tensor Parallelism (and Sequence Parallelism) affects computation throughout the entire model by sharding both weights and activations.</li>
1795
  <li>Context Parallelism primarily impacts attention layers since that's where cross-sequence communication is required, with other layers operating independently on sharded sequences.</li>
1796
  <li>Expert Parallelism primarily affects the MoE layers (which replace standard MLP blocks), leaving attention and other components unchanged</li>
1797
+ <li>Pipeline Parallelism and ZeRO are not especially specific to any sub-module or component, with the exception that modules and layers need to be balanced in Pipeline Parallelism; the first and last layers are thus often treated differently due to the additional embedding layers.</li>
1798
  </ul>
1799
 
1800
  <table>
 
1913
 
1914
  <h2>Finding the Best Training Configuration</h2>
1915
 
1916
+ <p>We’ve now covered all the parallelism techniques that are actually used to distribute and train larger models as well as how and why they can be combined together. There remains a general question: which ones should we choose in the end and how to decide on a specific combination?</p>
1917
 
1918
  <p>We touched on this a little bit in the previous section but let's now walk in detail through a possible decision process, step by step, keeping in mind that you'll always have to run a few experiments to find the definitive optimal setup for your compute cluster given its various physical properties, network bandwidth, GPUs per node, memory per GPU, etc.</p>
1919
 
 
2273
 
2274
  </ol>
2275
 
2276
+ <p>Let’s talk about one of the most frequent techniques we can use in CUDA: optimizing memory access. The global memory in GPUs (the largest memory in our above graph) has a long latency and low bandwidth in comparison to the cache which often creates a major bottleneck for most applications. Efficiently accessing data from global memory can improve performance by a lot.</p>
2277
 
2278
  <h4>Memory Coalescing</h4>
2279
 
 
2411
 
2412
  <p>So it seems warps are stalling waiting for shared memory accesses to return! To solve this issue we can apply a technique called <strong>Thread Coarsening</strong> which involves merging several threads into a single coarsened thread. This will significantly reduce shared memory accesses as each coarsened thread can handle multiple output elements.</p>
2413
 
2414
+ <p>Let's briefly mention a last important consideration when writing or improving custom kernels: <strong>Minimizing Control Divergence</strong>.</p>
2415
 
2416
  <h4>Minimizing Control Divergence</h4>
2417
 
 
2572
 
2573
  <p>We can see here that bfloat16 maintained the range of float32 over float16 but did this with the cost of sacrificing more precision. In case of float8 the situation is even more dire as e4m3 can represent 7 and e5m2 only 3 numbers on the interval 1-2.</p>
2574
 
2575
+ <p>A common metric to measure a format's resolution is epsilon: the first representable number after <d-math>1.00</d-math>. We can see that for the float32 format <d-math>10^{-4}</d-math> is an upper bound (it’s actually <d-math>1.19 \times 10^{-7}</d-math>). For float16 it is <d-math>\sim 10^{-3}</d-math> and for bfloat16 10x higher still.</p>
2576
 
2577
  <p>The idea of mixed precision training is to use some of these lower precision formats while maintaining the performance of full precision training. </p>
2578
 
 
3380
 
3381
  <p>There are a few finer points in the decision tree that we leave to the reader to explore in the PyTorch guide referenced above.</p>
3382
 
3383
+ <p>Now that we've covered the fundamental operations for distributed training, you should be ready to follow the blog post easily.</p>
3384
 
3385
  <h3>A1: Distributed Training Profiling</h3>
3386
 
 
3829
  }
3830
  if (level === 0)
3831
  ToC += '<div>' + link + '</div>';
3832
+ else if (level === 1)
 
3833
  ToC += '<li>' + link + '</li>';
3834
  }
3835