# Fine-grained Activation Offloading (in collaboration with rednote)
Memory capacity has become increasingly important with the rise of extremely sparse MoE models such as DeepSeek-V3 and Qwen3-235B. Fine-grained recomputation reduces the memory footprint at the cost of extra recomputation, while offloading can exploit host-device bandwidth to achieve nearly zero overhead. Fine-grained Activation Offloading offloads activations at the granularity of specific modules, so the amount of offloaded activation can be calibrated to maximize training throughput.
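The sketch below illustrates one way module-granularity offloading can be realized in PyTorch: tensors saved for backward inside selected modules are copied to pinned host memory during the forward pass and copied back to the device when backward needs them. The class name `offload_saved_activations`, the size threshold, and the use of `torch.autograd.graph.saved_tensors_hooks` are illustrative assumptions, not this repository's implementation; a production version would overlap the copies with compute on a dedicated CUDA stream.

```python
# Illustrative sketch only (assumed PyTorch usage, not this repo's code):
# offload tensors saved for backward to pinned CPU memory at module
# granularity, and bring them back to the GPU for the backward pass.
import torch
from torch.autograd.graph import saved_tensors_hooks


class offload_saved_activations(saved_tensors_hooks):
    def __init__(self, min_bytes: int = 1 << 20):
        def pack(t: torch.Tensor):
            # Offload only sizeable device tensors; keep small ones on GPU.
            if t.is_cuda and t.numel() * t.element_size() >= min_bytes:
                host = torch.empty(t.shape, dtype=t.dtype,
                                   device="cpu", pin_memory=True)
                host.copy_(t)  # D2H copy (blocking here for clarity)
                return ("cpu", host, t.device)
            return ("gpu", t, None)

        def unpack(saved):
            kind, t, device = saved
            if kind == "cpu":
                # H2D copy when backward first needs this activation.
                return t.to(device, non_blocking=True)
            return t

        super().__init__(pack, unpack)


# Wrap only the modules whose activations should be offloaded,
# e.g. the MoE expert MLPs, while attention stays on device.
def forward_with_offload(module: torch.nn.Module, x: torch.Tensor):
    with offload_saved_activations():
        return module(x)
```

In the actual feature, the set of offloading modules is selected through the command-line flags shown in the Usage section below.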
**Features**
* Support PP=1/PP/Interleaved PP
* Compatible with fine-grained recomputation
* Support FP8
* Support MTP
* Support mixed dense & MoE layers
* Support A2A Overlap
* Support CUDA Graph
  * (Temporary) The CUDA graph scope cannot contain the offloading modules
**Usage**
```bash
# Enable fine-grained activation offloading
--fine-grained-activation-offloading
# Specify which modules should offload their inputs