Skip to content

Commit e391ea1

Browse files
committed
add rabbit feedback
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent 2931f61 commit e391ea1

File tree

2 files changed: +11 additions, -10 deletions

2 files changed: +11 additions, -10 deletions

modelopt/torch/quantization/model_calib.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -486,13 +486,12 @@ def get_error_func(self) -> Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
486486
def local_hessian_error(x: torch.Tensor, xq: torch.Tensor) -> torch.Tensor:
487487
"""Compute local Hessian-weighted error."""
488488
original_shape = x.shape
489-
dw = (x - xq).view(-1, 1, bs) # (num_blocks, 1, block_size)
490-
# Repeat hessian for each output channel
491-
hessian_expanded = hessian.repeat(
492-
cout, 1, 1
493-
) # (num_blocks, block_size, block_size)
494-
# Per-block loss: (num_blocks,)
495-
block_loss = (dw @ hessian_expanded @ dw.transpose(-1, -2)).squeeze(-1).squeeze(-1)
489+
# Reshape to (cout, num_blocks_per_cin, block_size)
490+
dw = (x - xq).view(cout, -1, bs)
491+
# Use einsum to avoid materializing cout-repeated Hessian
492+
# dw: (cout, n_blocks, bs), hessian: (n_blocks, bs, bs) -> (cout, n_blocks)
493+
block_loss = torch.einsum("cnb,nbd,cnd->cn", dw, hessian, dw)
494+
block_loss = block_loss.reshape(-1)
496495
error = block_loss.unsqueeze(-1).expand(-1, bs).reshape(original_shape)
497496
return error
498497

@@ -522,12 +521,14 @@ def forward(self, input, *args, **kwargs):
522521
# Setup helpers for all quantized linear modules
523522
name_to_module = dict(model.named_modules())
524523
weight_quantizers_info = []
524+
all_patched_modules = [] # Track all modules for cleanup (including disabled ones)
525525

526526
for name, module in name_to_module.items():
527527
if is_quantized_linear(module) and module.weight_quantizer.is_enabled:
528528
with enable_weight_access_and_writeback(module, model, name_to_module):
529529
module.local_hessian = LocalHessianHelper(module, name)
530530
module.local_hessian.setup()
531+
all_patched_modules.append((name, module))
531532
if module.local_hessian.is_enabled:
532533
weight_quantizers_info.append((name, module))
533534

@@ -619,7 +620,7 @@ def quant_func(x, amax, quantizer=weight_quantizer):
619620

620621
# Cleanup and free memory
621622
LocalHessianHelper.cache_mode = False
622-
for name, module in weight_quantizers_info:
623+
for name, module in all_patched_modules:
623624
module.local_hessian.cleanup()
624625

625626
print_rank_0("local_hessian: Calibration complete.")

tests/gpu/torch/quantization/test_quantize_cuda.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@
8787
mtq.NVFP4_AWQ_LITE_CFG,
8888
mtq.NVFP4_AWQ_CLIP_CFG,
8989
mtq.NVFP4_AWQ_FULL_CFG,
90-
mtq.NVFP4_LOCAL_HESSIAN_WEIGHT_ONLY_CFG,
90+
mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG,
9191
mtq.MXFP8_DEFAULT_CFG,
9292
mtq.MXFP6_DEFAULT_CFG,
9393
mtq.MXFP4_DEFAULT_CFG,
@@ -114,7 +114,7 @@ def test_quantize(model_cls, config):
114114
mtq.FP8_2D_BLOCKWISE_WEIGHT_ONLY_CFG,
115115
NVFP4_WEIGHT_ACT_MSE_CFG,
116116
NVFP4_WEIGHT_MSE_FP8_SWEEP_CFG,
117-
mtq.NVFP4_LOCAL_HESSIAN_WEIGHT_ONLY_CFG,
117+
mtq.NVFP4_W4A4_WEIGHT_LOCAL_HESSIAN_CFG,
118118
]:
119119
if get_cuda_ext_mx() is None:
120120
pytest.skip("cuda_ext_mx is not available")

0 commit comments

Comments (0)