# CUDA Backend

The CUDA backend is the ExecuTorch solution for running models on NVIDIA GPUs. It leverages the [AOTInductor](https://pytorch.org/docs/stable/torch.compiler_aot_inductor.html) compiler to generate optimized CUDA kernels with libtorch-free execution, and uses [Triton](https://triton-lang.org/) for high-performance GPU kernel generation.

## Features

- **Optimized GPU Execution**: Uses AOTInductor to generate highly optimized CUDA kernels for model operators
- **Triton Kernel Support**: Leverages Triton for GEMM (General Matrix Multiply), convolution, and SDPA (Scaled Dot-Product Attention) kernels
- **Quantization Support**: INT4 weight quantization with tile-packed format for improved performance and reduced memory footprint
- **Cross-Platform**: Supports both Linux and Windows platforms
- **Multiple Model Support**: Works with various models including LLMs, vision-language models, and audio models

## Target Requirements

Below are the requirements for running a CUDA-delegated ExecuTorch model:

- **Hardware**: NVIDIA GPU with a supported CUDA compute capability
- **CUDA Toolkit**: CUDA 11.x or later (CUDA 12.x recommended)
- **Operating System**: Linux or Windows
- **Drivers**: PyTorch-compatible NVIDIA GPU drivers installed
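
A quick way to sanity-check these requirements on the target machine is to query the driver and PyTorch's view of the GPU (a minimal check, assuming PyTorch is already installed):

```bash
# Check that the NVIDIA driver is installed and the GPU is visible
nvidia-smi

# Check that PyTorch was built with CUDA support and can see the device
python -c "import torch; print(torch.cuda.is_available(), torch.version.cuda)"
```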

## Development Requirements

To develop and export models using the CUDA backend:

- **Python**: Python 3.8+
- **PyTorch**: PyTorch with CUDA support
- **ExecuTorch**: ExecuTorch built with CUDA backend support
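
As a rough setup sketch (the CUDA version in the index URL is illustrative, and the exact ExecuTorch installation steps may differ; see the ExecuTorch installation instructions for the authoritative commands):

```bash
# Install a CUDA-enabled PyTorch build; pick the index URL matching your CUDA toolkit
pip install torch --index-url https://download.pytorch.org/whl/cu126

# Build and install ExecuTorch from source so the CUDA backend can be enabled
git clone https://github.com/pytorch/executorch.git
cd executorch
./install_executorch.sh
```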

## Using the CUDA Backend

### Exporting Models with Python API

The CUDA backend uses the `CudaBackend` and `CudaPartitioner` classes to export models. Here is a complete example:

```python
import torch
from executorch.backends.cuda.cuda_backend import CudaBackend
from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
from executorch.extension.export_util.utils import save_pte_program

# Configure edge compilation
edge_compile_config = EdgeCompileConfig(
    _check_ir_validity=False,
    _skip_dim_order=True,
)

# Define your model
model = YourModel().eval()
model_name = "your_model"
example_inputs = (torch.randn(1, 3, 224, 224),)

# Export the model using torch.export
exported_program = torch.export.export(model, example_inputs)

# Create the CUDA partitioner; the compile spec records the method name for the backend
partitioner = CudaPartitioner(
    [CudaBackend.generate_method_name_compile_spec(model_name)]
)

# Run decompositions so Triton can generate kernels for the remaining ops.
# conv1d_to_conv2d is a decomposition helper; see the export script linked
# below for a working definition.
exported_program = exported_program.run_decompositions({
    torch.ops.aten.conv1d.default: conv1d_to_conv2d,
})

# Lower to ExecuTorch with the CUDA backend
et_program = to_edge_transform_and_lower(
    exported_program,
    partitioner=[partitioner],
    compile_config=edge_compile_config,
)

# Convert to an executable program and save it
exec_program = et_program.to_executorch()
save_pte_program(exec_program, model_name, "./output_dir")
```

This generates `.pte` and `.ptd` files that can be executed on CUDA devices.

For a complete working example, see the [CUDA export script](https://github.com/pytorch/executorch/blob/main/examples/cuda/scripts/export.py).

----

## Runtime Integration

To run the model on device, use the standard ExecuTorch runtime APIs. See [Running on Device](getting-started.md#running-on-device) for more information.

When building from source, pass `-DEXECUTORCH_BUILD_CUDA=ON` when configuring the CMake build to compile the CUDA backend, then link the `aoti_cuda_backend` target into your application:

```cmake
# CMakeLists.txt
add_subdirectory("executorch")
...
target_link_libraries(
  my_target
  PRIVATE executorch
  extension_module_static
  extension_tensor
  aoti_cuda_backend)
```
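
A typical configure-and-build invocation for a source build might look like the following (directory names and any extra options are illustrative):

```bash
# Configure an out-of-source build with the CUDA backend enabled
cmake -S . -B cmake-out -DCMAKE_BUILD_TYPE=Release -DEXECUTORCH_BUILD_CUDA=ON

# Compile
cmake --build cmake-out --parallel
```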

No additional steps are necessary to use the backend beyond linking the target. CUDA-delegated `.pte` and `.ptd` files will automatically run on the registered backend.
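
As a minimal sketch of the runtime side, the `Module` and tensor extension APIs linked above can load and run the exported files roughly as follows (file names and the input shape are illustrative, and the exact `Module` constructor overloads may vary by ExecuTorch version):

```cpp
// Minimal sketch: load a CUDA-delegated program plus its weight data and run it.
// "model.pte" / "model.ptd" are illustrative names from the export step above.
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor.h>

#include <iostream>
#include <vector>

using executorch::extension::Module;
using executorch::extension::from_blob;

int main() {
  // The second argument points at the .ptd file holding the externally saved weights.
  Module module("model.pte", "model.ptd");

  // Build an input tensor matching the export-time shape (1, 3, 224, 224).
  std::vector<float> input(1 * 3 * 224 * 224, 1.0f);
  auto tensor = from_blob(input.data(), {1, 3, 224, 224});

  // Run the "forward" method; delegated portions execute on the CUDA backend.
  const auto result = module.forward(tensor);
  if (result.ok()) {
    const auto& output = result->at(0).toTensor();
    std::cout << "Output elements: " << output.numel() << std::endl;
  } else {
    std::cerr << "Inference failed" << std::endl;
  }
  return 0;
}
```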

----

## Examples

For complete end-to-end examples of exporting and running models with the CUDA backend, see:

- [Whisper](https://github.com/pytorch/executorch/blob/main/examples/models/whisper/README.md) — Audio transcription model with CUDA support
- [Voxtral](https://github.com/pytorch/executorch/blob/main/examples/models/voxtral/README.md) — Audio multimodal model with CUDA support
- [Gemma3](https://github.com/pytorch/executorch/blob/main/examples/models/gemma3/README.md) — Vision-language model with CUDA support

These examples demonstrate the full workflow including model export, quantization options, building runners, and runtime execution.

ExecuTorch provides Makefile targets for building these example runners:

```bash
make whisper-cuda  # Build Whisper runner with CUDA
make voxtral-cuda  # Build Voxtral runner with CUDA
make gemma3-cuda   # Build Gemma3 runner with CUDA
```