Commit f396003
[slimtensor] Add CUDA Storage with DeviceTraits and memory allocation
Pull Request resolved: #16769

This diff adds CUDA storage infrastructure to SlimTensor, enabling GPU memory allocation and management.

**Key changes:**

1. **`cuda/Guard.h`** - CUDAGuard RAII class:
   - Saves the current CUDA device on construction and restores it on destruction
   - Exception-safe device context switching
   - Constructors accept a device index or a Device object

2. **`core/Storage.h`** - Extended for CUDA support:
   - Added a `DeviceTraits<DeviceType::CUDA>` specialization with:
     - `allocate()` - Uses cudaMallocAsync on the device's current stream (the caller is expected to set the device/stream, e.g. via CUDAStreamGuard)
     - `free()` - Uses cudaFreeAsync (falling back to cudaFree), logging a warning on error
     - `memcpy()` - Supports Host↔Device and Device↔Device copies
   - Added a `DEFAULT_CUDA_DEVICE` constant
   - Updated the `MaybeOwningStorage` constructor to handle CUDA devices
   - Stub implementation that aborts with an error when `CUDA_AVAILABLE` is not defined

ghstack-source-id: 335102161
@exported-using-ghexport

Differential Revision: [D91202899](https://our.internmc.facebook.com/intern/diff/D91202899/)
1 parent 3cd3b92 commit f396003
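The `cuda/Guard.h` diff itself is not among the files shown below. As a rough illustration of the RAII pattern the commit message describes, a device guard built on nothing but the plain CUDA runtime API could look like the following sketch (`CudaDeviceGuardSketch` is a hypothetical name, not the actual CUDAGuard):

```cpp
#include <cuda_runtime.h>

// Hypothetical illustration only -- the real class lives in cuda/Guard.h.
class CudaDeviceGuardSketch {
 public:
  explicit CudaDeviceGuardSketch(int target_device) {
    // Save the current device so it can be restored later.
    cudaGetDevice(&prev_device_);
    if (target_device != prev_device_) {
      cudaSetDevice(target_device);
    }
  }

  // Restoring in the destructor makes the switch exception-safe:
  // the previous device comes back on every exit path.
  ~CudaDeviceGuardSketch() {
    cudaSetDevice(prev_device_);
  }

  // Non-copyable: copying a guard would restore the device twice.
  CudaDeviceGuardSketch(const CudaDeviceGuardSketch&) = delete;
  CudaDeviceGuardSketch& operator=(const CudaDeviceGuardSketch&) = delete;

 private:
  int prev_device_ = 0;
};
```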

File tree

8 files changed: +518 -80 lines
backends/aoti/slim/c10/cuda/Exception.h

Lines changed: 40 additions & 0 deletions

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#ifdef CUDA_AVAILABLE

#include <cuda.h>
#include <cuda_runtime.h>

#include <executorch/backends/aoti/slim/c10/macros/Macros.h>
#include <executorch/runtime/platform/assert.h>
#include <executorch/runtime/platform/log.h>

/// Checks a CUDA expression and aborts on error.
/// @param EXPR The CUDA expression to check.
#define ET_CUDA_CHECK(EXPR)                                                 \
  do {                                                                      \
    const cudaError_t __err = EXPR;                                         \
    ET_CHECK_MSG(                                                           \
        __err == cudaSuccess, "CUDA error: %s", cudaGetErrorString(__err)); \
  } while (0)

/// Checks a CUDA expression and logs a warning on error (non-fatal).
/// @param EXPR The CUDA expression to check.
#define ET_CUDA_LOG_WARN(EXPR)                                     \
  do {                                                             \
    const cudaError_t __err = EXPR;                                \
    if (SLIMTENSOR_UNLIKELY(__err != cudaSuccess)) {               \
      [[maybe_unused]] auto error_unused = cudaGetLastError();     \
      ET_LOG(Error, "CUDA warning: %s", cudaGetErrorString(__err)); \
    }                                                              \
  } while (0)

#endif // CUDA_AVAILABLE
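For context, a hypothetical call site for these macros might look like this (sketch only; `macro_usage_sketch` is not part of the diff, and a CUDA build is assumed):

```cpp
#include <cuda_runtime.h>
#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>

void macro_usage_sketch(size_t nbytes) {
  void* ptr = nullptr;
  // Fatal path: aborts via ET_CHECK_MSG with "CUDA error: ..." on failure.
  ET_CUDA_CHECK(cudaMalloc(&ptr, nbytes));
  // Non-fatal path: logs "CUDA warning: ..." and clears the sticky error
  // state via cudaGetLastError(), but execution continues.
  ET_CUDA_LOG_WARN(cudaFree(ptr));
}
```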
Lines changed: 6 additions & 0 deletions

load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
load(":targets.bzl", "define_common_targets")

oncall("executorch")

define_common_targets()
backends/aoti/slim/c10/cuda/targets.bzl

Lines changed: 16 additions & 0 deletions

load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

def define_common_targets():
    """Define targets for SlimTensor CUDA exception handling module."""

    runtime.cxx_library(
        name = "exception",
        exported_headers = [
            "Exception.h",
        ],
        visibility = ["@EXECUTORCH_CLIENTS"],
        exported_deps = [
            "//executorch/backends/aoti/slim/c10/macros:macros",
            "//executorch/runtime/platform:platform",
        ],
    )
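The `exception` library defined here is what `backends/aoti/slim/core/targets.bzl` (further below) pulls into the storage target's deps.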

backends/aoti/slim/core/Storage.h

Lines changed: 133 additions & 8 deletions

@@ -10,12 +10,18 @@
 
 #include <cstring>
 
+#ifdef CUDA_AVAILABLE
+#include <executorch/backends/aoti/slim/c10/cuda/Exception.h>
+#include <executorch/backends/cuda/runtime/guard.h>
+#endif
+
 #include <executorch/backends/aoti/slim/c10/core/Device.h>
 #include <executorch/backends/aoti/slim/c10/core/ScalarType.h>
 #include <executorch/backends/aoti/slim/util/ArrayRefUtil.h>
 #include <executorch/backends/aoti/slim/util/SharedPtr.h>
 #include <executorch/backends/aoti/slim/util/SizeUtil.h>
 #include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/log.h>
 
 namespace executorch::backends::aoti::slim {
 
@@ -30,6 +36,10 @@ inline void noop(void*) {}
 /// Default CPU device constant.
 inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
 
+/// Default CUDA device constant.
+inline const c10::Device DEFAULT_CUDA_DEVICE =
+    c10::Device(c10::DeviceType::CUDA, 0);
+
 /// DeviceTraits template for device-specific operations.
 /// Device-specific implementations provide allocate(), free(), and memcpy().
 template <c10::DeviceType D>
@@ -74,6 +84,119 @@ struct DeviceTraits<c10::DeviceType::CPU> {
   }
 };
 
+#ifdef CUDA_AVAILABLE
+/// CUDA specialization of DeviceTraits.
+/// Provides CUDA memory allocation and copy operations using
+/// cudaMallocAsync/cudaFreeAsync with proper stream handling.
+///
+/// IMPORTANT: Callers are expected to set the correct CUDA device and stream
+/// using CUDAStreamGuard before calling these methods. This is consistent
+/// with PyTorch's CUDACachingAllocator design pattern where the allocator
+/// assumes the caller has already set the correct device context.
+template <>
+struct DeviceTraits<c10::DeviceType::CUDA> {
+  /// Allocates CUDA device memory on the current stream.
+  /// Uses cudaMallocAsync for asynchronous allocation on the stream
+  /// that is currently set via CUDAStreamGuard, similar to how
+  /// PyTorch's CUDACachingAllocator works.
+  ///
+  /// NOTE: Caller must ensure the correct device is already set via
+  /// CUDAStreamGuard. This function does NOT create a device guard internally.
+  ///
+  /// @param nbytes Number of bytes to allocate.
+  /// @param device The target CUDA device (used to get the stream).
+  /// @return Pointer to allocated device memory.
+  static void* allocate(size_t nbytes, const c10::Device& device) {
+    // Get the current stream for this device (set by CUDAStreamGuard if any)
+    // This follows PyTorch's pattern where the allocator assumes the caller
+    // has already set the correct device via CUDAStreamGuard.
+    auto stream_result =
+        executorch::backends::cuda::getCurrentCUDAStream(device.index());
+    ET_CHECK_MSG(
+        stream_result.ok(),
+        "Failed to get current CUDA stream for device %d",
+        static_cast<int>(device.index()));
+
+    cudaStream_t stream = stream_result.get();
+    void* data = nullptr;
+    ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream));
+    return data;
+  }
+
+  /// Frees CUDA device memory on the current stream.
+  /// @param ptr Pointer to device memory to free.
+  static void free(void* ptr) {
+    // Get the current stream for the current device
+    auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
+    if (stream_result.ok()) {
+      ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+    } else {
+      // Fallback to synchronous free if we can't get the stream
+      ET_CUDA_LOG_WARN(cudaFree(ptr));
+    }
+  }
+
+  /// Copies memory between CPU and CUDA or CUDA and CUDA.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device.
+  /// @param src_device Source device.
+  static void memcpy(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device) {
+    cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
+
+    if (src_device.is_cpu()) {
+      direction = cudaMemcpyHostToDevice;
+    } else if (dst_device.is_cpu()) {
+      direction = cudaMemcpyDeviceToHost;
+    } else {
+      ET_CHECK_MSG(
+          src_device.index() == dst_device.index(),
+          "CUDA memcpy across different device indices not supported: %d != %d",
+          static_cast<int>(src_device.index()),
+          static_cast<int>(dst_device.index()));
+    }
+
+    ET_CUDA_CHECK(cudaMemcpy(dst, src, nbytes, direction));
+  }
+};
+#else
+/// CUDA stub when CUDA_AVAILABLE is not defined.
+/// All operations abort with an error message.
+template <>
+struct DeviceTraits<c10::DeviceType::CUDA> {
+  static void* allocate(size_t nbytes, const c10::Device& device) {
+    (void)nbytes;
+    (void)device;
+    ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+
+  static void free(void* ptr) {
+    (void)ptr;
+    ET_LOG(Error, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+
+  static void memcpy(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device) {
+    (void)dst;
+    (void)src;
+    (void)nbytes;
+    (void)dst_device;
+    (void)src_device;
+    ET_CHECK_MSG(false, "Build with CUDA_AVAILABLE=1 to enable CUDA support");
+  }
+};
+#endif // CUDA_AVAILABLE
+
 /**
  * MaybeOwningStorage - A storage class that manages tensor data memory.
  *
@@ -93,17 +216,19 @@ struct DeviceTraits<c10::DeviceType::CPU> {
 class MaybeOwningStorage {
  public:
   /// Constructs owning storage with allocated memory.
-  /// @param device The device for storage (must be CPU).
+  /// @param device The device for storage (CPU or CUDA).
   /// @param nbytes Number of bytes to allocate.
   MaybeOwningStorage(const c10::Device& device, size_t nbytes)
       : device_(device), capacity_(nbytes), is_owning_(true) {
-    ET_CHECK_MSG(
-        device.is_cpu(),
-        "Only CPU device is currently supported, got: %s",
-        device.str().c_str());
-
-    data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
-    deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
+    if (device.is_cpu()) {
+      data_ = DeviceTraits<c10::DeviceType::CPU>::allocate(nbytes, device);
+      deleter_ = DeviceTraits<c10::DeviceType::CPU>::free;
+    } else if (device.is_cuda()) {
+      data_ = DeviceTraits<c10::DeviceType::CUDA>::allocate(nbytes, device);
+      deleter_ = DeviceTraits<c10::DeviceType::CUDA>::free;
+    } else {
+      ET_CHECK_MSG(false, "Unsupported device type: %s", device.str().c_str());
+    }
   }
 
   /// Default constructor is deleted - storage must have a device.
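Putting the pieces together, a hypothetical caller of the new CUDA path might look like the sketch below (assumes a CUDA build with `CUDA_AVAILABLE` defined, a valid device 0, and a current stream available; `storage_usage_sketch` and its arguments are illustrative, not part of the diff):

```cpp
#include <executorch/backends/aoti/slim/core/Storage.h>

using namespace executorch::backends::aoti::slim;

void storage_usage_sketch(const float* host_src, size_t n) {
  const size_t nbytes = n * sizeof(float);

  // Owning CUDA storage: the constructor dispatches to the CUDA traits,
  // so the bytes come from cudaMallocAsync on device 0's current stream
  // and are released with cudaFreeAsync when the storage is destroyed.
  MaybeOwningStorage storage(DEFAULT_CUDA_DEVICE, nbytes);

  // Using the traits directly: src_device is CPU, so memcpy() resolves
  // the direction to cudaMemcpyHostToDevice.
  void* dev = DeviceTraits<c10::DeviceType::CUDA>::allocate(
      nbytes, DEFAULT_CUDA_DEVICE);
  DeviceTraits<c10::DeviceType::CUDA>::memcpy(
      dev, host_src, nbytes, DEFAULT_CUDA_DEVICE, CPU_DEVICE);
  DeviceTraits<c10::DeviceType::CUDA>::free(dev);
}
```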

backends/aoti/slim/core/targets.bzl

Lines changed: 3 additions & 1 deletion

@@ -17,10 +17,12 @@ def define_common_targets():
             "//executorch/backends/aoti/slim/util:shared_ptr",
             "//executorch/backends/aoti/slim/util:size_util",
             "//executorch/runtime/platform:platform",
+            "//executorch/backends/aoti/slim/c10/cuda:exception",
+            "//executorch/backends/cuda/runtime:guard",
         ],
     )
 
-    # Header-only library for SlimTensor
+    # Header-only library for SlimTensor (CPU-only for now)
     runtime.cxx_library(
         name = "slimtensor",
         headers = [

backends/aoti/slim/core/test/targets.bzl

Lines changed: 28 additions & 9 deletions

@@ -1,17 +1,36 @@
+load("@fbcode_macros//build_defs/lib:re_test_utils.bzl", "re_test_utils")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
+def get_backend_mode():
+    """Get the supported backend mode of slimtensor."""
+    return ["cuda", "cpu"]
+
 def define_common_targets():
     """Define test targets for SlimTensor core module."""
 
-    runtime.cxx_test(
-        name = "test_storage",
-        srcs = [
-            "test_storage.cpp",
-        ],
-        deps = [
-            "//executorch/backends/aoti/slim/core:storage",
-        ],
-    )
+    # GPU storage test with CUDA support
+    for backend_mode in get_backend_mode():
+        backend_suffix = "_" + backend_mode if backend_mode == "cuda" else ""
+
+        backend_kwargs = {
+            "external_deps": [("cuda", None, "cuda-lazy")],
+            "preprocessor_flags": ["-DCUDA_AVAILABLE=1"],
+            "keep_gpu_sections": True,
+            "remote_execution": re_test_utils.remote_execution(
+                platform = "gpu-remote-execution",
+            ),
+        } if backend_mode == "cuda" else {}
+
+        runtime.cxx_test(
+            name = "test_storage" + backend_suffix,
+            srcs = [
+                "test_storage.cpp",
+            ],
+            deps = [
+                "//executorch/backends/aoti/slim/core:storage",
+            ],
+            **backend_kwargs
+        )
 
     runtime.cxx_test(
         name = "test_slimtensor_basic",
