Skip to content

Commit 95d1eb6

Browse files
committed
[slimtensor] integration into backend
Pull Request resolved: #16565 This diff makes the CUDA backend actually use SlimTensor. It: 1. updates cuda_backends to create a SlimTensor from the given ETensor; 2. removes the duplicate ETensor-driven shim layers under cuda_backend; 3. updates the CMake logic in both the CUDA backend and the AOTI backend. Performance remains the same as before. {F1984982156} Note that we currently keep two sets of common shims: one is ETensor-based and used by the Metal backend, and the other is SlimTensor-based and used by the CUDA backend, so as not to impact the Metal backend work. Once the Metal backend finishes the migration, we should delete the duplicate common shims and keep only the SlimTensor-based one. ghstack-source-id: 336538676 @exported-using-ghexport Differential Revision: [D90606409](https://our.internmc.facebook.com/intern/diff/D90606409/)
1 parent cca559e commit 95d1eb6

39 files changed

Lines changed: 3405 additions & 9174 deletions

CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ if(EXECUTORCH_ENABLE_BUNDLE_IO)
123123
add_definitions(-DET_BUNDLE_IO_ENABLED)
124124
endif()
125125

126+
if(EXECUTORCH_BUILD_CUDA)
127+
add_definitions(-DCUDA_AVAILABLE=1)
128+
endif()
129+
126130
# -ffunction-sections -fdata-sections: breaks function and data into sections so
127131
# they can be properly gc'd. -s: strip symbol.
128132
if(WIN32)

backends/aoti/CMakeLists.txt

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,10 @@ endif()
2525
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
2626
find_package_torch()
2727

28-
# Common AOTI functionality - combines all AOTI common components
28+
# ==============================================================================
29+
# AOTI common shims using ETensor (for Metal backend)
30+
# TODO(gasoonjia): Remove this after metal migration
31+
# ==============================================================================
2932
set(_aoti_common_sources common_shims.cpp)
3033
add_library(aoti_common STATIC ${_aoti_common_sources})
3134
target_include_directories(
@@ -59,3 +62,42 @@ install(
5962
EXPORT ExecuTorchTargets
6063
DESTINATION ${CMAKE_INSTALL_LIBDIR}
6164
)
65+
66+
# ==============================================================================
67+
# AOTI common shims using SlimTensor (for CUDA backend) Uses SlimTensor for all
68+
# tensor operations
69+
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
70+
# ==============================================================================
71+
add_library(aoti_common_shims_slim STATIC common_shims_slim.cpp)
72+
target_include_directories(
73+
aoti_common_shims_slim
74+
PUBLIC $<BUILD_INTERFACE:${EXECUTORCH_ROOT}> $<INSTALL_INTERFACE:include>
75+
$<BUILD_INTERFACE:${EXECUTORCH_ROOT}/..>
76+
)
77+
target_compile_options(
78+
aoti_common_shims_slim
79+
PUBLIC $<$<CXX_COMPILER_ID:MSVC>:/EHsc /GR>
80+
$<$<NOT:$<CXX_COMPILER_ID:MSVC>>:-fexceptions -frtti -fPIC>
81+
)
82+
target_compile_definitions(
83+
aoti_common_shims_slim PUBLIC $<$<PLATFORM_ID:Windows>:EXPORT_AOTI_FUNCTIONS>
84+
)
85+
86+
# Add CUDA include directories and link CUDA runtime when building with CUDA
87+
if(EXECUTORCH_BUILD_CUDA)
88+
find_package(CUDAToolkit REQUIRED)
89+
target_include_directories(
90+
aoti_common_shims_slim PUBLIC ${CUDAToolkit_INCLUDE_DIRS}
91+
)
92+
target_link_libraries(aoti_common_shims_slim PUBLIC CUDA::cudart)
93+
endif()
94+
95+
target_link_libraries(
96+
aoti_common_shims_slim PUBLIC slimtensor extension_tensor ${CMAKE_DL_LIBS}
97+
)
98+
99+
install(
100+
TARGETS aoti_common_shims_slim
101+
EXPORT ExecuTorchTargets
102+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
103+
)

backends/aoti/targets.bzl

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ def define_common_targets():
3333
],
3434
)
3535

36-
# AOTI common shims functionality
36+
# AOTI common shims functionality using ETensor
37+
# TODO(gasoonjia): Remove this after metal migration
3738
runtime.cxx_library(
3839
name = "common_shims",
3940
srcs = [
@@ -89,6 +90,7 @@ def define_common_targets():
8990

9091
# SlimTensor-based common shims library
9192
# Uses SlimTensor for all tensor operations
93+
# TODO(gasoonjia): Replace common_shims with this one after metal migration
9294
runtime.cxx_library(
9395
name = "common_shims_slim",
9496
srcs = [
@@ -97,10 +99,27 @@ def define_common_targets():
9799
headers = [
98100
"common_shims_slim.h",
99101
"export.h",
102+
"utils.h",
100103
],
101104
visibility = ["@EXECUTORCH_CLIENTS"],
102105
exported_deps = [
103106
"//executorch/runtime/core:core",
107+
"//executorch/runtime/core/exec_aten:lib",
104108
"//executorch/backends/aoti/slim/core:slimtensor",
105109
],
106110
)
111+
112+
# Common AOTI functionality for SlimTensor-based backends (combining common_shims_slim and delegate_handle)
113+
# All CUDA backend code should depend on this target
114+
# TODO(gasoonjia): Replace aoti_common with this one after metal migration
115+
runtime.cxx_library(
116+
name = "aoti_common_slim",
117+
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
118+
link_whole = True,
119+
supports_python_dlopen = True,
120+
visibility = ["PUBLIC"],
121+
exported_deps = [
122+
":common_shims_slim",
123+
":delegate_handle",
124+
],
125+
)

backends/cuda/CMakeLists.txt

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,13 +99,18 @@ install(
9999

100100
# CUDA-specific AOTI shim symbols (dynamically linked)
101101
set(_aoti_cuda_shim_sources
102-
runtime/shims/memory.cpp runtime/shims/tensor_attribute.cpp
103-
runtime/guard.cpp runtime/shims/cuda_guard.cpp runtime/shims/int4mm.cu
104-
${EXECUTORCH_ROOT}/backends/aoti/common_shims.cpp
102+
runtime/shims/memory.cpp
103+
runtime/shims/cuda_guard.cpp
104+
runtime/shims/int4mm.cu
105+
${EXECUTORCH_ROOT}/backends/aoti/common_shims_slim.cpp
106+
${EXECUTORCH_ROOT}/backends/aoti/slim/cuda/guard.cpp
105107
)
106108

107109
add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})
108110

111+
# Define CUDA_AVAILABLE to use SlimTensor on GPU in common_shims_slim.h
112+
target_compile_definitions(aoti_cuda_shims PRIVATE CUDA_AVAILABLE=1)
113+
109114
# Define export macros for shared library
110115
if(MSVC)
111116
target_compile_definitions(aoti_cuda_shims PRIVATE EXPORT_AOTI_FUNCTIONS)

backends/cuda/runtime/TARGETS

Lines changed: 12 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -3,28 +3,6 @@ load("//tools/build/buck:nvcc_flags.bzl", "get_nvcc_arch_args")
33

44
oncall("executorch")
55

6-
runtime.cxx_library(
7-
name = "guard",
8-
srcs = [
9-
"guard.cpp",
10-
],
11-
headers = [
12-
"guard.h",
13-
"utils.h",
14-
],
15-
visibility = ["PUBLIC"],
16-
deps = [
17-
"//executorch/runtime/platform:platform",
18-
],
19-
exported_deps = [
20-
"//executorch/runtime/core:core",
21-
"//executorch/runtime/core/exec_aten:lib",
22-
],
23-
external_deps = [
24-
("cuda", None, "cuda-lazy"),
25-
],
26-
)
27-
286
runtime.cxx_library(
297
name = "cuda_platform",
308
srcs = [
@@ -71,14 +49,12 @@ runtime.cxx_library(
7149
runtime.cxx_library(
7250
name = "runtime_shims",
7351
srcs = [
74-
"guard.cpp",
7552
"shims/cuda_guard.cpp",
7653
"shims/int4mm.cu",
7754
"shims/memory.cpp",
7855
"shims/tensor_attribute.cpp",
7956
],
8057
headers = [
81-
"guard.h",
8258
"shims/cuda_guard.h",
8359
"shims/int4mm.cuh",
8460
"shims/int4mm.h",
@@ -91,43 +67,18 @@ runtime.cxx_library(
9167
supports_python_dlopen = True,
9268
# Constructor needed for backend registration.
9369
compiler_flags = ["-Wno-global-constructors"],
70+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
9471
visibility = ["PUBLIC"],
9572
deps = [
9673
":tensor_maker",
97-
"//executorch/backends/aoti:common_shims",
98-
"//executorch/runtime/core:core",
99-
"//executorch/runtime/core/exec_aten:lib",
100-
"//executorch/runtime/platform:platform",
101-
"//executorch/backends/cuda/runtime:cuda_platform",
102-
],
103-
nvcc_flags = get_nvcc_arch_args() + [
104-
"-_NVCC_HOST_COMPILER_FLAG_",
105-
"gcc",
106-
],
107-
external_deps = [
108-
("cuda", None, "cuda-lazy"),
109-
],
110-
)
111-
112-
runtime.cxx_library(
113-
name = "runtime_shims_slim",
114-
srcs = [
115-
"shims/memory_slim.cpp",
116-
],
117-
headers = [
118-
"shims/memory_slim.h",
119-
],
120-
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
121-
link_whole = True,
122-
supports_python_dlopen = True,
123-
visibility = ["@EXECUTORCH_CLIENTS"],
124-
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
125-
deps = [
74+
"//executorch/backends/aoti:aoti_common_slim",
12675
"//executorch/backends/aoti/slim/core:slimtensor",
12776
"//executorch/backends/aoti/slim/factory:empty",
12877
"//executorch/backends/aoti/slim/factory:from_blob",
129-
"//executorch/backends/aoti:common_shims",
78+
"//executorch/backends/aoti/slim/cuda:guard",
13079
"//executorch/runtime/core:core",
80+
"//executorch/runtime/core/exec_aten:lib",
81+
"//executorch/runtime/core/exec_aten/util:tensor_util",
13182
"//executorch/runtime/platform:platform",
13283
],
13384
nvcc_flags = get_nvcc_arch_args() + [
@@ -149,10 +100,16 @@ runtime.cxx_library(
149100
supports_python_dlopen = True,
150101
# Constructor needed for backend registration.
151102
compiler_flags = ["-Wno-global-constructors"],
103+
preprocessor_flags = ["-DCUDA_AVAILABLE=1"],
152104
visibility = ["PUBLIC"],
153105
deps = [
154106
":runtime_shims",
155-
"//executorch/backends/aoti:aoti_common",
107+
"//executorch/backends/aoti:aoti_common_slim",
108+
"//executorch/backends/aoti/slim/core:slimtensor",
109+
"//executorch/backends/aoti/slim/factory:empty",
110+
"//executorch/backends/aoti/slim/factory:from_blob",
111+
"//executorch/backends/aoti/slim/factory:from_etensor",
112+
"//executorch/extension/tensor:tensor",
156113
"//executorch/runtime/backend:interface",
157114
"//executorch/runtime/core/exec_aten/util:tensor_util",
158115
],

0 commit comments

Comments
 (0)