From 37849a3641e8c781f5805be17b7c6b7f34e73e2f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 28 Oct 2025 13:39:18 -0700 Subject: [PATCH 01/30] Resolve a Cython build warning. --- cuda_core/cuda/core/experimental/_event.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index dd6ef0b06e..051c216a4c 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -7,14 +7,13 @@ from __future__ import annotations cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy - from cuda.bindings cimport cydriver - from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN ) +import cython from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional @@ -277,7 +276,7 @@ cdef class IPCEventDescriptor: raise RuntimeError("IPCEventDescriptor objects cannot be instantiated directly. Please use Event APIs.") @classmethod - def _init(cls, reserved: bytes, busy_waited: bint): + def _init(cls, reserved: bytes, busy_waited: cython.bint): cdef IPCEventDescriptor self = IPCEventDescriptor.__new__(cls) self._reserved = reserved self._busy_waited = busy_waited From ac8a69ca34c93d1b9b77ffe3d5e617d6d23bfb42 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 28 Oct 2025 11:27:23 -0700 Subject: [PATCH 02/30] Make memory module into a package. 
--- .../core/experimental/_memory/__init__.py | 2 ++ .../cuda/core/experimental/_memory/memory.pxd | 36 +++++++++++++++++++ .../{_memory.pyx => _memory/memory.pyx} | 23 ++---------- 3 files changed, 41 insertions(+), 20 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/__init__.py create mode 100644 cuda_core/cuda/core/experimental/_memory/memory.pxd rename cuda_core/cuda/core/experimental/{_memory.pyx => _memory/memory.pyx} (99%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py new file mode 100644 index 0000000000..f97d27eada --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -0,0 +1,2 @@ +from .memory import * +from .memory import _SynchronousMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd new file mode 100644 index 0000000000..7dda135754 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uintptr_t, intptr_t +from cuda.core.experimental._stream cimport Stream as cyStream + +from cuda.core.experimental._stream import Stream + + +cdef class _cyBuffer: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef: + intptr_t _ptr + size_t _size + _cyMemoryResource _mr + object _ptr_obj + cyStream _alloc_stream + + +cdef class Buffer(_cyBuffer): + cpdef close(self, stream: Stream=*) + + +cdef class _cyMemoryResource: + """ + Internal only. Responsible for offering fast C method access. 
+ """ + cdef Buffer _allocate(self, size_t size, cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + + +cdef class MemoryResource(_cyMemoryResource): + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_memory.pyx rename to cuda_core/cuda/core/experimental/_memory/memory.pyx index 32519cd26c..5efceae3d1 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -8,11 +8,8 @@ cimport cpython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy - from cuda.bindings cimport cydriver - -from cuda.core.experimental._stream cimport Stream as cyStream -from cuda.core.experimental._stream cimport default_stream +from cuda.core.experimental._stream cimport default_stream, Stream as cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -33,7 +30,7 @@ from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) if TYPE_CHECKING: - from ._device import Device + from .._device import Device import uuid @@ -43,19 +40,6 @@ PyCapsule = TypeVar("PyCapsule") DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" - -cdef class _cyBuffer: - """ - Internal only. Responsible for offering fast C method access. - """ - cdef: - intptr_t _ptr - size_t _size - _cyMemoryResource _mr - object _ptr_obj - cyStream _alloc_stream - - cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
@@ -355,7 +339,6 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr) - cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -989,7 +972,7 @@ cdef class DeviceMemoryResource(MemoryResource): def _deep_reduce_device_memory_resource(mr): - from . import Device + from .._device import Device device = Device(mr.device_id) alloc_handle = mr.get_allocation_handle() return mr.from_allocation_handle, (device, alloc_handle) From 123aa2437c38daf88a6797737b92d77c9cacdd97 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 11:40:02 -0700 Subject: [PATCH 03/30] Rename cyStream to _cyStream for consistency. --- .../cuda/core/experimental/_memory/memory.pxd | 10 ++++---- .../cuda/core/experimental/_memory/memory.pyx | 24 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 7dda135754..e23c858149 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t, intptr_t -from cuda.core.experimental._stream cimport Stream as cyStream +from cuda.core.experimental._stream cimport Stream as _cyStream from cuda.core.experimental._stream import Stream @@ -17,7 +17,7 @@ cdef class _cyBuffer: size_t _size _cyMemoryResource _mr object _ptr_obj - cyStream _alloc_stream + _cyStream _alloc_stream cdef class Buffer(_cyBuffer): @@ -28,9 +28,9 @@ cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
""" - cdef Buffer _allocate(self, size_t size, cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + cdef Buffer _allocate(self, size_t size, _cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept cdef class MemoryResource(_cyMemoryResource): - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 5efceae3d1..5e71c30ba4 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -9,7 +9,7 @@ from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._stream cimport default_stream, Stream as cyStream +from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -44,10 +44,10 @@ cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
""" - cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef Buffer _allocate(self, size_t size, _cyStream stream): raise NotImplementedError - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: raise NotImplementedError @@ -106,7 +106,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = ptr self._size = size self._mr = mr - self._alloc_stream = (stream) if stream is not None else None + self._alloc_stream = <_cyStream>(stream) if stream is not None else None return self def __dealloc__(self): @@ -128,16 +128,16 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ - cdef cyStream s + cdef _cyStream s if self._ptr and self._mr is not None: if stream is None: if self._alloc_stream is not None: s = self._alloc_stream else: # TODO: remove this branch when from_handle takes a stream - s = (default_stream()) + s = <_cyStream>(default_stream()) else: - s = stream + s = <_cyStream>stream self._mr._deallocate(self._ptr, self._size, s) self._ptr = 0 self._mr = None @@ -348,7 +348,7 @@ cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) 
""" - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: self.deallocate(ptr, size, stream) @abc.abstractmethod @@ -867,7 +867,7 @@ cdef class DeviceMemoryResource(MemoryResource): raise return self._alloc_handle - cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef Buffer _allocate(self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -901,9 +901,9 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return self._allocate(size, stream) + return self._allocate(size, <_cyStream>stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: @@ -923,7 +923,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - self._deallocate(ptr, size, stream) + self._deallocate(ptr, size, <_cyStream>stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: From fe4b67e1d798e8f5e6229b55100c24052af19158 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 12:48:14 -0700 Subject: [PATCH 04/30] Move defs to memory.pxd header --- .../cuda/core/experimental/_memory/memory.pxd | 51 ++++++++++++++----- .../cuda/core/experimental/_memory/memory.pyx | 18 ------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index e23c858149..858901825b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -3,15 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t, intptr_t -from cuda.core.experimental._stream cimport Stream as _cyStream +from cuda.bindings cimport cydriver -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._stream cimport Stream as _cyStream cdef class _cyBuffer: - """ - Internal only. Responsible for offering fast C method access. - """ cdef: intptr_t _ptr size_t _size @@ -20,17 +17,47 @@ cdef class _cyBuffer: _cyStream _alloc_stream -cdef class Buffer(_cyBuffer): - cpdef close(self, stream: Stream=*) - - cdef class _cyMemoryResource: - """ - Internal only. Responsible for offering fast C method access. 
- """ cdef Buffer _allocate(self, size_t size, _cyStream stream) cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept +cdef class Buffer(_cyBuffer): + cpdef close(self, stream=*) + + cdef class MemoryResource(_cyMemoryResource): cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + + +cdef class IPCBufferDescriptor: + cdef: + bytes _reserved + size_t _size + + +cdef class IPCAllocationHandle: + cdef: + int _handle + object _uuid + + cpdef close(self) + + +cdef class DeviceMemoryResource(MemoryResource): + cdef: + int _dev_id + cydriver.CUmemoryPool _mempool_handle + object _attributes + cydriver.CUmemAllocationHandleType _ipc_handle_type + bint _mempool_owned + bint _is_mapped + object _uuid + IPCAllocationHandle _alloc_handle + object __weakref__ + + cpdef close(self) + cpdef IPCAllocationHandle get_allocation_handle(self) + cdef Buffer _allocate(self, size_t size, _cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 5e71c30ba4..7f753595f2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -399,10 +399,6 @@ cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocat cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" - cdef: - bytes _reserved - size_t _size - def __init__(self, *arg, **kwargs): raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -424,10 +420,6 @@ cdef class IPCBufferDescriptor: cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" - cdef: - int _handle - object _uuid - def __init__(self, *arg, **kwargs): raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") @@ -643,16 +635,6 @@ cdef class DeviceMemoryResource(MemoryResource): methods. The reconstruction procedure uses the registry to find the associated MMR. """ - cdef: - int _dev_id - cydriver.CUmemoryPool _mempool_handle - object _attributes - cydriver.CUmemAllocationHandleType _ipc_handle_type - bint _mempool_owned - bint _is_mapped - object _uuid - IPCAllocationHandle _alloc_handle - object __weakref__ def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID From ce77d446b511a9c3ba395b25f1944b8bf219fb46 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 13:05:14 -0700 Subject: [PATCH 05/30] Separate VMM. 
--- .../core/experimental/_memory/__init__.py | 2 + .../cuda/core/experimental/_memory/memory.pyx | 519 +---------------- .../cuda/core/experimental/_memory/vmm.py | 525 ++++++++++++++++++ 3 files changed, 534 insertions(+), 512 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/vmm.py diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index f97d27eada..998009f16c 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,2 +1,4 @@ from .memory import * from .memory import _SynchronousMemoryResource +from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 7f753595f2..d25b47fce2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -96,6 +96,13 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = None self._alloc_stream = None + def _clear(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -1054,515 +1061,3 @@ class _SynchronousMemoryResource(MemoryResource): return self._dev_id -VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None] -VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] -VirtualMemoryGranularityT = Literal["minimum", "recommended"] -VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None] -VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] - - -@dataclass -class VirtualMemoryResourceOptions: - """A configuration object for the VirtualMemoryResource - Stores configuration information which tells the resource how to use the CUDA VMM APIs - - Attributes - ---------- - allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT` - Controls the type of allocation. - location_type: :obj:`~_memory.VirtualMemoryLocationTypeT` - Controls the location of the allocation. - handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT` - Export handle type for the physical allocation. Use - ``"posix_fd"`` on Linux if you plan to - import/export the allocation (required for cuMemRetainAllocationHandle). - Use `None` if you don't need an exportable handle. - gpu_direct_rdma: bool - Hint that the allocation should be GDR-capable (if supported). - granularity: :obj:`~_memory.VirtualMemoryGranularityT` - Controls granularity query and size rounding. - addr_hint: int - A (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide. - addr_align: int - Alignment for the VA reservation. If `None`, use the queried granularity. - peers: Iterable[int] - Extra device IDs that should be granted access in addition to ``device``. - self_access: :obj:`~_memory.VirtualMemoryAccessTypeT` - Access flags for the owning device. - peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` - Access flags for peers. 
- """ - # Human-friendly strings; normalized in __post_init__ - allocation_type: VirtualMemoryAllocationTypeT = "pinned" - location_type: VirtualMemoryLocationTypeT = "device" - handle_type: VirtualMemoryHandleTypeT = "posix_fd" - granularity: VirtualMemoryGranularityT = "recommended" - gpu_direct_rdma: bool = False - addr_hint: Optional[int] = 0 - addr_align: Optional[int] = None - peers: Iterable[int] = field(default_factory=tuple) - self_access: VirtualMemoryAccessTypeT = "rw" - peer_access: VirtualMemoryAccessTypeT = "rw" - - _a = driver.CUmemAccess_flags - _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} - _h = driver.CUmemAllocationHandleType - _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} - _g = driver.CUmemAllocationGranularity_flags - _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} - _l = driver.CUmemLocationType - _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} - # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _a = driver.CUmemAllocationType - _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} - ver_major, ver_minor = get_binding_version() - if ver_major >= 13: - _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED - - @staticmethod - def _access_to_flags(spec: str): - flags = VirtualMemoryResourceOptions._access_flags.get(spec) - if flags is None: - raise ValueError(f"Unknown access spec: {spec!r}") - return flags - - @staticmethod - def _allocation_type_to_driver(spec: str): - alloc_type = 
VirtualMemoryResourceOptions._allocation_type.get(spec) - if alloc_type is None: - raise ValueError(f"Unsupported allocation_type: {spec!r}") - return alloc_type - - @staticmethod - def _location_type_to_driver(spec: str): - loc_type = VirtualMemoryResourceOptions._location_type.get(spec) - if loc_type is None: - raise ValueError(f"Unsupported location_type: {spec!r}") - return loc_type - - @staticmethod - def _handle_type_to_driver(spec: str): - handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) - if handle_type is None: - raise ValueError(f"Unsupported handle_type: {spec!r}") - return handle_type - - @staticmethod - def _granularity_to_driver(spec: str): - granularity = VirtualMemoryResourceOptions._granularity.get(spec) - if granularity is None: - raise ValueError(f"Unsupported granularity: {spec!r}") - return granularity - - -class VirtualMemoryResource(MemoryResource): - """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. - - Parameters - ---------- - device_id : int - Device ordinal for which a memory resource is constructed. - - config : VirtualMemoryResourceOptions - A configuration object for the VirtualMemoryResource - """ - def __init__(self, device, config: VirtualMemoryResourceOptions = None): - self.device = device - self.config = check_or_create_options( - VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False - ) - if self.config.location_type == "host": - self.device = None - if platform.system() == "Windows": - raise NotImplementedError("VirtualMemoryResource is not supported on Windows") - - # Validate RDMA support if requested - if self.config.gpu_direct_rdma and self.device is not None: - if not self.device.properties.gpu_direct_rdma_supported: - raise RuntimeError("GPU Direct RDMA is not supported on this device") - - @staticmethod - def _align_up(size: int, gran: int) -> int: - """ - Align a size up to the nearest multiple of a granularity. 
- """ - return (size + gran - 1) & ~(gran - 1) - - def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: - """ - Grow an existing allocation using CUDA VMM, with a configurable policy. - - This implements true growing allocations that preserve the base pointer - by extending the virtual address range and mapping additional physical memory. - - This function uses transactional allocation: if any step fails, the original buffer is not modified and - all steps the function took are rolled back so a new allocation is not created. - - Parameters - ---------- - buf : Buffer - The existing buffer to grow - new_size : int - The new total size for the allocation - config : VirtualMemoryResourceOptions, optional - Configuration for the new physical memory chunks. If None, uses current config. - - Returns - ------- - Buffer - The same buffer with updated size and properties, preserving the original pointer - """ - if config is not None: - self.config = config - - # Build allocation properties for new chunks - prop = driver.CUmemAllocationProp() - prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) - prop.location.id = self.device.device_id - prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) - - # Query granularity - gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) - res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - raise_if_driver_error(res) - - # Calculate sizes - additional_size = new_size - buf.size - if additional_size <= 0: - # Same size: only update access policy if needed; avoid zero-sized driver calls - descs = self._build_access_descriptors(prop) - if descs: - res, = 
driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) - raise_if_driver_error(res) - return buf - - aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) - total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) - aligned_prev_size = total_aligned_size - aligned_additional_size - addr_align = self.config.addr_align or gran - - # Try to extend the existing VA range first - res, new_ptr = driver.cuMemAddressReserve( - aligned_additional_size, - addr_align, - int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range - 0 - ) - - if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): - # Check for specific errors that are not recoverable with the slow path - if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): - raise_if_driver_error(res) - res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise_if_driver_error(res2) - # Fallback: couldn't extend contiguously, need full remapping - return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) - else: - # Success! We can extend the VA range contiguously - return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, new_ptr: int) -> Buffer: - """ - Fast path for growing a virtual memory allocation when the new region can be - reserved contiguously after the existing buffer. - - This function creates and maps new physical memory for the additional size, - sets access permissions, and updates the buffer size in place (the pointer - remains unchanged). - - Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. 
- prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). - - Returns: - Buffer: The same buffer object with its size updated to `new_size`. - """ - with Transaction() as trans: - # Create new physical memory for the additional size - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - raise_if_driver_error(res) - # Register undo for creation - trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) - raise_if_driver_error(res) - # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) - - # Set access permissions for the new portion - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) - raise_if_driver_error(res) - - # All succeeded, cancel undo actions - trans.commit() - - # Update the buffer size (pointer stays the same) - buf._size = new_size - return buf - - def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: - """ - Slow path for growing a virtual memory allocation when the new region cannot be - reserved contiguously after the existing buffer. 
- - This function reserves a new, larger virtual address (VA) range, remaps the old - physical memory to the beginning of the new VA range, creates and maps new physical - memory for the additional size, sets access permissions, and updates the buffer's - pointer and size. - - Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. - prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - total_aligned_size (int): The total new size to reserve, aligned to granularity. - addr_align (int): The required address alignment for the new VA range. - - Returns: - Buffer: The buffer object updated with the new pointer and size. - """ - with Transaction() as trans: - # Reserve a completely new, larger VA range - res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) - raise_if_driver_error(res) - # Register undo for VA reservation - trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) - - # Get the old allocation handle for remapping - result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) - raise_if_driver_error(result) - # Register undo for old_handle - trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Unmap the old VA range (aligned previous size) - aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) - raise_if_driver_error(result) - - def _remap_old(): - # Try to remap the old physical memory back to the original VA range - try: - res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) - raise_if_driver_error(res) - except Exception: - pass - trans.append(_remap_old) - - # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), 
aligned_prev_size, 0, old_handle, 0) - raise_if_driver_error(res) - - # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) - - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - raise_if_driver_error(res) - - # Register undo for new physical memory - trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) - raise_if_driver_error(res) - - # Register undo for mapping - trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) - - # Set access permissions for the entire new range - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) - raise_if_driver_error(res) - - # All succeeded, cancel undo actions - trans.commit() - - # Free the old VA range (aligned previous size) - res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) - raise_if_driver_error(res2) - - # Invalidate the old buffer so its destructor won't try to free again - buf._ptr = 0 - buf._ptr_obj = None - buf._size = 0 - buf._mr = None - - # Return a new Buffer for the new mapping - return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) - - - def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: - """ - Build access descriptors for memory access permissions. 
- - Returns - ------- - list - List of CUmemAccessDesc objects for setting memory access - """ - descs = [] - - # Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) - if peer_flags: - for peer_dev in self.config.peers: - d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags - descs.append(d) - - return descs - - - def allocate(self, size: int, stream: Stream = None) -> Buffer: - """ - Allocate a buffer of the given size using CUDA virtual memory. - - Parameters - ---------- - size : int - The size in bytes of the buffer to allocate. - stream : Stream, optional - CUDA stream to associate with the allocation (not currently supported). - - Returns - ------- - Buffer - A Buffer object representing the allocated virtual memory. - - Raises - ------ - NotImplementedError - If a stream is provided or if the location type is not device memory. - CUDAError - If any CUDA driver API call fails during allocation. - - Notes - ----- - This method uses transactional allocation: if any step fails, all resources - allocated so far are automatically cleaned up. The allocation is performed - with the configured granularity, access permissions, and peer access as - specified in the resource's configuration. 
- """ - if stream is not None: - raise NotImplementedError("Stream is not supported with VirtualMemoryResource") - - config = self.config - # ---- Build allocation properties ---- - prop = driver.CUmemAllocationProp() - prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id if config.location_type == "device" else -1 - prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) - - # ---- Query and apply granularity ---- - # Choose min vs recommended granularity per config - gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) - res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - raise_if_driver_error(res) - - aligned_size = VirtualMemoryResource._align_up(size, gran) - addr_align = config.addr_align or gran - - # ---- Transactional allocation ---- - with Transaction() as trans: - # ---- Create physical memory ---- - res, handle = driver.cuMemCreate(aligned_size, prop, 0) - raise_if_driver_error(res) - # Register undo for physical memory - trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # ---- Reserve VA space ---- - # Potentially, use a separate size for the VA reservation from the physical allocation size - res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) - raise_if_driver_error(res) - # Register undo for VA reservation - trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) - - # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) - raise_if_driver_error(res) - - # ---- 
Set access for owner + peers ----
-        descs = self._build_access_descriptors(prop)
-        if descs:
-            res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs))
-            raise_if_driver_error(res)
-
-        trans.commit()
-
-        # Done — return a Buffer that tracks this VA range
-        buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
-        return buf
-
-    def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None:
-        """
-        Deallocate memory on the device using CUDA VMM APIs.
-        """
-        result, handle = driver.cuMemRetainAllocationHandle(ptr)
-        raise_if_driver_error(result)
-        result, = driver.cuMemUnmap(ptr, size)
-        raise_if_driver_error(result)
-        result, = driver.cuMemAddressFree(ptr, size)
-        raise_if_driver_error(result)
-        result, = driver.cuMemRelease(handle)
-        raise_if_driver_error(result)
-
-
-    @property
-    def is_device_accessible(self) -> bool:
-        """
-        Indicates whether the allocated memory is accessible from the device.
-        """
-        return self.config.location_type == "device"
-
-    @property
-    def is_host_accessible(self) -> bool:
-        """
-        Indicates whether the allocated memory is accessible from the host.
-        """
-        return self.config.location_type == "host"
-
-    @property
-    def device_id(self) -> int:
-        """
-        Get the device ID associated with this memory resource.
-
-        Returns:
-            int: CUDA device ID. -1 if the memory resource allocates host memory
-        """
-        return self.device.device_id if self.config.location_type == "device" else -1
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the VirtualMemoryResource.
-
-        Returns:
-            str: A string describing the object
-        """
-        return f"<VirtualMemoryResource(device={self.device})>"
diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/vmm.py
new file mode 100644
index 0000000000..60ba8280d8
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/vmm.py
@@ -0,0 +1,525 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass, field
+from typing import Iterable, Literal, Optional, Union
+import platform
+
+from cuda.core.experimental._stream import Stream
+from cuda.core.experimental._memory.memory import Buffer, MemoryResource
+from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version )
+from cuda.core.experimental._utils.cuda_utils import (
+    _check_driver_error as raise_if_driver_error,
+    check_or_create_options,
+)
+
+VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None]
+VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
+VirtualMemoryGranularityT = Literal["minimum", "recommended"]
+VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None]
+VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
+
+
+@dataclass
+class VirtualMemoryResourceOptions:
+    """A configuration object for the VirtualMemoryResource
+    Stores configuration information which tells the resource how to use the CUDA VMM APIs
+
+    Attributes
+    ----------
+    allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT`
+        Controls the type of allocation.
+    location_type: :obj:`~_memory.VirtualMemoryLocationTypeT`
+        Controls the location of the allocation.
+    handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT`
+        Export handle type for the physical allocation. Use
+        ``"posix_fd"`` on Linux if you plan to
+        import/export the allocation (required for cuMemRetainAllocationHandle).
+        Use `None` if you don't need an exportable handle.
+    gpu_direct_rdma: bool
+        Hint that the allocation should be GDR-capable (if supported).
+    granularity: :obj:`~_memory.VirtualMemoryGranularityT`
+        Controls granularity query and size rounding.
+    addr_hint: int
+        An (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide.
+    addr_align: int
+        Alignment for the VA reservation.
If `None`, use the queried granularity. + peers: Iterable[int] + Extra device IDs that should be granted access in addition to ``device``. + self_access: :obj:`~_memory.VirtualMemoryAccessTypeT` + Access flags for the owning device. + peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` + Access flags for peers. + """ + # Human-friendly strings; normalized in __post_init__ + allocation_type: VirtualMemoryAllocationTypeT = "pinned" + location_type: VirtualMemoryLocationTypeT = "device" + handle_type: VirtualMemoryHandleTypeT = "posix_fd" + granularity: VirtualMemoryGranularityT = "recommended" + gpu_direct_rdma: bool = False + addr_hint: Optional[int] = 0 + addr_align: Optional[int] = None + peers: Iterable[int] = field(default_factory=tuple) + self_access: VirtualMemoryAccessTypeT = "rw" + peer_access: VirtualMemoryAccessTypeT = "rw" + + _a = driver.CUmemAccess_flags + _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} + _h = driver.CUmemAllocationHandleType + _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} + _g = driver.CUmemAllocationGranularity_flags + _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + _l = driver.CUmemLocationType + _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _a = driver.CUmemAllocationType + _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED + + 
@staticmethod + def _access_to_flags(spec: str): + flags = VirtualMemoryResourceOptions._access_flags.get(spec) + if flags is None: + raise ValueError(f"Unknown access spec: {spec!r}") + return flags + + @staticmethod + def _allocation_type_to_driver(spec: str): + alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) + if alloc_type is None: + raise ValueError(f"Unsupported allocation_type: {spec!r}") + return alloc_type + + @staticmethod + def _location_type_to_driver(spec: str): + loc_type = VirtualMemoryResourceOptions._location_type.get(spec) + if loc_type is None: + raise ValueError(f"Unsupported location_type: {spec!r}") + return loc_type + + @staticmethod + def _handle_type_to_driver(spec: str): + handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) + if handle_type is None: + raise ValueError(f"Unsupported handle_type: {spec!r}") + return handle_type + + @staticmethod + def _granularity_to_driver(spec: str): + granularity = VirtualMemoryResourceOptions._granularity.get(spec) + if granularity is None: + raise ValueError(f"Unsupported granularity: {spec!r}") + return granularity + + +class VirtualMemoryResource(MemoryResource): + """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. + + Parameters + ---------- + device_id : int + Device ordinal for which a memory resource is constructed. 
+ + config : VirtualMemoryResourceOptions + A configuration object for the VirtualMemoryResource + """ + def __init__(self, device, config: VirtualMemoryResourceOptions = None): + self.device = device + self.config = check_or_create_options( + VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False + ) + if self.config.location_type == "host": + self.device = None + if platform.system() == "Windows": + raise NotImplementedError("VirtualMemoryResource is not supported on Windows") + + # Validate RDMA support if requested + if self.config.gpu_direct_rdma and self.device is not None: + if not self.device.properties.gpu_direct_rdma_supported: + raise RuntimeError("GPU Direct RDMA is not supported on this device") + + @staticmethod + def _align_up(size: int, gran: int) -> int: + """ + Align a size up to the nearest multiple of a granularity. + """ + return (size + gran - 1) & ~(gran - 1) + + def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: + """ + Grow an existing allocation using CUDA VMM, with a configurable policy. + + This implements true growing allocations that preserve the base pointer + by extending the virtual address range and mapping additional physical memory. + + This function uses transactional allocation: if any step fails, the original buffer is not modified and + all steps the function took are rolled back so a new allocation is not created. + + Parameters + ---------- + buf : Buffer + The existing buffer to grow + new_size : int + The new total size for the allocation + config : VirtualMemoryResourceOptions, optional + Configuration for the new physical memory chunks. If None, uses current config. 
+ + Returns + ------- + Buffer + The same buffer with updated size and properties, preserving the original pointer + """ + if config is not None: + self.config = config + + # Build allocation properties for new chunks + prop = driver.CUmemAllocationProp() + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) + + # Query granularity + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + raise_if_driver_error(res) + + # Calculate sizes + additional_size = new_size - buf.size + if additional_size <= 0: + # Same size: only update access policy if needed; avoid zero-sized driver calls + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + raise_if_driver_error(res) + return buf + + aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) + total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) + aligned_prev_size = total_aligned_size - aligned_additional_size + addr_align = self.config.addr_align or gran + + # Try to extend the existing VA range first + res, new_ptr = driver.cuMemAddressReserve( + aligned_additional_size, + addr_align, + int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range + 0 + ) + + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): + # Check for specific errors that are not recoverable with the slow path + if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, 
driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + raise_if_driver_error(res) + res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise_if_driver_error(res2) + # Fallback: couldn't extend contiguously, need full remapping + return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + else: + # Success! We can extend the VA range contiguously + return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) + + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, new_ptr: int) -> Buffer: + """ + Fast path for growing a virtual memory allocation when the new region can be + reserved contiguously after the existing buffer. + + This function creates and maps new physical memory for the additional size, + sets access permissions, and updates the buffer size in place (the pointer + remains unchanged). + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + + Returns: + Buffer: The same buffer object with its size updated to `new_size`. 
+ """ + with Transaction() as trans: + # Create new physical memory for the additional size + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + # Register undo for creation + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + # Register undo for mapping + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Update the buffer size (pointer stays the same) + buf._size = new_size + return buf + + def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + """ + Slow path for growing a virtual memory allocation when the new region cannot be + reserved contiguously after the existing buffer. + + This function reserves a new, larger virtual address (VA) range, remaps the old + physical memory to the beginning of the new VA range, creates and maps new physical + memory for the additional size, sets access permissions, and updates the buffer's + pointer and size. + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. 
+ total_aligned_size (int): The total new size to reserve, aligned to granularity. + addr_align (int): The required address alignment for the new VA range. + + Returns: + Buffer: The buffer object updated with the new pointer and size. + """ + with Transaction() as trans: + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) + raise_if_driver_error(result) + # Register undo for old_handle + trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) + + def _remap_old(): + # Try to remap the old physical memory back to the original VA range + try: + res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + except Exception: + pass + trans.append(_remap_old) + + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + + # Register undo for new physical memory + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Map the new physical memory to the extended 
portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Free the old VA range (aligned previous size) + res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(res2) + + # Invalidate the old buffer so its destructor won't try to free again + buf._clear() + + # Return a new Buffer for the new mapping + return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) + + + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: + """ + Build access descriptors for memory access permissions. + + Returns + ------- + list + List of CUmemAccessDesc objects for setting memory access + """ + descs = [] + + # Owner access + owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags + descs.append(d) + + # Peer device access + peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) + if peer_flags: + for peer_dev in self.config.peers: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + return descs + + + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """ + Allocate a buffer of the given size using CUDA virtual memory. 
+ + Parameters + ---------- + size : int + The size in bytes of the buffer to allocate. + stream : Stream, optional + CUDA stream to associate with the allocation (not currently supported). + + Returns + ------- + Buffer + A Buffer object representing the allocated virtual memory. + + Raises + ------ + NotImplementedError + If a stream is provided or if the location type is not device memory. + CUDAError + If any CUDA driver API call fails during allocation. + + Notes + ----- + This method uses transactional allocation: if any step fails, all resources + allocated so far are automatically cleaned up. The allocation is performed + with the configured granularity, access permissions, and peer access as + specified in the resource's configuration. + """ + if stream is not None: + raise NotImplementedError("Stream is not supported with VirtualMemoryResource") + + config = self.config + # ---- Build allocation properties ---- + prop = driver.CUmemAllocationProp() + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) + + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) + prop.location.id = self.device.device_id if config.location_type == "device" else -1 + prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) + + # ---- Query and apply granularity ---- + # Choose min vs recommended granularity per config + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + raise_if_driver_error(res) + + aligned_size = VirtualMemoryResource._align_up(size, gran) + addr_align = config.addr_align or gran + + # ---- Transactional allocation ---- + with Transaction() as trans: + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + 
raise_if_driver_error(res) + # Register undo for physical memory + trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) + raise_if_driver_error(res) + + # ---- Set access for owner + peers ---- + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + trans.commit() + + # Done — return a Buffer that tracks this VA range + buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) + return buf + + def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None: + """ + Deallocate memory on the device using CUDA VMM APIs. + """ + result, handle = driver.cuMemRetainAllocationHandle(ptr) + raise_if_driver_error(result) + result, = driver.cuMemUnmap(ptr, size) + raise_if_driver_error(result) + result, = driver.cuMemAddressFree(ptr, size) + raise_if_driver_error(result) + result, = driver.cuMemRelease(handle) + raise_if_driver_error(result) + + + @property + def is_device_accessible(self) -> bool: + """ + Indicates whether the allocated memory is accessible from the device. + """ + return self.config.location_type == "device" + + @property + def is_host_accessible(self) -> bool: + """ + Indicates whether the allocated memory is accessible from the host. 
+ """ + return self.config.location_type == "host" + + @property + def device_id(self) -> int: + """ + Get the device ID associated with this memory resource. + + Returns: + int: CUDA device ID. -1 if the memory resource allocates host memory + """ + return self.device.device_id if self.config.location_type == "device" else -1 + + def __repr__(self) -> str: + """ + Return a string representation of the VirtualMemoryResource. + + Returns: + str: A string describing the object + """ + return f"" From c5179bc4afda910e6c6ea4754db095346ef2bc81 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 13:49:23 -0700 Subject: [PATCH 06/30] Weaken dependencies from device to memory module. --- cuda_core/cuda/core/experimental/_device.pyx | 9 ++++++--- cuda_core/cuda/core/experimental/_memory/memory.pyx | 5 +---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 1db2adbf8d..582585c6df 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -10,12 +10,11 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, MemoryResource, _SynchronousMemoryResource from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( @@ -27,7 +26,8 @@ from cuda.core.experimental._utils.cuda_utils import ( ) from cuda.core.experimental._stream cimport default_stream - +if 
TYPE_CHECKING: + from cuda.core.experimental._memory import Buffer, MemoryResource # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... @@ -996,8 +996,10 @@ class Device: ) ) if attr == 1: + from cuda.core.experimental._memory import DeviceMemoryResource device._mr = DeviceMemoryResource(dev_id) else: + from cuda.core.experimental._memory import _SynchronousMemoryResource device._mr = _SynchronousMemoryResource(dev_id) device._has_inited = False @@ -1122,6 +1124,7 @@ class Device: @memory_resource.setter def memory_resource(self, mr): + from cuda.core.experimental._memory import MemoryResource assert_type(mr, MemoryResource) self._mr = mr diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index d25b47fce2..6677d45289 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -34,9 +34,6 @@ if TYPE_CHECKING: import uuid -PyCapsule = TypeVar("PyCapsule") -"""Represent the capsule type.""" - DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" @@ -291,7 +288,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): max_version: tuple[int, int] | None = None, dl_device: tuple[int, int] | None = None, copy: bool | None = None, - ) -> PyCapsule: + ) -> TypeVar("PyCapsule"): # Note: we ignore the stream argument entirely (as if it is -1). # It is the user's responsibility to maintain stream order. if dl_device is not None: From e19274887c7b781bcc98127284ab7ff863415965 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:01:53 -0700 Subject: [PATCH 07/30] Move LegacyPinnedMemoryResource to a submodule. 
--- .../core/experimental/_memory/__init__.py | 1 + .../cuda/core/experimental/_memory/legacy.py | 75 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 62 +-------------- cuda_core/cuda/core/experimental/_stream.pxd | 2 +- cuda_core/cuda/core/experimental/_stream.pyx | 2 +- 5 files changed, 79 insertions(+), 63 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/legacy.py diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 998009f16c..2d1d500b5b 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,4 +1,5 @@ from .memory import * from .memory import _SynchronousMemoryResource +from .legacy import LegacyPinnedMemoryResource from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py new file mode 100644 index 0000000000..060e664924 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Union + +from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from cuda.core.experimental._utils.cuda_utils import ( + driver, + _check_driver_error as raise_if_driver_error, +) + +DevicePointerT = Union[driver.CUdeviceptr, int, None] + + +class LegacyPinnedMemoryResource(MemoryResource): + """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost + APIs. + """ + + # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? + + def allocate(self, size, stream = None) -> Buffer: + """Allocate a buffer of the requested size. 
+ + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + Currently ignored + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on both host and device. + """ + if stream is None: + from cuda.core.experimental._stream import default_stream + stream = default_stream() + err, ptr = driver.cuMemAllocHost(size) + raise_if_driver_error(err) + return Buffer._init(ptr, size, self, stream) + + def deallocate(self, ptr: DevicePointerT, size, stream): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream + The stream on which to perform the deallocation synchronously. + """ + stream.sync() + err, = driver.cuMemFreeHost(ptr) + raise_if_driver_error(err) + + @property + def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """bool: this memory resource provides host-accessible buffers.""" + return True + + @property + def device_id(self) -> int: + """This memory resource is not bound to any GPU.""" + raise RuntimeError("a pinned memory resource is not bound to any GPU") + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 6677d45289..f7f17dc74e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -27,7 +27,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) +from cuda.core.experimental._utils.cuda_utils import (driver, 
Transaction, get_binding_version) if TYPE_CHECKING: from .._device import Device @@ -967,66 +967,6 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) -class LegacyPinnedMemoryResource(MemoryResource): - """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost - APIs. - """ - - # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. - - Parameters - ---------- - size : int - The size of the buffer to allocate, in bytes. - stream : Stream, optional - Currently ignored - - Returns - ------- - Buffer - The allocated buffer object, which is accessible on both host and device. - """ - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAllocHost(size) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self, stream) - - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream): - """Deallocate a buffer previously allocated by this resource. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream - The stream on which to perform the deallocation synchronously. 
- """ - stream.sync() - err, = driver.cuMemFreeHost(ptr) - raise_if_driver_error(err) - - @property - def is_device_accessible(self) -> bool: - """bool: this memory resource provides device-accessible buffers.""" - return True - - @property - def is_host_accessible(self) -> bool: - """bool: this memory resource provides host-accessible buffers.""" - return True - - @property - def device_id(self) -> int: - """This memory resource is not bound to any GPU.""" - raise RuntimeError("a pinned memory resource is not bound to any GPU") - - class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index d992665a14..8f382e5d01 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -24,4 +24,4 @@ cdef class Stream: cdef int _get_device_and_context(self) except?-1 -cdef Stream default_stream() +cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 82406c5598..146a15e573 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -402,7 +402,7 @@ LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM PER_THREAD_DEFAULT_STREAM = C_PER_THREAD_DEFAULT_STREAM -cdef Stream default_stream(): +cpdef Stream default_stream(): """Return the default CUDA :obj:`~_stream.Stream`. The type of default stream returned depends on if the environment From 729c9009a3fcdd3eb24df18f071f92d8fdd09888 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:08:38 -0700 Subject: [PATCH 08/30] Move _SynchronousMemoryResource into a submodule. 
--- .../core/experimental/_memory/__init__.py | 3 +- .../cuda/core/experimental/_memory/legacy.py | 33 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 31 ----------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 2d1d500b5b..65947675e6 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,5 +1,4 @@ from .memory import * -from .memory import _SynchronousMemoryResource -from .legacy import LegacyPinnedMemoryResource +from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py index 060e664924..d8507967c8 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -73,3 +73,36 @@ def device_id(self) -> int: """This memory resource is not bound to any GPU.""" raise RuntimeError("a pinned memory resource is not bound to any GPU") + +class _SynchronousMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, device_id): + self._dev_id = getattr(device_id, 'device_id', device_id) + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + from cuda.core.experimental._stream import default_stream + stream = default_stream() + err, ptr = driver.cuMemAlloc(size) + raise_if_driver_error(err) + return Buffer._init(ptr, size, self) + + def deallocate(self, ptr, size, stream): + stream.sync() + err, = driver.cuMemFree(ptr) + raise_if_driver_error(err) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 
self._dev_id + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index f7f17dc74e..c82444fe3e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -967,34 +967,3 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) -class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) - - def __init__(self, device_id : int | Device): - self._dev_id = getattr(device_id, 'device_id', device_id) - - def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAlloc(size) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) - - def deallocate(self, ptr, size, stream): - stream.sync() - err, = driver.cuMemFree(ptr) - raise_if_driver_error(err) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return self._dev_id - - From 87354552a1911732cdff4ba1ac90824e4c75f875 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:26:57 -0700 Subject: [PATCH 09/30] Partly separates the IPC implementation. 
--- .../core/experimental/_memory/__init__.py | 3 +- .../cuda/core/experimental/_memory/ipc.pxd | 18 ++++ .../cuda/core/experimental/_memory/ipc.pyx | 87 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pxd | 15 +--- .../cuda/core/experimental/_memory/memory.pyx | 76 +--------------- 5 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/ipc.pxd create mode 100644 cuda_core/cuda/core/experimental/_memory/ipc.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 65947675e6..8c6bc13196 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,4 +1,5 @@ -from .memory import * +from .ipc import * from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource +from .memory import * from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd new file mode 100644 index 0000000000..06280c08a4 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class IPCBufferDescriptor: + cdef: + bytes _reserved + size_t _size + + +cdef class IPCAllocationHandle: + cdef: + int _handle + object _uuid + + cpdef close(self) + diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx new file mode 100644 index 0000000000..82d25087e8 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + + +from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union +import multiprocessing +import os + +if TYPE_CHECKING: + import uuid + + +cdef class IPCBufferDescriptor: + """Serializable object describing a buffer that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, reserved: bytes, size: int): + cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) + self._reserved = reserved + self._size = size + return self + + def __reduce__(self): + return self._init, (self._reserved, self._size) + + @property + def size(self): + return self._size + + +cdef class IPCAllocationHandle: + """Shareable handle to an IPC-enabled device memory pool.""" + + def __init__(self, *arg, **kwargs): + raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, handle: int, uuid: uuid.UUID): + cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) + assert handle >= 0 + self._handle = handle + self._uuid = uuid + return self + + cpdef close(self): + """Close the handle.""" + if self._handle >= 0: + try: + os.close(self._handle) + finally: + self._handle = -1 + self._uuid = None + + def __dealloc__(self): + self.close() + + def __int__(self) -> int: + if self._handle < 0: + raise ValueError( + f"Cannot convert IPCAllocationHandle to int: the handle (id={id(self)}) is closed." 
+ ) + return self._handle + + @property + def handle(self) -> int: + return self._handle + + @property + def uuid(self) -> uuid.UUID: + return self._uuid + + +def _reduce_allocation_handle(alloc_handle): + df = multiprocessing.reduction.DupFd(alloc_handle.handle) + return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + +def _reconstruct_allocation_handle(cls, df, uuid): + return cls._init(df.detach(), uuid) + + +multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 858901825b..ce3362fbd0 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -5,6 +5,7 @@ from libc.stdint cimport uintptr_t, intptr_t from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport Stream as _cyStream @@ -30,20 +31,6 @@ cdef class MemoryResource(_cyMemoryResource): cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept -cdef class IPCBufferDescriptor: - cdef: - bytes _reserved - size_t _size - - -cdef class IPCAllocationHandle: - cdef: - int _handle - object _uuid - - cpdef close(self) - - cdef class DeviceMemoryResource(MemoryResource): cdef: int _dev_id diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index c82444fe3e..3a1ee4d300 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -9,6 +9,7 @@ from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor from cuda.core.experimental._stream cimport 
default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -343,6 +344,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr) + cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -400,80 +402,6 @@ cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocat if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -cdef class IPCBufferDescriptor: - """Serializable object describing a buffer that can be shared between processes.""" - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. Please use MemoryResource APIs.") - - @classmethod - def _init(cls, reserved: bytes, size: int): - cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) - self._reserved = reserved - self._size = size - return self - - def __reduce__(self): - return self._init, (self._reserved, self._size) - - @property - def size(self): - return self._size - - -cdef class IPCAllocationHandle: - """Shareable handle to an IPC-enabled device memory pool.""" - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls, handle: int, uuid: uuid.UUID): - cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) - assert handle >= 0 - self._handle = handle - self._uuid = uuid - return self - - cpdef close(self): - """Close the handle.""" - if self._handle >= 0: - try: - os.close(self._handle) - finally: - self._handle = -1 - self._uuid = None - - def __dealloc__(self): - self.close() - - def __int__(self) -> int: - if self._handle < 0: - raise ValueError( - f"Cannot convert IPCAllocationHandle to int: the handle (id={id(self)}) is closed." - ) - return self._handle - - @property - def handle(self) -> int: - return self._handle - - @property - def uuid(self) -> uuid.UUID: - return self._uuid - - -def _reduce_allocation_handle(alloc_handle): - df = multiprocessing.reduction.DupFd(alloc_handle.handle) - return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) - -def _reconstruct_allocation_handle(cls, df, uuid): - return cls._init(df.detach(), uuid) - - -multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. From b2517f683cd9b770dc649fc6f0996b6e847f78c4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 10:04:08 -0700 Subject: [PATCH 10/30] Move IPC registry to ipc module. 
--- cuda_core/cuda/core/experimental/_memory/ipc.pxd | 8 ++++++++ cuda_core/cuda/core/experimental/_memory/ipc.pyx | 3 +++ .../cuda/core/experimental/_memory/memory.pyx | 16 +++++----------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 06280c08a4..e8b67e2455 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,6 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 +# Holds DeviceMemoryResource objects imported by this process. This enables +# buffer serialization, as buffers can reduce to a pair comprising the memory +# resource UUID (the key into this registry) and the serialized buffer +# descriptor. +cdef object registry + + cdef class IPCBufferDescriptor: cdef: bytes _reserved @@ -16,3 +23,4 @@ cdef class IPCAllocationHandle: cpdef close(self) + diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 82d25087e8..579baa6a1a 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -6,11 +6,14 @@ from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union import multiprocessing import os +import weakref if TYPE_CHECKING: import uuid +cdef object registry = weakref.WeakValueDictionary() + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 3a1ee4d300..f20a0d0c2f 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -10,6 +10,7 @@ from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver from 
cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor +from cuda.core.experimental._memory cimport ipc from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -482,13 +483,6 @@ class DeviceMemoryResourceAttributes: del mempool_property -# Holds DeviceMemoryResource objects imported by this process. -# This enables buffer serialization, as buffers can reduce to a pair -# of comprising the memory resource UUID (the key into this registry) -# and the serialized buffer descriptor. -cdef object _ipc_registry = weakref.WeakValueDictionary() - - cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -673,7 +667,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ try: - return _ipc_registry[uuid] + return ipc.registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None @@ -686,11 +680,11 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - existing = _ipc_registry.get(uuid) + existing = ipc.registry.get(uuid) if existing is not None: return existing assert self._uuid is None or self._uuid == uuid - _ipc_registry[uuid] = self + ipc.registry[uuid] = self self._uuid = uuid return self @@ -725,7 +719,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ # Quick exit for registry hits. uuid = getattr(alloc_handle, 'uuid', None) - mr = _ipc_registry.get(uuid) + mr = ipc.registry.get(uuid) if mr is not None: return mr From a61317aa610c54609210ebd910c2fd3e6ab04d81 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 10:32:33 -0700 Subject: [PATCH 11/30] Collect and reorder DeviceMemoryResource properties. 
--- .../cuda/core/experimental/_memory/memory.pyx | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index f20a0d0c2f..32b45af2a2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -688,14 +688,6 @@ cdef class DeviceMemoryResource(MemoryResource): self._uuid = uuid return self - @property - def uuid(self) -> Optional[uuid.UUID]: - """ - A universally unique identifier for this memory resource. Meaningful - only for IPC-enabled memory resources. - """ - return self._uuid - @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. @@ -850,24 +842,16 @@ cdef class DeviceMemoryResource(MemoryResource): """Handle to the underlying memory pool.""" return driver.CUmemoryPool((self._mempool_handle)) - @property - def is_handle_owned(self) -> bool: - """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" - return self._mempool_owned - - @property - def is_mapped(self) -> bool: - """ - Whether this is a mapping of an IPC-enabled memory resource from - another process. If True, allocation is not permitted. - """ - return self._is_mapped - @property def is_device_accessible(self) -> bool: """Return True. This memory resource provides device-accessible buffers.""" return True + @property + def is_handle_owned(self) -> bool: + """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" + return self._mempool_owned + @property def is_host_accessible(self) -> bool: """Return False. 
This memory resource does not provide host-accessible buffers.""" @@ -878,6 +862,22 @@ cdef class DeviceMemoryResource(MemoryResource): """Whether this memory resource has IPC enabled.""" return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + @property + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + return self._is_mapped + + @property + def uuid(self) -> Optional[uuid.UUID]: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. + """ + return self._uuid + def _deep_reduce_device_memory_resource(mr): from .._device import Device From 0e2d1d8f404dce975580651449a0ba770b9f7762 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 11:28:48 -0700 Subject: [PATCH 12/30] Move more IPC implementation out of DeviceMemoryResource. --- .../cuda/core/experimental/_memory/ipc.pxd | 17 ++++ .../cuda/core/experimental/_memory/ipc.pyx | 81 ++++++++++++++++++- .../cuda/core/experimental/_memory/memory.pyx | 73 ++--------------- 3 files changed, 101 insertions(+), 70 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index e8b67e2455..46a7911d39 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -2,6 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.memory cimport DeviceMemoryResource + +import uuid as uuid_module + # Holds DeviceMemoryResource objects imported by this process. This enables # buffer serialization, as buffers can reduce to a pair comprising the memory @@ -9,6 +14,10 @@ # descriptor. cdef object registry +# IPC is currently only supported on Linux. 
On other platforms, the IPC handle +# type is set equal to the no-IPC handle type. +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE + cdef class IPCBufferDescriptor: cdef: @@ -24,3 +33,11 @@ cdef class IPCAllocationHandle: cpdef close(self) +# DeviceMemoryResource IPC Implementation +# ------ +cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self) +cpdef DeviceMemoryResource DMR_from_allocation_handle( + cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle +) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID) +cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 579baa6a1a..0955a43ed8 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -2,18 +2,26 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport intptr_t -from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils cimport ( + HANDLE_RETURN, +) + +from typing import Iterable, Literal, Optional, TypeVar, Union import multiprocessing import os +import platform +import uuid import weakref -if TYPE_CHECKING: - import uuid - cdef object registry = weakref.WeakValueDictionary() +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -88,3 +96,68 @@ def _reconstruct_allocation_handle(cls, df, uuid): multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) +cpdef IPCAllocationHandle 
DMR_get_allocation_handle(DeviceMemoryResource self): + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + + if self._alloc_handle is None: + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, IPC_HANDLE_TYPE, 0) + ) + try: + assert self._uuid is None + self._uuid = uuid.uuid4() + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise + return self._alloc_handle + + +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle): + # Quick exit for registry hits. + uuid = getattr(alloc_handle, 'uuid', None) + mr = registry.get(uuid) + if mr is not None: + return mr + + device_id = getattr(device_id, 'device_id', device_id) + + cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) + self._dev_id = device_id + self._ipc_handle_type = IPC_HANDLE_TYPE + self._mempool_owned = True + self._is_mapped = True + #self._alloc_handle = None # only used for non-imported + + cdef int handle = int(alloc_handle) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( + &(self._mempool_handle), (handle), IPC_HANDLE_TYPE, 0) + ) + if uuid is not None: + registered = self.register(uuid) + assert registered is self + return self + + +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID): + existing = registry.get(uuid) + if existing is not None: + return existing + assert self._uuid is None or self._uuid == uuid + registry[uuid] = self + self._uuid = uuid + return self + +cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID): + try: + return registry[uuid] + except KeyError: + raise RuntimeError(f"Memory resource {uuid} was not 
found") from None diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 32b45af2a2..fbba679095 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -397,12 +397,6 @@ cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): ... -# IPC is currently only supported on Linux. On other platforms, the IPC handle -# type is set equal to the no-IPC handle type. -cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. @@ -607,12 +601,12 @@ cdef class DeviceMemoryResource(MemoryResource): )) else: # Create a new memory pool. - if opts.ipc_enabled and _IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError("IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE properties.location.id = dev_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size @@ -665,11 +659,7 @@ cdef class DeviceMemoryResource(MemoryResource): RuntimeError If no mapped memory resource is found in the registry. 
""" - - try: - return ipc.registry[uuid] - except KeyError: - raise RuntimeError(f"Memory resource {uuid} was not found") from None + return ipc.DMR_from_registry(uuid) def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: """ @@ -680,13 +670,7 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - existing = ipc.registry.get(uuid) - if existing is not None: - return existing - assert self._uuid is None or self._uuid == uuid - ipc.registry[uuid] = self - self._uuid = uuid - return self + return ipc.DMR_register(self, uuid) @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: @@ -709,30 +693,8 @@ cdef class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ - # Quick exit for registry hits. - uuid = getattr(alloc_handle, 'uuid', None) - mr = ipc.registry.get(uuid) - if mr is not None: - return mr - - device_id = getattr(device_id, 'device_id', device_id) - - cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) - self._dev_id = device_id - self._ipc_handle_type = _IPC_HANDLE_TYPE - self._mempool_owned = True - self._is_mapped = True - #self._alloc_handle = None # only used for non-imported - - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._mempool_handle), (handle), _IPC_HANDLE_TYPE, 0) - ) - if uuid is not None: - registered = self.register(uuid) - assert registered is self - return self + return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) + cpdef IPCAllocationHandle get_allocation_handle(self): """Export the memory pool handle to be shared (requires IPC). @@ -744,28 +706,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. 
""" - # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - - if self._alloc_handle is None: - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, _IPC_HANDLE_TYPE, 0) - ) - try: - assert self._uuid is None - import uuid - self._uuid = uuid.uuid4() - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) - except: - os.close(alloc_handle) - raise - return self._alloc_handle + return ipc.DMR_get_allocation_handle(self) cdef Buffer _allocate(self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle From 538762991052eb8fd75f0332bf3a5b9c2e64aa2e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 12:53:07 -0700 Subject: [PATCH 13/30] Minor refactoring. --- .../cuda/core/experimental/_memory/ipc.pxd | 12 +- .../cuda/core/experimental/_memory/ipc.pyx | 22 +- .../cuda/core/experimental/_memory/memory.pxd | 1 - .../cuda/core/experimental/_memory/memory.pyx | 218 +++++++----------- 4 files changed, 111 insertions(+), 142 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 46a7911d39..36f7721ed3 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -5,8 +5,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory.memory cimport DeviceMemoryResource -import uuid as uuid_module - # Holds DeviceMemoryResource objects imported by this process. 
This enables # buffer serialization, as buffers can reduce to a pair comprising the memory @@ -35,9 +33,7 @@ cdef class IPCAllocationHandle: # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self) -cpdef DeviceMemoryResource DMR_from_allocation_handle( - cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle -) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID) -cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID) +cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cpdef DeviceMemoryResource DMR_from_registry(uuid) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 0955a43ed8..428fede67e 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -50,7 +50,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid: uuid.UUID): + def _init(cls, handle: int, uuid): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -89,6 +89,7 @@ def _reduce_allocation_handle(alloc_handle): df = multiprocessing.reduction.DupFd(alloc_handle.handle) return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + def _reconstruct_allocation_handle(cls, df, uuid): return cls._init(df.detach(), uuid) @@ -96,6 +97,19 @@ def _reconstruct_allocation_handle(cls, df, uuid): multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) +def _deep_reduce_device_memory_resource(mr): + from .._device import Device + device = Device(mr.device_id) + alloc_handle = mr.get_allocation_handle() + return mr.from_allocation_handle, (device, alloc_handle) + + +multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) + + +# DeviceMemoryResource IPC Implementation +# ------ + cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): # Note: This is Linux only (int for file descriptor) cdef int alloc_handle @@ -120,7 +134,7 @@ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): return self._alloc_handle -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle): +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. 
uuid = getattr(alloc_handle, 'uuid', None) mr = registry.get(uuid) @@ -147,7 +161,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Devi return self -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID): +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing @@ -156,7 +170,7 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UU self._uuid = uuid return self -cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID): +cpdef DeviceMemoryResource DMR_from_registry(uuid): try: return registry[uuid] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index ce3362fbd0..84018cd30f 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -44,7 +44,6 @@ cdef class DeviceMemoryResource(MemoryResource): object __weakref__ cpdef close(self) - cpdef IPCAllocationHandle get_allocation_handle(self) cdef Buffer _allocate(self, size_t size, _cyStream stream) cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index fbba679095..bc971d1df1 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -22,7 +22,6 @@ import abc import cython from dataclasses import dataclass, field from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union -import multiprocessing import os import platform import weakref @@ -50,36 +49,7 @@ cdef class _cyMemoryResource: raise NotImplementedError -class MemoryResourceAttributes(abc.ABC): - - __slots__ = () - - @property - @abc.abstractmethod - 
def is_device_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the device.""" - ... - - @property - @abc.abstractmethod - def is_host_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the host.""" - ... - - @property - @abc.abstractmethod - def device_id(self) -> int: - """int: The device ordinal for which this memory resource is responsible. - - Raises - ------ - RuntimeError - If the resource is not bound to a specific device. - """ - ... - - -cdef class Buffer(_cyBuffer, MemoryResourceAttributes): +cdef class Buffer(_cyBuffer): """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -122,6 +92,47 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # Must not serialize the parent's stream! return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) + @staticmethod + def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. 
+ + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + # TODO: It is better to take a stream for latter deallocation + return Buffer._init(ptr, size, mr=mr) + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + """Import a buffer that was exported from another process.""" + if not mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if stream is None: + # Note: match this behavior to DeviceMemoryResource.allocate() + stream = default_stream() + cdef cydriver.CUmemPoolPtrExportData data + memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) + + def get_ipc_descriptor(self) -> IPCBufferDescriptor: + """Export a buffer allocated for sharing between processes.""" + if not self._mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) + cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. @@ -150,79 +161,6 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = None self._alloc_stream = None - @property - def handle(self) -> DevicePointerT: - """Return the buffer handle object. - - .. caution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Buffer.handle)``. 
- """ - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 - - @property - def size(self) -> int: - """Return the memory size of this buffer.""" - return self._size - - @property - def memory_resource(self) -> MemoryResource: - """Return the memory resource associated with this buffer.""" - return self._mr - - @property - def is_device_accessible(self) -> bool: - """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def is_host_accessible(self) -> bool: - """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def device_id(self) -> int: - """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - def get_ipc_descriptor(self) -> IPCBufferDescriptor: - """Export a buffer allocated for sharing between processes.""" - if not self._mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - cdef cydriver.CUmemPoolPtrExportData data - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - return IPCBufferDescriptor._init(data_b, self.size) - - @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: - """Import a buffer that was exported from another process.""" - if not 
mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if stream is None: - # Note: match this behavior to DeviceMemoryResource.allocate() - stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -329,24 +267,56 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # Supporting method paired with __buffer__. raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") - @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: - """Create a new :class:`Buffer` object from a pointer. + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + if self._mr is not None: + return self._mr.device_id + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - Allocated buffer handle object - size : int - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer + @property + def handle(self) -> DevicePointerT: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. 
""" - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr) + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return 0 + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_device_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_host_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") -cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + return self._mr + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + return self._size + + +cdef class MemoryResource(_cyMemoryResource): """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties @@ -696,7 +666,7 @@ cdef class DeviceMemoryResource(MemoryResource): return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) - cpdef IPCAllocationHandle get_allocation_handle(self): + def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. 
@@ -820,13 +790,3 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid -def _deep_reduce_device_memory_resource(mr): - from .._device import Device - device = Device(mr.device_id) - alloc_handle = mr.get_allocation_handle() - return mr.from_allocation_handle, (device, alloc_handle) - - -multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) - - From 7fa38cad5161f68baf23988fc0cb42133cda6260 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 14:42:46 -0700 Subject: [PATCH 14/30] Move Buffer IPC implementation. --- .../cuda/core/experimental/_memory/ipc.pxd | 8 +++++- .../cuda/core/experimental/_memory/ipc.pyx | 28 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 21 ++------------ 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 36f7721ed3..d2096a6299 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.memory cimport DeviceMemoryResource +from cuda.core.experimental._memory.memory cimport Buffer, DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. 
This enables @@ -31,6 +31,12 @@ cdef class IPCAllocationHandle: cpdef close(self) +# Buffer IPC Implementation +# ------ +cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) +cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) + + # DeviceMemoryResource IPC Implementation # ------ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 428fede67e..2062a1c06e 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -2,9 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cpython from libc.stdint cimport intptr_t +from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._stream cimport default_stream from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -107,6 +110,31 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) +# Buffer IPC Implementation +# ------ +cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): + if not self._mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) + +cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream): + """Import a buffer that was exported from another process.""" + if not mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if stream is None: + # Note: match this behavior to DeviceMemoryResource.allocate() + stream = default_stream() + cdef 
cydriver.CUmemPoolPtrExportData data + memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) + # DeviceMemoryResource IPC Implementation # ------ diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index bc971d1df1..23c55d881b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -4,7 +4,6 @@ from __future__ import annotations -cimport cpython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy @@ -111,27 +110,11 @@ cdef class Buffer(_cyBuffer): @classmethod def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: """Import a buffer that was exported from another process.""" - if not mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if stream is None: - # Note: match this behavior to DeviceMemoryResource.allocate() - stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) + return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" - if not self._mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - cdef cydriver.CUmemPoolPtrExportData data - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = 
cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - return IPCBufferDescriptor._init(data_b, self.size) + return ipc.Buffer_get_ipc_descriptor(self) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. From f357abdc104159e1e0188e3f9d3ca67e391ce7e5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 14:54:29 -0700 Subject: [PATCH 15/30] Simplify the class hierarchy (remove _cyBuffer and _cyMemoryResource). --- .../cuda/core/experimental/_memory/memory.pxd | 13 +++---------- .../cuda/core/experimental/_memory/memory.pyx | 15 ++------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 84018cd30f..b582d8de5b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -9,25 +9,18 @@ from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport Stream as _cyStream -cdef class _cyBuffer: +cdef class Buffer: cdef: intptr_t _ptr size_t _size - _cyMemoryResource _mr + MemoryResource _mr object _ptr_obj _cyStream _alloc_stream - -cdef class _cyMemoryResource: - cdef Buffer _allocate(self, size_t size, _cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept - - -cdef class Buffer(_cyBuffer): cpdef close(self, stream=*) -cdef class MemoryResource(_cyMemoryResource): +cdef class MemoryResource: cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 23c55d881b..2a7c0eca2d 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -37,18 +37,7 @@ if TYPE_CHECKING: DevicePointerT = 
Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" -cdef class _cyMemoryResource: - """ - Internal only. Responsible for offering fast C method access. - """ - cdef Buffer _allocate(self, size_t size, _cyStream stream): - raise NotImplementedError - - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - raise NotImplementedError - - -cdef class Buffer(_cyBuffer): +cdef class Buffer: """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -299,7 +288,7 @@ cdef class Buffer(_cyBuffer): return self._size -cdef class MemoryResource(_cyMemoryResource): +cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties From 89057f9efb83b49b22fd3196bd4bd46c8d62f8cb Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:02:09 -0700 Subject: [PATCH 16/30] Refactor to shrink Cython interface. 
--- .../cuda/core/experimental/_memory/memory.pxd | 4 - .../cuda/core/experimental/_memory/memory.pyx | 85 ++++++++++--------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index b582d8de5b..c0013b01ac 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -36,7 +36,3 @@ cdef class DeviceMemoryResource(MemoryResource): IPCAllocationHandle _alloc_handle object __weakref__ - cpdef close(self) - cdef Buffer _allocate(self, size_t size, _cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept - cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 2a7c0eca2d..ac58295156 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -567,26 +567,11 @@ cdef class DeviceMemoryResource(MemoryResource): self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid def __dealloc__(self): - self.close() + DMR_close(self) - cpdef close(self): + def close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle == NULL: - return - - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) - finally: - self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL - self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX - self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None + DMR_close(self) def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @@ -637,7 +622,6 @@ cdef class 
DeviceMemoryResource(MemoryResource): """ return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) - def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). @@ -650,19 +634,6 @@ cdef class DeviceMemoryResource(MemoryResource): """ return ipc.DMR_get_allocation_handle(self) - cdef Buffer _allocate(self, size_t size, _cyStream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr - with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._mr = self - buf._alloc_stream = stream - return buf - def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -684,15 +655,9 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return self._allocate(size, <_cyStream>stream) - - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr = ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + return DMR_allocate(self, size, <_cyStream>stream) - cpdef deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -706,7 +671,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - self._deallocate(ptr, size, <_cyStream>stream) + DMR_deallocate(self, ptr, size, <_cyStream>stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -762,3 +727,41 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid +cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._mr = self + buf._alloc_stream = stream + return buf + + +cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _cyStream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + + +cdef DMR_close(DeviceMemoryResource self): + if self._mempool_handle == NULL: + return + + try: + if self._mempool_owned: + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + finally: + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None From 00b60ebe75175907e38132a034c3796e4d7c9499 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:25:53 -0700 Subject: [PATCH 17/30] Simplify Buffer close. 
--- .../cuda/core/experimental/_memory/memory.pxd | 4 +- .../cuda/core/experimental/_memory/memory.pyx | 40 ++++++++++--------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index c0013b01ac..0334bba731 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -17,11 +17,9 @@ cdef class Buffer: object _ptr_obj _cyStream _alloc_stream - cpdef close(self, stream=*) - cdef class MemoryResource: - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + pass cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index ac58295156..00a7feac78 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -105,7 +105,7 @@ cdef class Buffer: """Export a buffer allocated for sharing between processes.""" return ipc.Buffer_get_ipc_descriptor(self) - cpdef close(self, stream: Stream = None): + def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. This buffer is released back to their memory resource @@ -117,21 +117,7 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. 
""" - cdef _cyStream s - if self._ptr and self._mr is not None: - if stream is None: - if self._alloc_stream is not None: - s = self._alloc_stream - else: - # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) - else: - s = <_cyStream>stream - self._mr._deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None + Buffer_close(self, stream) def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -288,6 +274,24 @@ cdef class Buffer: return self._size +cdef Buffer_close(Buffer self, stream): + cdef _cyStream s + if self._ptr and self._mr is not None: + if stream is None: + if self._alloc_stream is not None: + s = self._alloc_stream + else: + # TODO: remove this branch when from_handle takes a stream + s = <_cyStream>(default_stream()) + else: + s = <_cyStream>stream + self._mr.deallocate(self._ptr, self._size, s) + self._ptr = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -297,8 +301,6 @@ cdef class MemoryResource: hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) 
""" - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - self.deallocate(ptr, size, stream) @abc.abstractmethod def allocate(self, size_t size, stream: Stream = None) -> Buffer: @@ -745,7 +747,7 @@ cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _ cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) cdef DMR_close(DeviceMemoryResource self): From ecc9405a7c99003d1088fc5938d24e423a3d7e26 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:43:17 -0700 Subject: [PATCH 18/30] Refactor DeviceMemoryResource.__init__. --- .../cuda/core/experimental/_memory/memory.pyx | 113 ++++++++++-------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 00a7feac78..55b2fd5d0e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -515,58 +515,11 @@ cdef class DeviceMemoryResource(MemoryResource): opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) - cdef cydriver.cuuint64_t current_threshold - cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX - cdef cydriver.CUmemPoolProps properties if opts is None: - # Get the current memory pool. - self._dev_id = dev_id - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._mempool_owned = False - - with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) - - # Set a higher release threshold to improve performance when there are no active allocations. 
- # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) - ) - - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if current_threshold == 0: - HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._mempool_handle, - cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - &max_threshold - )) + DMR_init_current(self, dev_id) else: - # Create a new memory pool. - if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: - raise RuntimeError("IPC is not available on {platform.system()}") - - memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) - properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - properties.location.id = dev_id - properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - properties.maxSize = opts.max_size - properties.win32SecurityAttributes = NULL - properties.usage = 0 - - self._dev_id = dev_id - self._ipc_handle_type = properties.handleTypes - self._mempool_owned = True - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) - # TODO: should we also set the threshold here? 
- - if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + DMR_init_create(self, dev_id, opts) def __dealloc__(self): DMR_close(self) @@ -602,7 +555,9 @@ cdef class DeviceMemoryResource(MemoryResource): return ipc.DMR_register(self, uuid) @classmethod - def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle( + cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle + ) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -729,6 +684,63 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid +cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): + # Get the current memory pool. + cdef cydriver.cuuint64_t current_threshold + cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + + self._dev_id = dev_id + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + self._mempool_owned = False + + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) + + # Set a higher release threshold to improve performance when there are no active allocations. + # By default, the release threshold is 0, which means memory is immediately released back + # to the OS when there are no active suballocations, causing performance issues. 
+ # Check current release threshold + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( + self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + ) + + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if current_threshold == 0: + HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( + self._mempool_handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &max_threshold + )) + + +cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts): + # Create a new memory pool. + cdef cydriver.CUmemPoolProps properties + + if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + raise RuntimeError("IPC is not available on {platform.system()}") + + memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) + properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.location.id = dev_id + properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + properties.maxSize = opts.max_size + properties.win32SecurityAttributes = NULL + properties.usage = 0 + + self._dev_id = dev_id + self._ipc_handle_type = properties.handleTypes + self._mempool_owned = True + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + # TODO: should we also set the threshold here? 
+ + if opts.ipc_enabled: + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + + cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr @@ -767,3 +779,4 @@ cdef DMR_close(DeviceMemoryResource self): self._is_mapped = False self._uuid = None self._alloc_handle = None + From 228936bbd6ed1a5ed2ff9971dae33dc84a2a69e4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 09:40:28 -0700 Subject: [PATCH 19/30] Move Buffer into a separate module. --- .../core/experimental/_memory/__init__.py | 11 +- .../cuda/core/experimental/_memory/buffer.pxd | 22 ++ .../cuda/core/experimental/_memory/buffer.pyx | 331 ++++++++++++++++++ .../_memory/{memory.pxd => dmr.pxd} | 16 +- .../_memory/{memory.pyx => dmr.pyx} | 323 +---------------- .../cuda/core/experimental/_memory/ipc.pxd | 3 +- .../cuda/core/experimental/_memory/legacy.py | 11 +- .../cuda/core/experimental/_memory/vmm.py | 4 +- 8 files changed, 383 insertions(+), 338 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/buffer.pxd create mode 100644 cuda_core/cuda/core/experimental/_memory/buffer.pyx rename cuda_core/cuda/core/experimental/_memory/{memory.pxd => dmr.pxd} (66%) rename cuda_core/cuda/core/experimental/_memory/{memory.pyx => dmr.pyx} (60%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 8c6bc13196..d5dbccee64 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,5 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from .buffer import * +from .dmr import * from .ipc import * -from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource -from .memory import * -from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource +from .legacy import * +from .vmm import * diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pxd b/cuda_core/cuda/core/experimental/_memory/buffer.pxd new file mode 100644 index 0000000000..b6c75f63cc --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pxd @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._stream cimport Stream as _cyStream + + +cdef class Buffer: + cdef: + intptr_t _ptr + size_t _size + MemoryResource _mr + object _ptr_obj + _cyStream _alloc_stream + + +cdef class MemoryResource: + pass + + diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/buffer.pyx new file mode 100644 index 0000000000..6d7c238d7d --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pyx @@ -0,0 +1,331 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory.ipc cimport IPCBufferDescriptor +from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, +) + +import abc +from typing import TypeVar, Union + +from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule +from cuda.core.experimental._stream import Stream +from cuda.core.experimental._utils.cuda_utils import driver + + +DevicePointerT = Union[driver.CUdeviceptr, int, None] +"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" + +cdef class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by DLPack. + """ + def __cinit__(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + def _clear(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + def __init__(self, *args, **kwargs): + raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") + + @classmethod + def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): + cdef Buffer self = Buffer.__new__(cls) + self._ptr = (int(ptr)) + self._ptr_obj = ptr + self._size = size + self._mr = mr + self._alloc_stream = <_cyStream>(stream) if stream is not None else None + return self + + def __dealloc__(self): + self.close(self._alloc_stream) + + def __reduce__(self): + # Must not serialize the parent's stream! + return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) + + @staticmethod + def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + # TODO: It is better to take a stream for latter deallocation + return Buffer._init(ptr, size, mr=mr) + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + """Import a buffer that was exported from another process.""" + return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) + + def get_ipc_descriptor(self) -> IPCBufferDescriptor: + """Export a buffer allocated for sharing between processes.""" + return ipc.Buffer_get_ipc_descriptor(self) + + def close(self, stream: Stream = None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to their memory resource + asynchronously on the given stream. + + Parameters + ---------- + stream : Stream, optional + The stream object to use for asynchronous deallocation. If None, + the behavior depends on the underlying memory resource. 
+ """ + Buffer_close(self, stream) + + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : Stream + Keyword argument specifying the stream for the + asynchronous copy + + """ + if stream is None: + raise ValueError("stream must be provided") + + cdef size_t src_size = self._size + + if dst is None: + if self._mr is None: + raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") + dst = self._mr.allocate(src_size, stream) + + cdef size_t dst_size = dst._size + if dst_size != src_size: + raise ValueError( + f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + ) + err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) + raise_if_driver_error(err) + return dst + + def copy_from(self, src: Buffer, *, stream: Stream): + """Copy from the src buffer to this buffer asynchronously on the given stream. 
+ + Parameters + ---------- + src : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : Stream + Keyword argument specifying the stream for the + asynchronous copy + + """ + if stream is None: + raise ValueError("stream must be provided") + + cdef size_t dst_size = self._size + cdef size_t src_size = src._size + + if src_size != dst_size: + raise ValueError( + f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + ) + err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) + raise_if_driver_error(err) + + def __dlpack__( + self, + *, + stream: int | None = None, + max_version: tuple[int, int] | None = None, + dl_device: tuple[int, int] | None = None, + copy: bool | None = None, + ) -> TypeVar("PyCapsule"): + # Note: we ignore the stream argument entirely (as if it is -1). + # It is the user's responsibility to maintain stream order. + if dl_device is not None: + raise BufferError("Sorry, not supported: dl_device other than None") + if copy is True: + raise BufferError("Sorry, not supported: copy=True") + if max_version is None: + versioned = False + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") + versioned = max_version >= (1, 0) + capsule = make_py_capsule(self, versioned) + return capsule + + def __dlpack_device__(self) -> tuple[int, int]: + cdef bint d = self.is_device_accessible + cdef bint h = self.is_host_accessible + if d and (not h): + return (DLDeviceType.kDLCUDA, self.device_id) + if d and h: + # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks + return (DLDeviceType.kDLCUDAHost, 0) + if (not d) and h: + return (DLDeviceType.kDLCPU, 0) + raise BufferError("buffer is neither device-accessible nor host-accessible") + + def __buffer__(self, flags: int, /) -> memoryview: + # Support for Python-level buffer protocol as per PEP 688. 
+ # This raises a BufferError unless: + # 1. Python is 3.12+ + # 2. This Buffer object is host accessible + raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.") + + def __release_buffer__(self, buffer: memoryview, /): + # Supporting method paired with __buffer__. + raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") + + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + if self._mr is not None: + return self._mr.device_id + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def handle(self) -> DevicePointerT: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. + """ + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return 0 + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_device_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_host_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + return self._mr + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + return self._size + + +cdef Buffer_close(Buffer self, stream): + cdef _cyStream s + if self._ptr and self._mr is not 
None: + if stream is None: + if self._alloc_stream is not None: + s = self._alloc_stream + else: + # TODO: remove this branch when from_handle takes a stream + s = <_cyStream>(default_stream()) + else: + s = <_cyStream>stream + self._mr.deallocate(self._ptr, self._size, s) + self._ptr = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + +cdef class MemoryResource: + """Abstract base class for memory resources that manage allocation and deallocation of buffers. + + Subclasses must implement methods for allocating and deallocation, as well as properties + associated with this memory resource from which all allocated buffers will inherit. (Since + all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would + hold a reference to self, the buffer properties are retrieved simply by looking up the underlying + memory resource's respective property.) + """ + + @abc.abstractmethod + def allocate(self, size_t size, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. + + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + ... + + @abc.abstractmethod + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. 
+ If None, it is up to each memory resource implementation to decide + and document the behavior. + """ + ... + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd similarity index 66% rename from cuda_core/cuda/core/experimental/_memory/memory.pxd rename to cuda_core/cuda/core/experimental/_memory/dmr.pxd index 0334bba731..c3572d34b7 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -2,24 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport uintptr_t, intptr_t from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.buffer cimport MemoryResource from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle -from cuda.core.experimental._stream cimport Stream as _cyStream - - -cdef class Buffer: - cdef: - intptr_t _ptr - size_t _size - MemoryResource _mr - object _ptr_obj - _cyStream _alloc_stream - - -cdef class MemoryResource: - pass cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx similarity index 60% rename from cuda_core/cuda/core/experimental/_memory/memory.pyx rename to cuda_core/cuda/core/experimental/_memory/dmr.pyx index 55b2fd5d0e..bdfb65f04e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -6,10 +6,12 @@ from __future__ import annotations from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t -from libc.string cimport memset, memcpy +from libc.string cimport memset + from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor +from cuda.core.experimental._memory.buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._memory.ipc cimport 
IPCAllocationHandle from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -17,330 +19,21 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) -import abc import cython -from dataclasses import dataclass, field -from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union -import os +from dataclasses import dataclass +from typing import Optional, TYPE_CHECKING import platform import weakref -from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version) +from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device import uuid -DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" - -cdef class Buffer: - """Represent a handle to allocated memory. - - This generic object provides a unified representation for how - different memory resources are to give access to their memory - allocations. - - Support for data interchange mechanisms are provided by DLPack. - """ - def __cinit__(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - def _clear(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - def __init__(self, *args, **kwargs): - raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): - cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr - self._size = size - self._mr = mr - self._alloc_stream = <_cyStream>(stream) if stream is not None else None - return self - - def __dealloc__(self): - self.close(self._alloc_stream) - - def __reduce__(self): - # Must not serialize the parent's stream! - return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) - - @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: - """Create a new :class:`Buffer` object from a pointer. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - Allocated buffer handle object - size : int - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer - """ - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr) - - @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: - """Import a buffer that was exported from another process.""" - return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) - - def get_ipc_descriptor(self) -> IPCBufferDescriptor: - """Export a buffer allocated for sharing between processes.""" - return ipc.Buffer_get_ipc_descriptor(self) - - def close(self, stream: Stream = None): - """Deallocate this buffer asynchronously on the given stream. - - This buffer is released back to their memory resource - asynchronously on the given stream. - - Parameters - ---------- - stream : Stream, optional - The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. 
- """ - Buffer_close(self, stream) - - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: - """Copy from this buffer to the dst buffer asynchronously on the given stream. - - Copies the data from this buffer to the provided dst buffer. - If the dst buffer is not provided, then a new buffer is first - allocated using the associated memory resource before the copy. - - Parameters - ---------- - dst : :obj:`~_memory.Buffer` - Source buffer to copy data from - stream : Stream - Keyword argument specifying the stream for the - asynchronous copy - - """ - if stream is None: - raise ValueError("stream must be provided") - - cdef size_t src_size = self._size - - if dst is None: - if self._mr is None: - raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") - dst = self._mr.allocate(src_size, stream) - - cdef size_t dst_size = dst._size - if dst_size != src_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" - ) - err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) - raise_if_driver_error(err) - return dst - - def copy_from(self, src: Buffer, *, stream: Stream): - """Copy from the src buffer to this buffer asynchronously on the given stream. 
- - Parameters - ---------- - src : :obj:`~_memory.Buffer` - Source buffer to copy data from - stream : Stream - Keyword argument specifying the stream for the - asynchronous copy - - """ - if stream is None: - raise ValueError("stream must be provided") - - cdef size_t dst_size = self._size - cdef size_t src_size = src._size - - if src_size != dst_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" - ) - err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) - raise_if_driver_error(err) - - def __dlpack__( - self, - *, - stream: int | None = None, - max_version: tuple[int, int] | None = None, - dl_device: tuple[int, int] | None = None, - copy: bool | None = None, - ) -> TypeVar("PyCapsule"): - # Note: we ignore the stream argument entirely (as if it is -1). - # It is the user's responsibility to maintain stream order. - if dl_device is not None: - raise BufferError("Sorry, not supported: dl_device other than None") - if copy is True: - raise BufferError("Sorry, not supported: copy=True") - if max_version is None: - versioned = False - else: - if not isinstance(max_version, tuple) or len(max_version) != 2: - raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") - versioned = max_version >= (1, 0) - capsule = make_py_capsule(self, versioned) - return capsule - - def __dlpack_device__(self) -> tuple[int, int]: - cdef bint d = self.is_device_accessible - cdef bint h = self.is_host_accessible - if d and (not h): - return (DLDeviceType.kDLCUDA, self.device_id) - if d and h: - # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks - return (DLDeviceType.kDLCUDAHost, 0) - if (not d) and h: - return (DLDeviceType.kDLCPU, 0) - raise BufferError("buffer is neither device-accessible nor host-accessible") - - def __buffer__(self, flags: int, /) -> memoryview: - # Support for Python-level buffer protocol as per PEP 688. 
- # This raises a BufferError unless: - # 1. Python is 3.12+ - # 2. This Buffer object is host accessible - raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.") - - def __release_buffer__(self, buffer: memoryview, /): - # Supporting method paired with __buffer__. - raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") - - @property - def device_id(self) -> int: - """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def handle(self) -> DevicePointerT: - """Return the buffer handle object. - - .. caution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Buffer.handle)``. - """ - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 - - @property - def is_device_accessible(self) -> bool: - """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def is_host_accessible(self) -> bool: - """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def memory_resource(self) -> MemoryResource: - """Return the memory resource associated with this buffer.""" - return self._mr - - @property - def size(self) -> int: - """Return the memory size of this buffer.""" - return self._size - - -cdef Buffer_close(Buffer self, stream): - cdef _cyStream s - if self._ptr and self._mr is not 
None: - if stream is None: - if self._alloc_stream is not None: - s = self._alloc_stream - else: - # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) - else: - s = <_cyStream>stream - self._mr.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef class MemoryResource: - """Abstract base class for memory resources that manage allocation and deallocation of buffers. - - Subclasses must implement methods for allocating and deallocation, as well as properties - associated with this memory resource from which all allocated buffers will inherit. (Since - all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would - hold a reference to self, the buffer properties are retrieved simply by looking up the underlying - memory resource's respective property.) - """ - - @abc.abstractmethod - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. - - Parameters - ---------- - size : int - The size of the buffer to allocate, in bytes. - stream : Stream, optional - The stream on which to perform the allocation asynchronously. - If None, it is up to each memory resource implementation to decide - and document the behavior. - - Returns - ------- - Buffer - The allocated buffer object, which can be used for device or host operations - depending on the resource's properties. - """ - ... - - @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): - """Deallocate a buffer previously allocated by this resource. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream, optional - The stream on which to perform the deallocation asynchronously. 
- If None, it is up to each memory resource implementation to decide - and document the behavior. - """ - ... - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index d2096a6299..f3444028e4 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.memory cimport Buffer, DeviceMemoryResource +from cuda.core.experimental._memory.buffer cimport Buffer +from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py index d8507967c8..487ddeae5a 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -2,15 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Union +from __future__ import annotations -from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from typing import TYPE_CHECKING + +from cuda.core.experimental._memory.buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( driver, _check_driver_error as raise_if_driver_error, ) -DevicePointerT = Union[driver.CUdeviceptr, int, None] +if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT + +__all__ = ["LegacyPinnedMemoryResource", "_SynchronousMemoryResource"] class LegacyPinnedMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/vmm.py index 60ba8280d8..44c9250de3 100644 --- 
a/cuda_core/cuda/core/experimental/_memory/vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/vmm.py @@ -7,13 +7,15 @@ import platform from cuda.core.experimental._stream import Stream -from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from cuda.core.experimental._memory.buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) from cuda.core.experimental._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, check_or_create_options, ) +__all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"] + VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None] VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] VirtualMemoryGranularityT = Literal["minimum", "recommended"] From 9a86bde8bd1fb4ff1cda5fe20cfbfc196d2cb3c7 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 11:27:27 -0700 Subject: [PATCH 20/30] Refactors DeviceMemoryResource IPC implementation. 
--- .../cuda/core/experimental/_memory/dmr.pxd | 5 ++-- .../cuda/core/experimental/_memory/dmr.pyx | 26 +++++++++++++----- .../cuda/core/experimental/_memory/ipc.pyx | 27 +++++-------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd index c3572d34b7..b34c08f287 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -13,10 +13,11 @@ cdef class DeviceMemoryResource(MemoryResource): int _dev_id cydriver.CUmemoryPool _mempool_handle object _attributes - cydriver.CUmemAllocationHandleType _ipc_handle_type bint _mempool_owned + object __weakref__ + + cydriver.CUmemAllocationHandleType _ipc_handle_type bint _is_mapped object _uuid IPCAllocationHandle _alloc_handle - object __weakref__ diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index bdfb65f04e..5193a2d7a8 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -22,7 +22,9 @@ from cuda.core.experimental._utils.cuda_utils cimport ( import cython from dataclasses import dataclass from typing import Optional, TYPE_CHECKING +import os import platform +import uuid import weakref from cuda.core.experimental._stream import Stream @@ -31,7 +33,6 @@ from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device - import uuid @dataclass @@ -197,8 +198,8 @@ cdef class DeviceMemoryResource(MemoryResource): self._dev_id = cydriver.CU_DEVICE_INVALID self._mempool_handle = NULL self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = 
False self._uuid = None self._alloc_handle = None @@ -383,8 +384,8 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX self._dev_id = dev_id - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE with nogil: HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) @@ -423,15 +424,28 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes properties.usage = 0 self._dev_id = dev_id - self._ipc_handle_type = properties.handleTypes self._mempool_owned = True with nogil: HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) # TODO: should we also set the threshold here? + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + self._ipc_handle_type = ipc.IPC_HANDLE_TYPE + self._is_mapped = False + self._uuid = uuid.uuid4() + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) + ) + try: + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): @@ -467,8 +481,8 @@ cdef DMR_close(DeviceMemoryResource self): self._dev_id = cydriver.CU_DEVICE_INVALID self._mempool_handle = NULL self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False self._uuid = None self._alloc_handle = None diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx 
b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 2062a1c06e..a3a9f03a18 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -139,26 +139,11 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD # ------ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): - # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - - if self._alloc_handle is None: - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, IPC_HANDLE_TYPE, 0) - ) - try: - assert self._uuid is None - self._uuid = uuid.uuid4() - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) - except: - os.close(alloc_handle) - raise + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + assert self._alloc_handle is not None return self._alloc_handle @@ -173,8 +158,8 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = device_id - self._ipc_handle_type = IPC_HANDLE_TYPE self._mempool_owned = True + self._ipc_handle_type = IPC_HANDLE_TYPE self._is_mapped = True #self._alloc_handle = None # only used for non-imported From c7f6cdee48ae8d6f9d238d99501c8149b2a27ef7 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 12:37:28 -0700 Subject: [PATCH 21/30] Removes superfluous _uuid member of DeviceMemoryResource. 
--- cuda_core/cuda/core/experimental/_memory/dmr.pxd | 1 - cuda_core/cuda/core/experimental/_memory/dmr.pyx | 8 +++----- cuda_core/cuda/core/experimental/_memory/ipc.pyx | 11 +++++++---- cuda_core/tests/memory_ipc/test_serialize.py | 1 - 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd index b34c08f287..d8e3a2622a 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -18,6 +18,5 @@ cdef class DeviceMemoryResource(MemoryResource): cydriver.CUmemAllocationHandleType _ipc_handle_type bint _is_mapped - object _uuid IPCAllocationHandle _alloc_handle diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index 5193a2d7a8..8870f7677b 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -201,7 +201,6 @@ cdef class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False - self._uuid = None self._alloc_handle = None def __init__(self, device_id: int | Device, options=None): @@ -375,7 +374,8 @@ cdef class DeviceMemoryResource(MemoryResource): A universally unique identifier for this memory resource. Meaningful only for IPC-enabled memory resources. 
""" - return self._uuid + if self._alloc_handle is not None: + return self._alloc_handle._uuid cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): @@ -436,13 +436,12 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes if opts.ipc_enabled: self._ipc_handle_type = ipc.IPC_HANDLE_TYPE self._is_mapped = False - self._uuid = uuid.uuid4() with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) ) try: - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) except: os.close(alloc_handle) raise @@ -484,6 +483,5 @@ cdef DMR_close(DeviceMemoryResource self): self._mempool_owned = False self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False - self._uuid = None self._alloc_handle = None diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index a3a9f03a18..8aee686ebb 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -67,7 +67,6 @@ cdef class IPCAllocationHandle: os.close(self._handle) finally: self._handle = -1 - self._uuid = None def __dealloc__(self): self.close() @@ -149,6 +148,8 @@ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. 
+ if isinstance(alloc_handle, int): + alloc_handle = IPCAllocationHandle._init(alloc_handle, None) uuid = getattr(alloc_handle, 'uuid', None) mr = registry.get(uuid) if mr is not None: @@ -161,7 +162,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand self._mempool_owned = True self._ipc_handle_type = IPC_HANDLE_TYPE self._is_mapped = True - #self._alloc_handle = None # only used for non-imported + self._alloc_handle = alloc_handle cdef int handle = int(alloc_handle) with nogil: @@ -171,6 +172,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand if uuid is not None: registered = self.register(uuid) assert registered is self + self._alloc_handle.close() return self @@ -178,9 +180,10 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing - assert self._uuid is None or self._uuid == uuid + assert self._alloc_handle is not None + assert self._alloc_handle._uuid is None or self._alloc_handle._uuid == uuid registry[uuid] = self - self._uuid = uuid + self._alloc_handle._uuid = uuid return self cpdef DeviceMemoryResource DMR_from_registry(uuid): diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index ceac50e502..0be8513d58 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -60,7 +60,6 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) mr = DeviceMemoryResource.from_allocation_handle(device, handle) - os.close(handle) # Receive the buffers. buffer1 = conn.recv() # directly From 216b4fb357da5186f0f74ccb1a76580347520ee5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 12:48:22 -0700 Subject: [PATCH 22/30] Adds __all__ lists. 
--- cuda_core/cuda/core/experimental/_memory/buffer.pyx | 2 ++ cuda_core/cuda/core/experimental/_memory/dmr.pyx | 2 ++ cuda_core/cuda/core/experimental/_memory/ipc.pxd | 1 + cuda_core/cuda/core/experimental/_memory/ipc.pyx | 2 ++ 4 files changed, 7 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/buffer.pyx index 6d7c238d7d..4e3896c0fd 100644 --- a/cuda_core/cuda/core/experimental/_memory/buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pyx @@ -21,6 +21,8 @@ from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver +__all__ = ['Buffer', 'MemoryResource'] + DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index 8870f7677b..dbe3a065c6 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -34,6 +34,8 @@ if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device +__all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] + @dataclass cdef class DeviceMemoryResourceOptions: diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index f3444028e4..810b88cad0 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -13,6 +13,7 @@ from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource # descriptor. cdef object registry + # IPC is currently only supported on Linux. On other platforms, the IPC handle # type is set equal to the no-IPC handle type. 
cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 8aee686ebb..e1ff68fde7 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -19,6 +19,8 @@ import platform import uuid import weakref +__all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] + cdef object registry = weakref.WeakValueDictionary() From 6a30a39949cf70208856e8eff5fd1f12cb9e2e0c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 13:07:22 -0700 Subject: [PATCH 23/30] Prepend underscore to submodules, add a test for package contents. --- .../core/experimental/_memory/__init__.py | 10 ++++----- .../_memory/{buffer.pxd => _buffer.pxd} | 0 .../_memory/{buffer.pyx => _buffer.pyx} | 10 ++++----- .../_memory/{dmr.pxd => _dmr.pxd} | 4 ++-- .../_memory/{dmr.pyx => _dmr.pyx} | 22 +++++++++---------- .../_memory/{ipc.pxd => _ipc.pxd} | 4 ++-- .../_memory/{ipc.pyx => _ipc.pyx} | 0 .../_memory/{legacy.py => _legacy.py} | 2 +- .../experimental/_memory/{vmm.py => _vmm.py} | 2 +- cuda_core/tests/test_memory.py | 19 +++++++++++++++- 10 files changed, 45 insertions(+), 28 deletions(-) rename cuda_core/cuda/core/experimental/_memory/{buffer.pxd => _buffer.pxd} (100%) rename cuda_core/cuda/core/experimental/_memory/{buffer.pyx => _buffer.pyx} (97%) rename cuda_core/cuda/core/experimental/_memory/{dmr.pxd => _dmr.pxd} (79%) rename cuda_core/cuda/core/experimental/_memory/{dmr.pyx => _dmr.pyx} (95%) rename cuda_core/cuda/core/experimental/_memory/{ipc.pxd => _ipc.pxd} (91%) rename cuda_core/cuda/core/experimental/_memory/{ipc.pyx => _ipc.pyx} (100%) rename cuda_core/cuda/core/experimental/_memory/{legacy.py => _legacy.py} (97%) rename cuda_core/cuda/core/experimental/_memory/{vmm.py => _vmm.py} (99%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 
d5dbccee64..f9d528ac64 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from .buffer import * -from .dmr import * -from .ipc import * -from .legacy import * -from .vmm import * +from ._buffer import * +from ._dmr import * +from ._ipc import * +from ._legacy import * +from ._vmm import * diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/buffer.pxd rename to cuda_core/cuda/core/experimental/_memory/_buffer.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/buffer.pyx rename to cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 4e3896c0fd..a3a5fa48b8 100644 --- a/cuda_core/cuda/core/experimental/_memory/buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -6,9 +6,9 @@ from __future__ import annotations from libc.stdint cimport intptr_t -from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource -from cuda.core.experimental._memory.ipc cimport IPCBufferDescriptor -from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor +from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -89,11 +89,11 @@ cdef class Buffer: @classmethod def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: """Import a buffer that was exported from another process.""" - 
return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) + return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" - return ipc.Buffer_get_ipc_descriptor(self) + return _ipc.Buffer_get_ipc_descriptor(self) def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd similarity index 79% rename from cuda_core/cuda/core/experimental/_memory/dmr.pxd rename to cuda_core/cuda/core/experimental/_memory/_dmr.pxd index d8e3a2622a..5a64a4cdac 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -4,8 +4,8 @@ from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport MemoryResource -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx similarity index 95% rename from cuda_core/cuda/core/experimental/_memory/dmr.pyx rename to cuda_core/cuda/core/experimental/_memory/_dmr.pyx index dbe3a065c6..daf91bf77d 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -9,9 +9,9 @@ from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport Buffer, MemoryResource -from cuda.core.experimental._memory cimport ipc -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._buffer 
cimport Buffer, MemoryResource +from cuda.core.experimental._memory cimport _ipc +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -236,7 +236,7 @@ cdef class DeviceMemoryResource(MemoryResource): RuntimeError If no mapped memory resource is found in the registry. """ - return ipc.DMR_from_registry(uuid) + return _ipc.DMR_from_registry(uuid) def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: """ @@ -247,7 +247,7 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - return ipc.DMR_register(self, uuid) + return _ipc.DMR_register(self, uuid) @classmethod def from_allocation_handle( @@ -272,7 +272,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ - return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) + return _ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). @@ -284,7 +284,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. """ - return ipc.DMR_get_allocation_handle(self) + return _ipc.DMR_get_allocation_handle(self) def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -413,12 +413,12 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes # Create a new memory pool. 
cdef cydriver.CUmemPoolProps properties - if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and _ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError(f"IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.handleTypes = _ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE properties.location.id = dev_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size @@ -436,11 +436,11 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes cdef int alloc_handle if opts.ipc_enabled: - self._ipc_handle_type = ipc.IPC_HANDLE_TYPE + self._ipc_handle_type = _ipc.IPC_HANDLE_TYPE self._is_mapped = False with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) + &alloc_handle, self._mempool_handle, _ipc.IPC_HANDLE_TYPE, 0) ) try: self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd similarity index 91% rename from cuda_core/cuda/core/experimental/_memory/ipc.pxd rename to cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 810b88cad0..006d835320 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport Buffer -from
cuda.core.experimental._memory.dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/ipc.pyx rename to cuda_core/cuda/core/experimental/_memory/_ipc.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/legacy.py rename to cuda_core/cuda/core/experimental/_memory/_legacy.py index 487ddeae5a..6af415433a 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING -from cuda.core.experimental._memory.buffer import Buffer, MemoryResource +from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( driver, _check_driver_error as raise_if_driver_error, diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py similarity index 99% rename from cuda_core/cuda/core/experimental/_memory/vmm.py rename to cuda_core/cuda/core/experimental/_memory/_vmm.py index 44c9250de3..ebf7895076 100644 --- a/cuda_core/cuda/core/experimental/_memory/vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -7,7 +7,7 @@ import platform from cuda.core.experimental._stream import Stream -from cuda.core.experimental._memory.buffer import Buffer, MemoryResource +from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) from 
cuda.core.experimental._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 261454bf59..8879d2dee1 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -24,7 +24,8 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._dlpack import DLDeviceType +from cuda.core.experimental._memory import IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource @@ -125,6 +126,22 @@ class NullMemoryResource(DummyHostMemoryResource): def is_host_accessible(self) -> bool: return False +def test_package_contents(): + expected = [ + 'Buffer', + 'MemoryResource', + 'DeviceMemoryResource', + 'DeviceMemoryResourceOptions', + 'IPCBufferDescriptor', + 'IPCAllocationHandle', + 'LegacyPinnedMemoryResource', + 'VirtualMemoryResourceOptions', + 'VirtualMemoryResource' + ] + d = {} + exec("from cuda.core.experimental._memory import *", d) + d = {k:v for k,v in d.items() if not k.startswith("__")} + assert sorted(expected) == sorted(d.keys()) def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) From 229ddc6ca56288d959c2438cda00c73754cefaba Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 13:46:29 -0700 Subject: [PATCH 24/30] Refactor IPC data of DMR into IPCData class. 
--- .../cuda/core/experimental/_memory/_dmr.pxd | 11 +-- .../cuda/core/experimental/_memory/_dmr.pyx | 62 ++++++------- .../cuda/core/experimental/_memory/_ipc.pxd | 10 ++- .../cuda/core/experimental/_memory/_ipc.pyx | 86 +++++++++++++------ 4 files changed, 96 insertions(+), 73 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 5a64a4cdac..2d1420dd49 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -5,18 +5,15 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData cdef class DeviceMemoryResource(MemoryResource): cdef: + object __weakref__ int _dev_id - cydriver.CUmemoryPool _mempool_handle + cydriver.CUmemoryPool _handle object _attributes bint _mempool_owned - object __weakref__ - - cydriver.CUmemAllocationHandleType _ipc_handle_type - bint _is_mapped - IPCAllocationHandle _alloc_handle + IPCData _ipc_data diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index daf91bf77d..0601e93ea6 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -11,7 +11,7 @@ from libc.string cimport memset from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -198,12 +198,10 
@@ cdef class DeviceMemoryResource(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL + self._handle = NULL self._attributes = None self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._is_mapped = False - self._alloc_handle = None + self._ipc_data = None def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) @@ -284,7 +282,11 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. """ - return _ipc.DMR_get_allocation_handle(self) + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self.is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + return self._ipc_data._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -303,7 +305,7 @@ cdef class DeviceMemoryResource(MemoryResource): The allocated buffer object, which is accessible on the device that this memory resource was created for. 
""" - if self._is_mapped: + if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() @@ -340,7 +342,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._mempool_handle)) + return driver.CUmemoryPool((self._handle)) @property def is_device_accessible(self) -> bool: @@ -360,7 +362,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def is_ipc_enabled(self) -> bool: """Whether this memory resource has IPC enabled.""" - return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + return self._ipc_data is not None @property def is_mapped(self) -> bool: @@ -368,7 +370,7 @@ cdef class DeviceMemoryResource(MemoryResource): Whether this is a mapping of an IPC-enabled memory resource from another process. If True, allocation is not permitted. """ - return self._is_mapped + return self._ipc_data is not None and self._ipc_data._is_mapped @property def uuid(self) -> Optional[uuid.UUID]: @@ -376,8 +378,7 @@ cdef class DeviceMemoryResource(MemoryResource): A universally unique identifier for this memory resource. Meaningful only for IPC-enabled memory resources. """ - if self._alloc_handle is not None: - return self._alloc_handle._uuid + return getattr(self._ipc_data, 'uuid', None) cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): @@ -387,23 +388,22 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): self._dev_id = dev_id self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) # Set a higher release threshold to improve performance when there are no active allocations. 
# By default, the release threshold is 0, which means memory is immediately released back # to the OS when there are no active suballocations, causing performance issues. # Check current release threshold HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) ) # If threshold is 0 (default), set it to maximum to retain memory in the pool if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._mempool_handle, + self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) @@ -429,31 +429,19 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes self._mempool_owned = True with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) # TODO: should we also set the threshold here? 
- # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - if opts.ipc_enabled: - self._ipc_handle_type = _ipc.IPC_HANDLE_TYPE - self._is_mapped = False - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, _ipc.IPC_HANDLE_TYPE, 0) - ) - try: - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) - except: - os.close(alloc_handle) - raise + alloc_handle = _ipc.DMR_export_mempool(self) + self._ipc_data = IPCData(alloc_handle, mapped=False) cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None @@ -471,19 +459,17 @@ cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _ cdef DMR_close(DeviceMemoryResource self): - if self._mempool_handle == NULL: + if self._handle == NULL: return try: if self._mempool_owned: with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) finally: self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL + self._handle = NULL self._attributes = None self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._is_mapped = False - self._alloc_handle = None + self._ipc_data = None diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 006d835320..c81fcc532a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -19,6 +19,12 @@ cdef object registry cdef 
cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE +cdef class IPCData: + cdef: + bint _is_mapped + IPCAllocationHandle _alloc_handle + + cdef class IPCBufferDescriptor: cdef: bytes _reserved @@ -41,7 +47,7 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDesc # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) cpdef DeviceMemoryResource DMR_from_registry(uuid) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index e1ff68fde7..93fa6d0dcb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -27,6 +27,30 @@ cdef object registry = weakref.WeakValueDictionary() cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + +cdef class IPCData: + """Data members related to sharing memory pools via IPC.""" + def __cinit__(self): + self._is_mapped = False + self._alloc_handle = None + + def __init__(self, IPCAllocationHandle alloc_handle, bint mapped): + self._is_mapped = mapped + self._alloc_handle = alloc_handle + + @property + def alloc_handle(self): + return self._alloc_handle + + @property + def is_mapped(self): + return self._is_mapped + + @property + def 
uuid(self): + return getattr(self._alloc_handle, 'uuid', None) + + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -133,21 +157,12 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) cdef cydriver.CUdeviceptr ptr with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) return Buffer._init(ptr, ipc_buffer.size, mr, stream) # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - assert self._alloc_handle is not None - return self._alloc_handle - - cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. if isinstance(alloc_handle, int): @@ -157,39 +172,58 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand if mr is not None: return mr - device_id = getattr(device_id, 'device_id', device_id) - + # Construct a new DMR. cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) - self._dev_id = device_id + self._dev_id = getattr(device_id, 'device_id', device_id) self._mempool_owned = True - self._ipc_handle_type = IPC_HANDLE_TYPE - self._is_mapped = True - self._alloc_handle = alloc_handle + self._ipc_data = IPCData(alloc_handle, mapped=True) + # Map the mempool into this process. cdef int handle = int(alloc_handle) with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._mempool_handle), (handle), IPC_HANDLE_TYPE, 0) + &(self._handle), (handle), IPC_HANDLE_TYPE, 0) ) + + # Register it. 
if uuid is not None: registered = self.register(uuid) assert registered is self - self._alloc_handle.close() + + # Always close the file handle (caller can dup it, if needed). + alloc_handle.close() + return self +cpdef DeviceMemoryResource DMR_from_registry(uuid): + try: + return registry[uuid] + except KeyError: + raise RuntimeError(f"Memory resource {uuid} was not found") from None + + cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing - assert self._alloc_handle is not None - assert self._alloc_handle._uuid is None or self._alloc_handle._uuid == uuid + assert self.uuid is None or self.uuid == uuid registry[uuid] = self - self._alloc_handle._uuid = uuid + self._ipc_data._alloc_handle._uuid = uuid return self -cpdef DeviceMemoryResource DMR_from_registry(uuid): + +cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): + # Note: This is Linux only (int for file descriptor) + cdef int fd + cdef IPCAllocationHandle alloc_handle + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &fd, self._handle, IPC_HANDLE_TYPE, 0) + ) try: - return registry[uuid] - except KeyError: - raise RuntimeError(f"Memory resource {uuid} was not found") from None + return IPCAllocationHandle._init(fd, uuid.uuid4()) + except: + os.close(fd) + raise + From 0fd3ca9a519e1fb211bf934efaff50a63e605975 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 14:47:48 -0700 Subject: [PATCH 25/30] General clean up. 
--- .../core/experimental/_memory/__init__.py | 11 +- .../core/experimental/_memory/_buffer.pxd | 12 +- .../core/experimental/_memory/_buffer.pyx | 64 +++++--- .../cuda/core/experimental/_memory/_dmr.pxd | 11 +- .../cuda/core/experimental/_memory/_dmr.pyx | 60 ++++--- .../cuda/core/experimental/_memory/_ipc.pxd | 31 ++-- .../cuda/core/experimental/_memory/_ipc.pyx | 61 ++++--- .../cuda/core/experimental/_memory/_legacy.py | 16 +- .../cuda/core/experimental/_memory/_vmm.py | 149 ++++++++++++------ cuda_core/tests/memory_ipc/test_serialize.py | 1 - cuda_core/tests/test_memory.py | 24 +-- 11 files changed, 271 insertions(+), 169 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index f9d528ac64..9781935cdc 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -2,9 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from ._buffer import * -from ._dmr import * -from ._ipc import * -from ._legacy import * -from ._vmm import * - +from ._buffer import * # noqa: F403 +from ._dmr import * # noqa: F403 +from ._ipc import * # noqa: F403 +from ._legacy import * # noqa: F403 +from ._vmm import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index b6c75f63cc..a684c97f98 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -4,19 +4,17 @@ from libc.stdint cimport intptr_t -from cuda.core.experimental._stream cimport Stream as _cyStream +from cuda.core.experimental._stream cimport Stream cdef class Buffer: cdef: - intptr_t _ptr - size_t _size + intptr_t _ptr + size_t _size MemoryResource _mr - object _ptr_obj - _cyStream _alloc_stream + object _ptr_obj + Stream _alloc_stream cdef class MemoryResource: pass - - diff --git 
a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index a3a5fa48b8..69910c9869 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, ) @@ -18,14 +18,16 @@ import abc from typing import TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" +""" +A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting +:attr:`Buffer.handle`. +""" cdef class Buffer: """Represent a handle to allocated memory. @@ -51,16 +53,20 @@ cdef class Buffer: self._alloc_stream = None def __init__(self, *args, **kwargs): - raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") + raise RuntimeError("Buffer objects cannot be instantiated directly. 
" + "Please use MemoryResource APIs.") @classmethod - def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): + def _init( + cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, + stream: Stream | None = None + ): cdef Buffer self = Buffer.__new__(cls) self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size self._mr = mr - self._alloc_stream = <_cyStream>(stream) if stream is not None else None + self._alloc_stream = (stream) if stream is not None else None return self def __dealloc__(self): @@ -71,7 +77,9 @@ cdef class Buffer: return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + def from_handle( + ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None + ) -> Buffer: """Create a new :class:`Buffer` object from a pointer. Parameters @@ -87,7 +95,10 @@ cdef class Buffer: return Buffer._init(ptr, size, mr=mr) @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + def from_ipc_descriptor( + cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, + stream: Stream = None + ) -> Buffer: """Import a buffer that was exported from another process.""" return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) @@ -132,13 +143,14 @@ cdef class Buffer: if dst is None: if self._mr is None: - raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") + raise ValueError("a destination buffer must be provided (this " + "buffer does not have a memory_resource)") dst = self._mr.allocate(src_size, stream) cdef size_t dst_size = dst._size if dst_size != src_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + raise ValueError( "buffer sizes 
mismatch between src and dst (sizes " + f"are: src={src_size}, dst={dst_size})" ) err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) raise_if_driver_error(err) @@ -163,8 +175,8 @@ cdef class Buffer: cdef size_t src_size = src._size if src_size != dst_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + raise ValueError( "buffer sizes mismatch between src and dst (sizes " + f"are: src={src_size}, dst={dst_size})" ) err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) raise_if_driver_error(err) @@ -264,17 +276,19 @@ cdef class Buffer: return self._size +# Buffer Implementation +# --------------------- cdef Buffer_close(Buffer self, stream): - cdef _cyStream s + cdef Stream s if self._ptr and self._mr is not None: if stream is None: if self._alloc_stream is not None: s = self._alloc_stream else: # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) + s = (default_stream()) else: - s = <_cyStream>stream + s = stream self._mr.deallocate(self._ptr, self._size, s) self._ptr = 0 self._mr = None @@ -283,13 +297,15 @@ cdef Buffer_close(Buffer self, stream): cdef class MemoryResource: - """Abstract base class for memory resources that manage allocation and deallocation of buffers. - - Subclasses must implement methods for allocating and deallocation, as well as properties - associated with this memory resource from which all allocated buffers will inherit. (Since - all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would - hold a reference to self, the buffer properties are retrieved simply by looking up the underlying - memory resource's respective property.) + """Abstract base class for memory resources that manage allocation and + deallocation of buffers. 
+ + Subclasses must implement methods for allocating and deallocation, as well + as properties associated with this memory resource from which all allocated + buffers will inherit. (Since all :class:`Buffer` instances allocated and + returned by the :meth:`allocate` method would hold a reference to self, the + buffer properties are retrieved simply by looking up the underlying memory + resource's respective property.) """ @abc.abstractmethod @@ -329,5 +345,3 @@ cdef class MemoryResource: and document the behavior. """ ... - - diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 2d1420dd49..945291b6e4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -10,10 +10,9 @@ from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData cdef class DeviceMemoryResource(MemoryResource): cdef: - object __weakref__ - int _dev_id + int _dev_id cydriver.CUmemoryPool _handle - object _attributes - bint _mempool_owned - IPCData _ipc_data - + bint _mempool_owned + IPCData _ipc_data + object _attributes + object __weakref__ diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index 0601e93ea6..fc541406a0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData -from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -22,12 +22,10 @@ from 
cuda.core.experimental._utils.cuda_utils cimport ( import cython from dataclasses import dataclass from typing import Optional, TYPE_CHECKING -import os -import platform +import platform # no-cython-lint import uuid import weakref -from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: @@ -69,7 +67,9 @@ class DeviceMemoryResourceAttributes: def mempool_property(property_type: type): def decorator(stub): - attr_enum = getattr(driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}") + attr_enum = getattr( + driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" + ) def fget(self) -> property_type: mr = self._mr() @@ -206,7 +206,8 @@ cdef class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) opts = check_or_create_options( - DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True + DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", + keep_none=True ) if opts is None: @@ -218,14 +219,17 @@ cdef class DeviceMemoryResource(MemoryResource): DMR_close(self) def close(self): - """Close the device memory resource and destroy the associated memory pool if owned.""" + """ + Close the device memory resource and destroy the associated memory pool + if owned. + """ DMR_close(self) def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: # no-cython-lint """ Obtain a registered mapped memory resource. 
@@ -236,7 +240,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ return _ipc.DMR_from_registry(uuid) - def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: # no-cython-lint """ Register a mapped memory resource. @@ -309,7 +313,7 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return DMR_allocate(self, size, <_cyStream>stream) + return DMR_allocate(self, size, stream) def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. @@ -325,7 +329,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - DMR_deallocate(self, ptr, size, <_cyStream>stream) + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -381,6 +385,9 @@ cdef class DeviceMemoryResource(MemoryResource): return getattr(self._ipc_data, 'uuid', None) +# DeviceMemoryResource Implementation +# ----------------------------------- + cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): # Get the current memory pool. cdef cydriver.cuuint64_t current_threshold @@ -392,15 +399,19 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): with nogil: HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) - # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. 
- # Check current release threshold - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + # Set a higher release threshold to improve performance when there are + # no active allocations. By default, the release threshold is 0, which + # means memory is immediately released back to the OS when there are no + # active suballocations, causing performance issues. + HANDLE_RETURN( + cydriver.cuMemPoolGetAttribute( + self._handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¤t_threshold + ) ) - # If threshold is 0 (default), set it to maximum to retain memory in the pool + # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( self._handle, @@ -409,11 +420,13 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): )) -cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts): +cdef void DMR_init_create( + DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts +): # Create a new memory pool. 
cdef cydriver.CUmemPoolProps properties - if opts.ipc_enabled and _ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and not _ipc.is_supported(): raise RuntimeError("IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) @@ -437,7 +450,7 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes self._ipc_data = IPCData(alloc_handle, mapped=False) -cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): +cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -451,7 +464,9 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream strea return buf -cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _cyStream stream) noexcept: +cdef void DMR_deallocate( + DeviceMemoryResource self, intptr_t ptr, size_t size, Stream stream +) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: @@ -472,4 +487,3 @@ cdef DMR_close(DeviceMemoryResource self): self._attributes = None self._mempool_owned = False self._ipc_data = None - diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index c81fcc532a..6480f32619 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -14,40 +14,45 @@ from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource cdef object registry -# IPC is currently only supported on Linux. On other platforms, the IPC handle -# type is set equal to the no-IPC handle type. +# The IPC handle type for this platform. IPC is currently only supported on +# Linux. On other platforms, the IPC handle type is set equal to the no-IPC +# handle type. 
cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE +# Whether IPC is supported on this platform. +cdef is_supported() + + cdef class IPCData: cdef: - bint _is_mapped IPCAllocationHandle _alloc_handle + bint _is_mapped cdef class IPCBufferDescriptor: cdef: - bytes _reserved + bytes _payload size_t _size cdef class IPCAllocationHandle: cdef: - int _handle + int _handle object _uuid cpdef close(self) # Buffer IPC Implementation -# ------ -cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) -cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) +# ------------------------- +cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) +cdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) # DeviceMemoryResource IPC Implementation -# ------ -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) -cpdef DeviceMemoryResource DMR_from_registry(uuid) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) -cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) +# --------------------------------------- +cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) +cdef DeviceMemoryResource DMR_from_registry(uuid) +cdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 93fa6d0dcb..4856ce5546 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t, uintptr_t +from libc.stdint cimport intptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -12,7 +12,6 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) 
-from typing import Iterable, Literal, Optional, TypeVar, Union import multiprocessing import os import platform @@ -24,19 +23,24 @@ __all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] cdef object registry = weakref.WeakValueDictionary() -cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = \ + cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else \ + cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + +cdef is_supported(): + return IPC_HANDLE_TYPE != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE cdef class IPCData: """Data members related to sharing memory pools via IPC.""" def __cinit__(self): - self._is_mapped = False self._alloc_handle = None + self._is_mapped = False def __init__(self, IPCAllocationHandle alloc_handle, bint mapped): - self._is_mapped = mapped self._alloc_handle = alloc_handle + self._is_mapped = mapped @property def alloc_handle(self): @@ -60,12 +64,12 @@ cdef class IPCBufferDescriptor: @classmethod def _init(cls, reserved: bytes, size: int): cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) - self._reserved = reserved + self._payload = reserved self._size = size return self def __reduce__(self): - return self._init, (self._reserved, self._size) + return self._init, (self._payload, self._size) @property def size(self): @@ -79,7 +83,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid): + def _init(cls, handle: int, uuid): # no-cython-lint cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -118,7 +122,7 @@ def _reduce_allocation_handle(alloc_handle): return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) -def _reconstruct_allocation_handle(cls, df, uuid): +def _reconstruct_allocation_handle(cls, df, uuid): # no-cython-lint return cls._init(df.detach(), uuid) @@ -136,17 +140,23 @@ multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_mem # Buffer IPC Implementation -# ------ -cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): +# ------------------------- +cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") cdef cydriver.CUmemPoolPtrExportData data with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + HANDLE_RETURN( + cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + ) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize( + (data.reserved), sizeof(data.reserved) + ) return IPCBufferDescriptor._init(data_b, self.size) -cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream): +cdef Buffer Buffer_from_ipc_descriptor( + cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream +): """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -154,20 +164,25 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() cdef cydriver.CUmemPoolPtrExportData data - 
memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + memcpy( + data.reserved, + (ipc_buffer._payload), + sizeof(data.reserved) + ) cdef cydriver.CUdeviceptr ptr with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) return Buffer._init(ptr, ipc_buffer.size, mr, stream) + # DeviceMemoryResource IPC Implementation -# ------ +# --------------------------------------- -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): +cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. if isinstance(alloc_handle, int): alloc_handle = IPCAllocationHandle._init(alloc_handle, None) - uuid = getattr(alloc_handle, 'uuid', None) + uuid = getattr(alloc_handle, 'uuid', None) # no-cython-lint mr = registry.get(uuid) if mr is not None: return mr @@ -196,14 +211,14 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand return self -cpdef DeviceMemoryResource DMR_from_registry(uuid): +cdef DeviceMemoryResource DMR_from_registry(uuid): try: return registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): +cdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing @@ -213,10 +228,9 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): return self -cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): +cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): # Note: This is Linux only (int for file descriptor) cdef int fd - cdef IPCAllocationHandle alloc_handle with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( &fd, self._handle, IPC_HANDLE_TYPE, 0) @@ -226,4 +240,3 @@ cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): except: 
os.close(fd) raise - diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py index 6af415433a..523835a79d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -8,9 +8,11 @@ from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( - driver, _check_driver_error as raise_if_driver_error, ) +from cuda.core.experimental._utils.cuda_utils import ( + driver, +) if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT @@ -25,7 +27,7 @@ class LegacyPinnedMemoryResource(MemoryResource): # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - def allocate(self, size, stream = None) -> Buffer: + def allocate(self, size, stream=None) -> Buffer: """Allocate a buffer of the requested size. Parameters @@ -42,6 +44,7 @@ def allocate(self, size, stream = None) -> Buffer: """ if stream is None: from cuda.core.experimental._stream import default_stream + stream = default_stream() err, ptr = driver.cuMemAllocHost(size) raise_if_driver_error(err) @@ -60,7 +63,7 @@ def deallocate(self, ptr: DevicePointerT, size, stream): The stream on which to perform the deallocation synchronously. 
""" stream.sync() - err, = driver.cuMemFreeHost(ptr) + (err,) = driver.cuMemFreeHost(ptr) raise_if_driver_error(err) @property @@ -83,11 +86,12 @@ class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) def __init__(self, device_id): - self._dev_id = getattr(device_id, 'device_id', device_id) + self._dev_id = getattr(device_id, "device_id", device_id) def allocate(self, size, stream=None) -> Buffer: if stream is None: from cuda.core.experimental._stream import default_stream + stream = default_stream() err, ptr = driver.cuMemAlloc(size) raise_if_driver_error(err) @@ -95,7 +99,7 @@ def allocate(self, size, stream=None) -> Buffer: def deallocate(self, ptr, size, stream): stream.sync() - err, = driver.cuMemFree(ptr) + (err,) = driver.cuMemFree(ptr) raise_if_driver_error(err) @property @@ -109,5 +113,3 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: return self._dev_id - - diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py index ebf7895076..3f55614839 100644 --- a/cuda_core/cuda/core/experimental/_memory/_vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -2,16 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 +import platform from dataclasses import dataclass, field from typing import Iterable, Literal, Optional, Union -import platform -from cuda.core.experimental._stream import Stream from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) +from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import ( - _check_driver_error as raise_if_driver_error, + Transaction, check_or_create_options, + driver, + get_binding_version, +) +from cuda.core.experimental._utils.cuda_utils import ( + _check_driver_error as raise_if_driver_error, ) __all__ = ["VirtualMemoryResourceOptions", 
"VirtualMemoryResource"] @@ -54,6 +58,7 @@ class VirtualMemoryResourceOptions: peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` Access flags for peers. """ + # Human-friendly strings; normalized in __post_init__ allocation_type: VirtualMemoryAllocationTypeT = "pinned" location_type: VirtualMemoryLocationTypeT = "device" @@ -69,11 +74,25 @@ class VirtualMemoryResourceOptions: _a = driver.CUmemAccess_flags _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} _h = driver.CUmemAllocationHandleType - _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} + _handle_types = { + None: _h.CU_MEM_HANDLE_TYPE_NONE, + "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, + "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, + "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC, + } _g = driver.CUmemAllocationGranularity_flags - _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + _granularity = { + "recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + } _l = driver.CUmemLocationType - _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + _location_type = { + "device": _l.CU_MEM_LOCATION_TYPE_DEVICE, + "host": _l.CU_MEM_LOCATION_TYPE_HOST, + "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, + "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + } # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not _a = driver.CUmemAllocationType _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} @@ -128,6 +147,7 @@ 
class VirtualMemoryResource(MemoryResource): config : VirtualMemoryResourceOptions A configuration object for the VirtualMemoryResource """ + def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device self.config = check_or_create_options( @@ -139,9 +159,12 @@ def __init__(self, device, config: VirtualMemoryResourceOptions = None): raise NotImplementedError("VirtualMemoryResource is not supported on Windows") # Validate RDMA support if requested - if self.config.gpu_direct_rdma and self.device is not None: - if not self.device.properties.gpu_direct_rdma_supported: - raise RuntimeError("GPU Direct RDMA is not supported on this device") + if ( + self.config.gpu_direct_rdma + and self.device is not None + and not self.device.properties.gpu_direct_rdma_supported + ): + raise RuntimeError("GPU Direct RDMA is not supported on this device") @staticmethod def _align_up(size: int, gran: int) -> int: @@ -196,7 +219,7 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes # Same size: only update access policy if needed; avoid zero-sized driver calls descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) raise_if_driver_error(res) return buf @@ -210,23 +233,31 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes aligned_additional_size, addr_align, int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range - 0 + 0, ) if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Check for specific errors that are not recoverable with the slow path - if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + if res in ( + 
driver.CUresult.CUDA_ERROR_INVALID_VALUE, + driver.CUresult.CUDA_ERROR_NOT_PERMITTED, + driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, + driver.CUresult.CUDA_ERROR_NOT_SUPPORTED, + ): raise_if_driver_error(res) - res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + (res2,) = driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping - return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + return self._grow_allocation_slow_path( + buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align + ) else: # Success! We can extend the VA range contiguously return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, new_ptr: int) -> Buffer: + def _grow_allocation_fast_path( + self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int + ) -> Buffer: """ Fast path for growing a virtual memory allocation when the new region can be reserved contiguously after the existing buffer. @@ -236,33 +267,47 @@ def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CU remains unchanged). Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. - prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + buf (Buffer): + The buffer to grow. + + new_size (int): + The new total size in bytes. + + prop (driver.CUmemAllocationProp): + Allocation properties for the new memory. 
+ + aligned_additional_size (int): + The size of the new region to allocate, aligned to granularity. + + new_ptr (int): + The address of the newly reserved contiguous VA region (should + be at the end of the current buffer). Returns: Buffer: The same buffer object with its size updated to `new_size`. """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + trans.append( + lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0]) + ) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + (res,) = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + trans.append( + lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0]) + ) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) raise_if_driver_error(res) # All succeeded, cancel undo actions @@ -272,8 +317,15 @@ def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CU buf._size = new_size return buf - def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + 
def _grow_allocation_slow_path( + self, + buf: Buffer, + new_size: int, + prop: driver.CUmemAllocationProp, + aligned_additional_size: int, + total_aligned_size: int, + addr_align: int, + ) -> Buffer: """ Slow path for growing a virtual memory allocation when the new region cannot be reserved contiguously after the existing buffer. @@ -299,7 +351,9 @@ def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CU res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + trans.append( + lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0]) + ) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) @@ -309,20 +363,22 @@ def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CU # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + (result,) = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) raise_if_driver_error(result) def _remap_old(): # Try to remap the old physical memory back to the original VA range try: - res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + (res,) = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) raise_if_driver_error(res) - except Exception: + except Exception: # noqa: S110 + # TODO: consider logging this exception pass + trans.append(_remap_old) # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + (res,) = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) raise_if_driver_error(res) # Register undo for mapping @@ 
-336,23 +392,27 @@ def _remap_old(): trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + (res,) = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) + trans.append( + lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error( + driver.cuMemUnmap(base + offs, s)[0] + ) + ) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) raise_if_driver_error(res) # All succeeded, cancel undo actions trans.commit() # Free the old VA range (aligned previous size) - res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + (res2,) = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) raise_if_driver_error(res2) # Invalidate the old buffer so its destructor won't try to free again @@ -361,7 +421,6 @@ def _remap_old(): # Return a new Buffer for the new mapping return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) - def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ Build access descriptors for memory access permissions. @@ -394,7 +453,6 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: return descs - def allocate(self, size: int, stream: Stream = None) -> Buffer: """ Allocate a buffer of the given size using CUDA virtual memory. 
@@ -463,14 +521,14 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer: trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + (res,) = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) raise_if_driver_error(res) # ---- Set access for owner + peers ---- descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) raise_if_driver_error(res) trans.commit() @@ -479,20 +537,19 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer: buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) return buf - def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None: + def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None: """ Deallocate memory on the device using CUDA VMM APIs. 
""" result, handle = driver.cuMemRetainAllocationHandle(ptr) raise_if_driver_error(result) - result, = driver.cuMemUnmap(ptr, size) + (result,) = driver.cuMemUnmap(ptr, size) raise_if_driver_error(result) - result, = driver.cuMemAddressFree(ptr, size) + (result,) = driver.cuMemAddressFree(ptr, size) raise_if_driver_error(result) - result, = driver.cuMemRelease(handle) + (result,) = driver.cuMemRelease(handle) raise_if_driver_error(result) - @property def is_device_accessible(self) -> bool: """ diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 0be8513d58..df6865c915 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -3,7 +3,6 @@ import multiprocessing as mp import multiprocessing.reduction -import os from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8879d2dee1..a261ec7a3d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -126,23 +126,25 @@ class NullMemoryResource(DummyHostMemoryResource): def is_host_accessible(self) -> bool: return False + def test_package_contents(): expected = [ - 'Buffer', - 'MemoryResource', - 'DeviceMemoryResource', - 'DeviceMemoryResourceOptions', - 'IPCBufferDescriptor', - 'IPCAllocationHandle', - 'LegacyPinnedMemoryResource', - 'VirtualMemoryResourceOptions', - 'VirtualMemoryResource' + "Buffer", + "MemoryResource", + "DeviceMemoryResource", + "DeviceMemoryResourceOptions", + "IPCBufferDescriptor", + "IPCAllocationHandle", + "LegacyPinnedMemoryResource", + "VirtualMemoryResourceOptions", + "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) - d = {k:v for k,v in d.items() if not k.startswith("__")} + exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + d = {k: v for k, v in d.items() if 
not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) + def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) assert buffer.handle != 0 From 44f7587af494c8ab0261396c8c71c51d2465625b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 15:53:37 -0700 Subject: [PATCH 26/30] Touch-ups --- .../cuda/core/experimental/_memory/_buffer.pyx | 6 +----- .../cuda/core/experimental/_memory/_dmr.pxd | 3 +-- .../cuda/core/experimental/_memory/_dmr.pyx | 17 ++++++++++++----- .../cuda/core/experimental/_memory/_vmm.py | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 69910c9869..94aa2ee871 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -39,11 +39,7 @@ cdef class Buffer: Support for data interchange mechanisms are provided by DLPack. 
""" def __cinit__(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None + self._clear() def _clear(self): self._ptr = 0 diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 945291b6e4..cdd00de067 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -3,9 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver - from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData +from cuda.core.experimental._memory._ipc cimport IPCData cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index fc541406a0..e2de36088e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -14,7 +14,6 @@ from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, HANDLE_RETURN, ) @@ -75,9 +74,8 @@ class DeviceMemoryResourceAttributes: mr = self._mr() if mr is None: raise RuntimeError("DeviceMemoryResource is expired") - # TODO: this implementation does not allow lowering to Cython + nogil - err, value = driver.cuMemPoolGetAttribute(mr.handle, attr_enum) - raise_if_driver_error(err) + value = DMRA_getattribute( mr.handle, + attr_enum) return property_type(value) return property(fget=fget, doc=stub.__doc__) return decorator @@ -117,6 +115,15 @@ class DeviceMemoryResourceAttributes: del mempool_property +cdef int DMRA_getattribute( + cydriver.CUmemoryPool 
pool_handle, cydriver.CUmemPool_attribute attr_enum +): + cdef int value + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, &value)) + return value + + cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -199,9 +206,9 @@ cdef class DeviceMemoryResource(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID self._handle = NULL - self._attributes = None self._mempool_owned = False self._ipc_data = None + self._attributes = None def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py index 3f55614839..ab742e4273 100644 --- a/cuda_core/cuda/core/experimental/_memory/_vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -4,7 +4,7 @@ import platform from dataclasses import dataclass, field -from typing import Iterable, Literal, Optional, Union +from typing import Iterable, Literal, Union from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._stream import Stream @@ -65,8 +65,8 @@ class VirtualMemoryResourceOptions: handle_type: VirtualMemoryHandleTypeT = "posix_fd" granularity: VirtualMemoryGranularityT = "recommended" gpu_direct_rdma: bool = False - addr_hint: Optional[int] = 0 - addr_align: Optional[int] = None + addr_hint: int | None = 0 + addr_align: int | None = None peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" From 0fac80086a75b3e17ba39ff7f1217db2b78be000 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 3 Nov 2025 17:24:53 -0800 Subject: [PATCH 27/30] Cythonize DeviceMemoryResourceAttributes. 
--- .../cuda/core/experimental/_memory/_dmr.pyx | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index e2de36088e..b64b6a6842 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -53,66 +53,67 @@ cdef class DeviceMemoryResourceOptions: max_size : cython.size_t = 0 -# TODO: cythonize this? -class DeviceMemoryResourceAttributes: +cdef class DeviceMemoryResourceAttributes: + cdef: + object _mr_weakref + def __init__(self, *args, **kwargs): raise RuntimeError("DeviceMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @classmethod - def _init(cls, mr : DeviceMemoryReference): - self = DeviceMemoryResourceAttributes.__new__(cls) - self._mr = mr + def _init(cls, mr): + cdef DeviceMemoryResourceAttributes self = DeviceMemoryResourceAttributes.__new__(cls) + self._mr_weakref = mr return self - def mempool_property(property_type: type): - def decorator(stub): - attr_enum = getattr( - driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" - ) - - def fget(self) -> property_type: - mr = self._mr() - if mr is None: - raise RuntimeError("DeviceMemoryResource is expired") - value = DMRA_getattribute( mr.handle, - attr_enum) - return property_type(value) - return property(fget=fget, doc=stub.__doc__) - return decorator - - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_follow_event_dependencies(self): """Allow memory to be reused when there are event dependencies between streams.""" - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_allow_opportunistic(self): """Allow reuse of completed frees without dependencies.""" - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_allow_internal_dependencies(self): """Allow insertion of new stream dependencies for memory reuse.""" 
- @mempool_property(int) + @DMRA_mempool_attribute(int) def release_threshold(self): """Amount of reserved memory to hold before OS release.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def reserved_mem_current(self): """Current amount of backing memory allocated.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def reserved_mem_high(self): """High watermark of backing memory allocated.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def used_mem_current(self): """Current amount of memory in use.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def used_mem_high(self): """High watermark of memory in use.""" - del mempool_property + +cdef DMRA_mempool_attribute(property_type: type): + def decorator(stub): + attr_enum = getattr( + driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" + ) + + def fget(DeviceMemoryResourceAttributes self) -> property_type: + cdef DeviceMemoryResource mr = self._mr_weakref() + if mr is None: + raise RuntimeError("DeviceMemoryResource is expired") + value = DMRA_getattribute( mr.handle, + attr_enum) + return property_type(value) + return property(fget=fget, doc=stub.__doc__) + return decorator cdef int DMRA_getattribute( From 0d5f08b9be79f69bb5ec3ba404903287dd58f04b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 4 Nov 2025 09:30:27 -0800 Subject: [PATCH 28/30] Restore previous behavior for DMR.from_allocation_handle when passed a file descriptor (caller closes the fd). 
--- cuda_core/cuda/core/experimental/_memory/_dmr.pyx | 4 +++- cuda_core/cuda/core/experimental/_memory/_ipc.pyx | 14 +++++++++++--- cuda_core/tests/memory_ipc/test_serialize.py | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index b64b6a6842..db631bfb3d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -276,7 +276,9 @@ cdef class DeviceMemoryResource(MemoryResource): resource is created. alloc_handle : int | IPCAllocationHandle - The shareable handle of the device memory resource to import. + The shareable handle of the device memory resource to import. If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. Returns ------- diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 4856ce5546..706119c3fb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -180,13 +180,21 @@ cdef Buffer Buffer_from_ipc_descriptor( cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. - if isinstance(alloc_handle, int): - alloc_handle = IPCAllocationHandle._init(alloc_handle, None) uuid = getattr(alloc_handle, 'uuid', None) # no-cython-lint mr = registry.get(uuid) if mr is not None: return mr + # Ensure we have an allocation handle. Duplicate the file descriptor, if + # necessary. + if isinstance(alloc_handle, int): + fd = os.dup(alloc_handle) + try: + alloc_handle = IPCAllocationHandle._init(fd, None) + except: + os.close(fd) + raise + # Construct a new DMR. 
cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = getattr(device_id, 'device_id', device_id) @@ -205,7 +213,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl registered = self.register(uuid) assert registered is self - # Always close the file handle (caller can dup it, if needed). + # Always close the file handle. alloc_handle.close() return self diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index df6865c915..ceac50e502 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -3,6 +3,7 @@ import multiprocessing as mp import multiprocessing.reduction +import os from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen @@ -59,6 +60,7 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) mr = DeviceMemoryResource.from_allocation_handle(device, handle) + os.close(handle) # Receive the buffers. buffer1 = conn.recv() # directly From 7315e285dce18cca74634eb60942794c8eb9b01d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 7 Nov 2025 12:42:35 -0800 Subject: [PATCH 29/30] Rename _mr to _memory_resource. Change pointer types from intptr_t to uintptr_t. 
--- .../core/experimental/_memory/_buffer.pxd | 6 ++-- .../core/experimental/_memory/_buffer.pyx | 32 +++++++++---------- .../cuda/core/experimental/_memory/_dmr.pyx | 10 +++--- .../cuda/core/experimental/_memory/_ipc.pyx | 8 ++--- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index a684c97f98..12da84b2bd 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -2,16 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from cuda.core.experimental._stream cimport Stream cdef class Buffer: cdef: - intptr_t _ptr + uintptr_t _ptr size_t _size - MemoryResource _mr + MemoryResource _memory_resource object _ptr_obj Stream _alloc_stream diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 94aa2ee871..61d4f191d0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor @@ -44,7 +44,7 @@ cdef class Buffer: def _clear(self): self._ptr = 0 self._size = 0 - self._mr = None + self._memory_resource = None self._ptr_obj = None self._alloc_stream = None @@ -58,10 +58,10 @@ cdef class Buffer: stream: Stream | None = None ): cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) + self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size - self._mr = mr + self._memory_resource = mr self._alloc_stream = (stream) if stream is not None else None return self @@ -138,10 +138,10 @@ cdef class Buffer: cdef size_t src_size = 
self._size if dst is None: - if self._mr is None: + if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._mr.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, stream) cdef size_t dst_size = dst._size if dst_size != src_size: @@ -226,8 +226,8 @@ cdef class Buffer: @property def device_id(self) -> int: """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id + if self._memory_resource is not None: + return self._memory_resource.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property @@ -250,21 +250,21 @@ cdef class Buffer: @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible + if self._memory_resource is not None: + return self._memory_resource.is_device_accessible raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible + if self._memory_resource is not None: + return self._memory_resource.is_host_accessible raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property def memory_resource(self) -> MemoryResource: """Return the memory resource associated with this buffer.""" - return self._mr + return self._memory_resource @property def size(self) -> int: @@ -276,7 +276,7 @@ cdef class Buffer: # --------------------- cdef Buffer_close(Buffer self, stream): cdef Stream s - if self._ptr and self._mr is not None: + if self._ptr and self._memory_resource is not None: if stream is None: if 
self._alloc_stream is not None: s = self._alloc_stream @@ -285,9 +285,9 @@ cdef Buffer_close(Buffer self, stream): s = (default_stream()) else: s = stream - self._mr.deallocate(self._ptr, self._size, s) + self._memory_resource.deallocate(self._ptr, self._size, s) self._ptr = 0 - self._mr = None + self._memory_resource = None self._ptr_obj = None self._alloc_stream = None diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index db631bfb3d..47b6fd114e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -5,7 +5,7 @@ from __future__ import annotations from libc.limits cimport ULLONG_MAX -from libc.stdint cimport uintptr_t, intptr_t +from libc.stdint cimport uintptr_t from libc.string cimport memset from cuda.bindings cimport cydriver @@ -339,7 +339,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - DMR_deallocate(self, ptr, size, stream) + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -466,16 +466,16 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): with nogil: HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) + buf._ptr = (devptr) buf._ptr_obj = None buf._size = size - buf._mr = self + buf._memory_resource = self buf._alloc_stream = stream return buf cdef void DMR_deallocate( - DeviceMemoryResource self, intptr_t ptr, size_t size, Stream stream + DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 706119c3fb..5aa13af8fb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -142,7 +142,7 @@ multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_mem # Buffer IPC Implementation # ------------------------- cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): - if not self._mr.is_ipc_enabled: + if not self.memory_resource.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") cdef cydriver.CUmemPoolPtrExportData data with nogil: @@ -172,7 +172,7 @@ cdef Buffer Buffer_from_ipc_descriptor( cdef cydriver.CUdeviceptr ptr with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) # 
DeviceMemoryResource IPC Implementation @@ -205,7 +205,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl cdef int handle = int(alloc_handle) with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) + &(self._handle), (handle), IPC_HANDLE_TYPE, 0) ) # Register it. From cce7f6c2958f52ad985c5d78d3db1f921e90eb59 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 12 Nov 2025 10:30:03 -0800 Subject: [PATCH 30/30] Rename files _dmr.* and _vmm.py to avoid abbreviations. --- cuda_core/cuda/core/experimental/_memory/__init__.py | 4 ++-- cuda_core/cuda/core/experimental/_memory/_buffer.pyx | 2 +- .../_memory/{_dmr.pxd => _device_memory_resource.pxd} | 0 .../_memory/{_dmr.pyx => _device_memory_resource.pyx} | 0 cuda_core/cuda/core/experimental/_memory/_ipc.pxd | 2 +- .../_memory/{_vmm.py => _virtual_memory_resource.py} | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename cuda_core/cuda/core/experimental/_memory/{_dmr.pxd => _device_memory_resource.pxd} (100%) rename cuda_core/cuda/core/experimental/_memory/{_dmr.pyx => _device_memory_resource.pyx} (100%) rename cuda_core/cuda/core/experimental/_memory/{_vmm.py => _virtual_memory_resource.py} (100%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 9781935cdc..3c07fbdde6 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from ._buffer import * # noqa: F403 -from ._dmr import * # noqa: F403 +from ._device_memory_resource import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 -from ._vmm import * # noqa: F403 +from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 
61d4f191d0..2251272742 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -6,7 +6,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._stream cimport default_stream, Stream diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_dmr.pxd rename to cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_dmr.pyx rename to cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 6480f32619..2b9c80290d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -4,7 +4,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. 
This enables diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_vmm.py rename to cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py