From 37849a3641e8c781f5805be17b7c6b7f34e73e2f Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 28 Oct 2025 13:39:18 -0700 Subject: [PATCH 01/30] Resolve a Cython build warning. --- cuda_core/cuda/core/experimental/_event.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/experimental/_event.pyx index dd6ef0b06e..051c216a4c 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/experimental/_event.pyx @@ -7,14 +7,13 @@ from __future__ import annotations cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy - from cuda.bindings cimport cydriver - from cuda.core.experimental._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN ) +import cython from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional @@ -277,7 +276,7 @@ cdef class IPCEventDescriptor: raise RuntimeError("IPCEventDescriptor objects cannot be instantiated directly. Please use Event APIs.") @classmethod - def _init(cls, reserved: bytes, busy_waited: bint): + def _init(cls, reserved: bytes, busy_waited: cython.bint): cdef IPCEventDescriptor self = IPCEventDescriptor.__new__(cls) self._reserved = reserved self._busy_waited = busy_waited From ac8a69ca34c93d1b9b77ffe3d5e617d6d23bfb42 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 28 Oct 2025 11:27:23 -0700 Subject: [PATCH 02/30] Make memory module into a package. 
--- .../core/experimental/_memory/__init__.py | 2 ++ .../cuda/core/experimental/_memory/memory.pxd | 36 +++++++++++++++++++ .../{_memory.pyx => _memory/memory.pyx} | 23 ++---------- 3 files changed, 41 insertions(+), 20 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/__init__.py create mode 100644 cuda_core/cuda/core/experimental/_memory/memory.pxd rename cuda_core/cuda/core/experimental/{_memory.pyx => _memory/memory.pyx} (99%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py new file mode 100644 index 0000000000..f97d27eada --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -0,0 +1,2 @@ +from .memory import * +from .memory import _SynchronousMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd new file mode 100644 index 0000000000..7dda135754 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport uintptr_t, intptr_t +from cuda.core.experimental._stream cimport Stream as cyStream + +from cuda.core.experimental._stream import Stream + + +cdef class _cyBuffer: + """ + Internal only. Responsible for offering fast C method access. + """ + cdef: + intptr_t _ptr + size_t _size + _cyMemoryResource _mr + object _ptr_obj + cyStream _alloc_stream + + +cdef class Buffer(_cyBuffer): + cpdef close(self, stream: Stream=*) + + +cdef class _cyMemoryResource: + """ + Internal only. Responsible for offering fast C method access. 
+ """ + cdef Buffer _allocate(self, size_t size, cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + + +cdef class MemoryResource(_cyMemoryResource): + cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_memory.pyx rename to cuda_core/cuda/core/experimental/_memory/memory.pyx index 32519cd26c..5efceae3d1 100644 --- a/cuda_core/cuda/core/experimental/_memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -8,11 +8,8 @@ cimport cpython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy - from cuda.bindings cimport cydriver - -from cuda.core.experimental._stream cimport Stream as cyStream -from cuda.core.experimental._stream cimport default_stream +from cuda.core.experimental._stream cimport default_stream, Stream as cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -33,7 +30,7 @@ from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) if TYPE_CHECKING: - from ._device import Device + from .._device import Device import uuid @@ -43,19 +40,6 @@ PyCapsule = TypeVar("PyCapsule") DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" - -cdef class _cyBuffer: - """ - Internal only. Responsible for offering fast C method access. - """ - cdef: - intptr_t _ptr - size_t _size - _cyMemoryResource _mr - object _ptr_obj - cyStream _alloc_stream - - cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
@@ -355,7 +339,6 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr) - cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -989,7 +972,7 @@ cdef class DeviceMemoryResource(MemoryResource): def _deep_reduce_device_memory_resource(mr): - from . import Device + from .._device import Device device = Device(mr.device_id) alloc_handle = mr.get_allocation_handle() return mr.from_allocation_handle, (device, alloc_handle) From 123aa2437c38daf88a6797737b92d77c9cacdd97 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 11:40:02 -0700 Subject: [PATCH 03/30] Rename cyStream to _cyStream for consistency. --- .../cuda/core/experimental/_memory/memory.pxd | 10 ++++---- .../cuda/core/experimental/_memory/memory.pyx | 24 +++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 7dda135754..e23c858149 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t, intptr_t -from cuda.core.experimental._stream cimport Stream as cyStream +from cuda.core.experimental._stream cimport Stream as _cyStream from cuda.core.experimental._stream import Stream @@ -17,7 +17,7 @@ cdef class _cyBuffer: size_t _size _cyMemoryResource _mr object _ptr_obj - cyStream _alloc_stream + _cyStream _alloc_stream cdef class Buffer(_cyBuffer): @@ -28,9 +28,9 @@ cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
""" - cdef Buffer _allocate(self, size_t size, cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + cdef Buffer _allocate(self, size_t size, _cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept cdef class MemoryResource(_cyMemoryResource): - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 5efceae3d1..5e71c30ba4 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -9,7 +9,7 @@ from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._stream cimport default_stream, Stream as cyStream +from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -44,10 +44,10 @@ cdef class _cyMemoryResource: """ Internal only. Responsible for offering fast C method access. 
""" - cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef Buffer _allocate(self, size_t size, _cyStream stream): raise NotImplementedError - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: raise NotImplementedError @@ -106,7 +106,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = ptr self._size = size self._mr = mr - self._alloc_stream = (stream) if stream is not None else None + self._alloc_stream = <_cyStream>(stream) if stream is not None else None return self def __dealloc__(self): @@ -128,16 +128,16 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. """ - cdef cyStream s + cdef _cyStream s if self._ptr and self._mr is not None: if stream is None: if self._alloc_stream is not None: s = self._alloc_stream else: # TODO: remove this branch when from_handle takes a stream - s = (default_stream()) + s = <_cyStream>(default_stream()) else: - s = stream + s = <_cyStream>stream self._mr._deallocate(self._ptr, self._size, s) self._ptr = 0 self._mr = None @@ -348,7 +348,7 @@ cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) 
""" - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: self.deallocate(ptr, size, stream) @abc.abstractmethod @@ -867,7 +867,7 @@ cdef class DeviceMemoryResource(MemoryResource): raise return self._alloc_handle - cdef Buffer _allocate(self, size_t size, cyStream stream): + cdef Buffer _allocate(self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -901,9 +901,9 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return self._allocate(size, stream) + return self._allocate(size, <_cyStream>stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, cyStream stream) noexcept: + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: @@ -923,7 +923,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - self._deallocate(ptr, size, stream) + self._deallocate(ptr, size, <_cyStream>stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: From fe4b67e1d798e8f5e6229b55100c24052af19158 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 12:48:14 -0700 Subject: [PATCH 04/30] Move defs to memory.pxd header --- .../cuda/core/experimental/_memory/memory.pxd | 51 ++++++++++++++----- .../cuda/core/experimental/_memory/memory.pyx | 18 ------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index e23c858149..858901825b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -3,15 +3,12 @@ # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uintptr_t, intptr_t -from cuda.core.experimental._stream cimport Stream as _cyStream +from cuda.bindings cimport cydriver -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._stream cimport Stream as _cyStream cdef class _cyBuffer: - """ - Internal only. Responsible for offering fast C method access. - """ cdef: intptr_t _ptr size_t _size @@ -20,17 +17,47 @@ cdef class _cyBuffer: _cyStream _alloc_stream -cdef class Buffer(_cyBuffer): - cpdef close(self, stream: Stream=*) - - cdef class _cyMemoryResource: - """ - Internal only. Responsible for offering fast C method access. 
- """ cdef Buffer _allocate(self, size_t size, _cyStream stream) cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept +cdef class Buffer(_cyBuffer): + cpdef close(self, stream=*) + + cdef class MemoryResource(_cyMemoryResource): cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + + +cdef class IPCBufferDescriptor: + cdef: + bytes _reserved + size_t _size + + +cdef class IPCAllocationHandle: + cdef: + int _handle + object _uuid + + cpdef close(self) + + +cdef class DeviceMemoryResource(MemoryResource): + cdef: + int _dev_id + cydriver.CUmemoryPool _mempool_handle + object _attributes + cydriver.CUmemAllocationHandleType _ipc_handle_type + bint _mempool_owned + bint _is_mapped + object _uuid + IPCAllocationHandle _alloc_handle + object __weakref__ + + cpdef close(self) + cpdef IPCAllocationHandle get_allocation_handle(self) + cdef Buffer _allocate(self, size_t size, _cyStream stream) + cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 5e71c30ba4..7f753595f2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -399,10 +399,6 @@ cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocat cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" - cdef: - bytes _reserved - size_t _size - def __init__(self, *arg, **kwargs): raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -424,10 +420,6 @@ cdef class IPCBufferDescriptor: cdef class IPCAllocationHandle: """Shareable handle to an IPC-enabled device memory pool.""" - cdef: - int _handle - object _uuid - def __init__(self, *arg, **kwargs): raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") @@ -643,16 +635,6 @@ cdef class DeviceMemoryResource(MemoryResource): methods. The reconstruction procedure uses the registry to find the associated MMR. """ - cdef: - int _dev_id - cydriver.CUmemoryPool _mempool_handle - object _attributes - cydriver.CUmemAllocationHandleType _ipc_handle_type - bint _mempool_owned - bint _is_mapped - object _uuid - IPCAllocationHandle _alloc_handle - object __weakref__ def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID From ce77d446b511a9c3ba395b25f1944b8bf219fb46 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 13:05:14 -0700 Subject: [PATCH 05/30] Separate VMM. 
--- .../core/experimental/_memory/__init__.py | 2 + .../cuda/core/experimental/_memory/memory.pyx | 519 +---------------- .../cuda/core/experimental/_memory/vmm.py | 525 ++++++++++++++++++ 3 files changed, 534 insertions(+), 512 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/vmm.py diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index f97d27eada..998009f16c 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,2 +1,4 @@ from .memory import * from .memory import _SynchronousMemoryResource +from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 7f753595f2..d25b47fce2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -96,6 +96,13 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = None self._alloc_stream = None + def _clear(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + def __init__(self, *args, **kwargs): raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @@ -1054,515 +1061,3 @@ class _SynchronousMemoryResource(MemoryResource): return self._dev_id -VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None] -VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] -VirtualMemoryGranularityT = Literal["minimum", "recommended"] -VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None] -VirtualMemoryAllocationTypeT = Literal["pinned", "managed"] - - -@dataclass -class VirtualMemoryResourceOptions: - """A configuration object for the VirtualMemoryResource - Stores configuration information which tells the resource how to use the CUDA VMM APIs - - Attributes - ---------- - allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT` - Controls the type of allocation. - location_type: :obj:`~_memory.VirtualMemoryLocationTypeT` - Controls the location of the allocation. - handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT` - Export handle type for the physical allocation. Use - ``"posix_fd"`` on Linux if you plan to - import/export the allocation (required for cuMemRetainAllocationHandle). - Use `None` if you don't need an exportable handle. - gpu_direct_rdma: bool - Hint that the allocation should be GDR-capable (if supported). - granularity: :obj:`~_memory.VirtualMemoryGranularityT` - Controls granularity query and size rounding. - addr_hint: int - A (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide. - addr_align: int - Alignment for the VA reservation. If `None`, use the queried granularity. - peers: Iterable[int] - Extra device IDs that should be granted access in addition to ``device``. - self_access: :obj:`~_memory.VirtualMemoryAccessTypeT` - Access flags for the owning device. - peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` - Access flags for peers. 
- """ - # Human-friendly strings; normalized in __post_init__ - allocation_type: VirtualMemoryAllocationTypeT = "pinned" - location_type: VirtualMemoryLocationTypeT = "device" - handle_type: VirtualMemoryHandleTypeT = "posix_fd" - granularity: VirtualMemoryGranularityT = "recommended" - gpu_direct_rdma: bool = False - addr_hint: Optional[int] = 0 - addr_align: Optional[int] = None - peers: Iterable[int] = field(default_factory=tuple) - self_access: VirtualMemoryAccessTypeT = "rw" - peer_access: VirtualMemoryAccessTypeT = "rw" - - _a = driver.CUmemAccess_flags - _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} - _h = driver.CUmemAllocationHandleType - _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} - _g = driver.CUmemAllocationGranularity_flags - _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} - _l = driver.CUmemLocationType - _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} - # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not - _a = driver.CUmemAllocationType - _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} - ver_major, ver_minor = get_binding_version() - if ver_major >= 13: - _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED - - @staticmethod - def _access_to_flags(spec: str): - flags = VirtualMemoryResourceOptions._access_flags.get(spec) - if flags is None: - raise ValueError(f"Unknown access spec: {spec!r}") - return flags - - @staticmethod - def _allocation_type_to_driver(spec: str): - alloc_type = 
VirtualMemoryResourceOptions._allocation_type.get(spec) - if alloc_type is None: - raise ValueError(f"Unsupported allocation_type: {spec!r}") - return alloc_type - - @staticmethod - def _location_type_to_driver(spec: str): - loc_type = VirtualMemoryResourceOptions._location_type.get(spec) - if loc_type is None: - raise ValueError(f"Unsupported location_type: {spec!r}") - return loc_type - - @staticmethod - def _handle_type_to_driver(spec: str): - handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) - if handle_type is None: - raise ValueError(f"Unsupported handle_type: {spec!r}") - return handle_type - - @staticmethod - def _granularity_to_driver(spec: str): - granularity = VirtualMemoryResourceOptions._granularity.get(spec) - if granularity is None: - raise ValueError(f"Unsupported granularity: {spec!r}") - return granularity - - -class VirtualMemoryResource(MemoryResource): - """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. - - Parameters - ---------- - device_id : int - Device ordinal for which a memory resource is constructed. - - config : VirtualMemoryResourceOptions - A configuration object for the VirtualMemoryResource - """ - def __init__(self, device, config: VirtualMemoryResourceOptions = None): - self.device = device - self.config = check_or_create_options( - VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False - ) - if self.config.location_type == "host": - self.device = None - if platform.system() == "Windows": - raise NotImplementedError("VirtualMemoryResource is not supported on Windows") - - # Validate RDMA support if requested - if self.config.gpu_direct_rdma and self.device is not None: - if not self.device.properties.gpu_direct_rdma_supported: - raise RuntimeError("GPU Direct RDMA is not supported on this device") - - @staticmethod - def _align_up(size: int, gran: int) -> int: - """ - Align a size up to the nearest multiple of a granularity. 
- """ - return (size + gran - 1) & ~(gran - 1) - - def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: - """ - Grow an existing allocation using CUDA VMM, with a configurable policy. - - This implements true growing allocations that preserve the base pointer - by extending the virtual address range and mapping additional physical memory. - - This function uses transactional allocation: if any step fails, the original buffer is not modified and - all steps the function took are rolled back so a new allocation is not created. - - Parameters - ---------- - buf : Buffer - The existing buffer to grow - new_size : int - The new total size for the allocation - config : VirtualMemoryResourceOptions, optional - Configuration for the new physical memory chunks. If None, uses current config. - - Returns - ------- - Buffer - The same buffer with updated size and properties, preserving the original pointer - """ - if config is not None: - self.config = config - - # Build allocation properties for new chunks - prop = driver.CUmemAllocationProp() - prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) - prop.location.id = self.device.device_id - prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) - - # Query granularity - gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) - res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - raise_if_driver_error(res) - - # Calculate sizes - additional_size = new_size - buf.size - if additional_size <= 0: - # Same size: only update access policy if needed; avoid zero-sized driver calls - descs = self._build_access_descriptors(prop) - if descs: - res, = 
driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) - raise_if_driver_error(res) - return buf - - aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) - total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) - aligned_prev_size = total_aligned_size - aligned_additional_size - addr_align = self.config.addr_align or gran - - # Try to extend the existing VA range first - res, new_ptr = driver.cuMemAddressReserve( - aligned_additional_size, - addr_align, - int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range - 0 - ) - - if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): - # Check for specific errors that are not recoverable with the slow path - if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): - raise_if_driver_error(res) - res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) - raise_if_driver_error(res2) - # Fallback: couldn't extend contiguously, need full remapping - return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) - else: - # Success! We can extend the VA range contiguously - return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, new_ptr: int) -> Buffer: - """ - Fast path for growing a virtual memory allocation when the new region can be - reserved contiguously after the existing buffer. - - This function creates and maps new physical memory for the additional size, - sets access permissions, and updates the buffer size in place (the pointer - remains unchanged). - - Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. 
- prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). - - Returns: - Buffer: The same buffer object with its size updated to `new_size`. - """ - with Transaction() as trans: - # Create new physical memory for the additional size - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - raise_if_driver_error(res) - # Register undo for creation - trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) - raise_if_driver_error(res) - # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) - - # Set access permissions for the new portion - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) - raise_if_driver_error(res) - - # All succeeded, cancel undo actions - trans.commit() - - # Update the buffer size (pointer stays the same) - buf._size = new_size - return buf - - def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: - """ - Slow path for growing a virtual memory allocation when the new region cannot be - reserved contiguously after the existing buffer. 
- - This function reserves a new, larger virtual address (VA) range, remaps the old - physical memory to the beginning of the new VA range, creates and maps new physical - memory for the additional size, sets access permissions, and updates the buffer's - pointer and size. - - Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. - prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - total_aligned_size (int): The total new size to reserve, aligned to granularity. - addr_align (int): The required address alignment for the new VA range. - - Returns: - Buffer: The buffer object updated with the new pointer and size. - """ - with Transaction() as trans: - # Reserve a completely new, larger VA range - res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) - raise_if_driver_error(res) - # Register undo for VA reservation - trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) - - # Get the old allocation handle for remapping - result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) - raise_if_driver_error(result) - # Register undo for old_handle - trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Unmap the old VA range (aligned previous size) - aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) - raise_if_driver_error(result) - - def _remap_old(): - # Try to remap the old physical memory back to the original VA range - try: - res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) - raise_if_driver_error(res) - except Exception: - pass - trans.append(_remap_old) - - # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), 
aligned_prev_size, 0, old_handle, 0) - raise_if_driver_error(res) - - # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) - - # Create new physical memory for the additional size - res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) - raise_if_driver_error(res) - - # Register undo for new physical memory - trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) - raise_if_driver_error(res) - - # Register undo for mapping - trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) - - # Set access permissions for the entire new range - descs = self._build_access_descriptors(prop) - if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) - raise_if_driver_error(res) - - # All succeeded, cancel undo actions - trans.commit() - - # Free the old VA range (aligned previous size) - res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) - raise_if_driver_error(res2) - - # Invalidate the old buffer so its destructor won't try to free again - buf._ptr = 0 - buf._ptr_obj = None - buf._size = 0 - buf._mr = None - - # Return a new Buffer for the new mapping - return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) - - - def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: - """ - Build access descriptors for memory access permissions. 
- - Returns - ------- - list - List of CUmemAccessDesc objects for setting memory access - """ - descs = [] - - # Owner access - owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) - if owner_flags: - d = driver.CUmemAccessDesc() - d.location.type = prop.location.type - d.location.id = prop.location.id - d.flags = owner_flags - descs.append(d) - - # Peer device access - peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) - if peer_flags: - for peer_dev in self.config.peers: - d = driver.CUmemAccessDesc() - d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - d.location.id = int(peer_dev) - d.flags = peer_flags - descs.append(d) - - return descs - - - def allocate(self, size: int, stream: Stream = None) -> Buffer: - """ - Allocate a buffer of the given size using CUDA virtual memory. - - Parameters - ---------- - size : int - The size in bytes of the buffer to allocate. - stream : Stream, optional - CUDA stream to associate with the allocation (not currently supported). - - Returns - ------- - Buffer - A Buffer object representing the allocated virtual memory. - - Raises - ------ - NotImplementedError - If a stream is provided or if the location type is not device memory. - CUDAError - If any CUDA driver API call fails during allocation. - - Notes - ----- - This method uses transactional allocation: if any step fails, all resources - allocated so far are automatically cleaned up. The allocation is performed - with the configured granularity, access permissions, and peer access as - specified in the resource's configuration. 
- """ - if stream is not None: - raise NotImplementedError("Stream is not supported with VirtualMemoryResource") - - config = self.config - # ---- Build allocation properties ---- - prop = driver.CUmemAllocationProp() - prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) - - prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) - prop.location.id = self.device.device_id if config.location_type == "device" else -1 - prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 - prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) - - # ---- Query and apply granularity ---- - # Choose min vs recommended granularity per config - gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) - res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) - raise_if_driver_error(res) - - aligned_size = VirtualMemoryResource._align_up(size, gran) - addr_align = config.addr_align or gran - - # ---- Transactional allocation ---- - with Transaction() as trans: - # ---- Create physical memory ---- - res, handle = driver.cuMemCreate(aligned_size, prop, 0) - raise_if_driver_error(res) - # Register undo for physical memory - trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) - - # ---- Reserve VA space ---- - # Potentially, use a separate size for the VA reservation from the physical allocation size - res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) - raise_if_driver_error(res) - # Register undo for VA reservation - trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) - - # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) - trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) - raise_if_driver_error(res) - - # ---- 
Set access for owner + peers ----
-        descs = self._build_access_descriptors(prop)
-        if descs:
-            res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs))
-            raise_if_driver_error(res)
-
-        trans.commit()
-
-        # Done — return a Buffer that tracks this VA range
-        buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self)
-        return buf
-
-    def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None:
-        """
-        Deallocate memory on the device using CUDA VMM APIs.
-        """
-        result, handle = driver.cuMemRetainAllocationHandle(ptr)
-        raise_if_driver_error(result)
-        result, = driver.cuMemUnmap(ptr, size)
-        raise_if_driver_error(result)
-        result, = driver.cuMemAddressFree(ptr, size)
-        raise_if_driver_error(result)
-        result, = driver.cuMemRelease(handle)
-        raise_if_driver_error(result)
-
-
-    @property
-    def is_device_accessible(self) -> bool:
-        """
-        Indicates whether the allocated memory is accessible from the device.
-        """
-        return self.config.location_type == "device"
-
-    @property
-    def is_host_accessible(self) -> bool:
-        """
-        Indicates whether the allocated memory is accessible from the host.
-        """
-        return self.config.location_type == "host"
-
-    @property
-    def device_id(self) -> int:
-        """
-        Get the device ID associated with this memory resource.
-
-        Returns:
-            int: CUDA device ID. -1 if the memory resource allocates host memory
-        """
-        return self.device.device_id if self.config.location_type == "device" else -1
-
-    def __repr__(self) -> str:
-        """
-        Return a string representation of the VirtualMemoryResource.
-
-        Returns:
-            str: A string describing the object
-        """
-        return f"<VirtualMemoryResource(device={self.device})>"
diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/vmm.py
new file mode 100644
index 0000000000..60ba8280d8
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/vmm.py
@@ -0,0 +1,525 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from dataclasses import dataclass, field
+from typing import Iterable, Literal, Optional, Union
+import platform
+
+from cuda.core.experimental._stream import Stream
+from cuda.core.experimental._memory.memory import Buffer, MemoryResource
+from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version )
+from cuda.core.experimental._utils.cuda_utils import (
+    _check_driver_error as raise_if_driver_error,
+    check_or_create_options,
+)
+
+VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None]
+VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"]
+VirtualMemoryGranularityT = Literal["minimum", "recommended"]
+VirtualMemoryAccessTypeT = Union[Literal["rw", "r"], None]
+VirtualMemoryAllocationTypeT = Literal["pinned", "managed"]
+
+
+@dataclass
+class VirtualMemoryResourceOptions:
+    """A configuration object for the VirtualMemoryResource
+    Stores configuration information which tells the resource how to use the CUDA VMM APIs
+
+    Attributes
+    ----------
+    allocation_type: :obj:`~_memory.VirtualMemoryAllocationTypeT`
+        Controls the type of allocation.
+    location_type: :obj:`~_memory.VirtualMemoryLocationTypeT`
+        Controls the location of the allocation.
+    handle_type: :obj:`~_memory.VirtualMemoryHandleTypeT`
+        Export handle type for the physical allocation. Use
+        ``"posix_fd"`` on Linux if you plan to
+        import/export the allocation (required for cuMemRetainAllocationHandle).
+        Use `None` if you don't need an exportable handle.
+    gpu_direct_rdma: bool
+        Hint that the allocation should be GDR-capable (if supported).
+    granularity: :obj:`~_memory.VirtualMemoryGranularityT`
+        Controls granularity query and size rounding.
+    addr_hint: int
+        An (optional) virtual address hint to try to reserve at. Setting it to 0 lets the CUDA driver decide.
+    addr_align: int
+        Alignment for the VA reservation.
If `None`, use the queried granularity. + peers: Iterable[int] + Extra device IDs that should be granted access in addition to ``device``. + self_access: :obj:`~_memory.VirtualMemoryAccessTypeT` + Access flags for the owning device. + peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` + Access flags for peers. + """ + # Human-friendly strings; normalized in __post_init__ + allocation_type: VirtualMemoryAllocationTypeT = "pinned" + location_type: VirtualMemoryLocationTypeT = "device" + handle_type: VirtualMemoryHandleTypeT = "posix_fd" + granularity: VirtualMemoryGranularityT = "recommended" + gpu_direct_rdma: bool = False + addr_hint: Optional[int] = 0 + addr_align: Optional[int] = None + peers: Iterable[int] = field(default_factory=tuple) + self_access: VirtualMemoryAccessTypeT = "rw" + peer_access: VirtualMemoryAccessTypeT = "rw" + + _a = driver.CUmemAccess_flags + _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} + _h = driver.CUmemAllocationHandleType + _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} + _g = driver.CUmemAllocationGranularity_flags + _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + _l = driver.CUmemLocationType + _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not + _a = driver.CUmemAllocationType + _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} + ver_major, ver_minor = get_binding_version() + if ver_major >= 13: + _allocation_type["managed"] = _a.CU_MEM_ALLOCATION_TYPE_MANAGED + + 
@staticmethod + def _access_to_flags(spec: str): + flags = VirtualMemoryResourceOptions._access_flags.get(spec) + if flags is None: + raise ValueError(f"Unknown access spec: {spec!r}") + return flags + + @staticmethod + def _allocation_type_to_driver(spec: str): + alloc_type = VirtualMemoryResourceOptions._allocation_type.get(spec) + if alloc_type is None: + raise ValueError(f"Unsupported allocation_type: {spec!r}") + return alloc_type + + @staticmethod + def _location_type_to_driver(spec: str): + loc_type = VirtualMemoryResourceOptions._location_type.get(spec) + if loc_type is None: + raise ValueError(f"Unsupported location_type: {spec!r}") + return loc_type + + @staticmethod + def _handle_type_to_driver(spec: str): + handle_type = VirtualMemoryResourceOptions._handle_types.get(spec) + if handle_type is None: + raise ValueError(f"Unsupported handle_type: {spec!r}") + return handle_type + + @staticmethod + def _granularity_to_driver(spec: str): + granularity = VirtualMemoryResourceOptions._granularity.get(spec) + if granularity is None: + raise ValueError(f"Unsupported granularity: {spec!r}") + return granularity + + +class VirtualMemoryResource(MemoryResource): + """Create a device memory resource that uses the CUDA VMM APIs to allocate memory. + + Parameters + ---------- + device_id : int + Device ordinal for which a memory resource is constructed. 
+ + config : VirtualMemoryResourceOptions + A configuration object for the VirtualMemoryResource + """ + def __init__(self, device, config: VirtualMemoryResourceOptions = None): + self.device = device + self.config = check_or_create_options( + VirtualMemoryResourceOptions, config, "VirtualMemoryResource options", keep_none=False + ) + if self.config.location_type == "host": + self.device = None + if platform.system() == "Windows": + raise NotImplementedError("VirtualMemoryResource is not supported on Windows") + + # Validate RDMA support if requested + if self.config.gpu_direct_rdma and self.device is not None: + if not self.device.properties.gpu_direct_rdma_supported: + raise RuntimeError("GPU Direct RDMA is not supported on this device") + + @staticmethod + def _align_up(size: int, gran: int) -> int: + """ + Align a size up to the nearest multiple of a granularity. + """ + return (size + gran - 1) & ~(gran - 1) + + def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryResourceOptions = None) -> Buffer: + """ + Grow an existing allocation using CUDA VMM, with a configurable policy. + + This implements true growing allocations that preserve the base pointer + by extending the virtual address range and mapping additional physical memory. + + This function uses transactional allocation: if any step fails, the original buffer is not modified and + all steps the function took are rolled back so a new allocation is not created. + + Parameters + ---------- + buf : Buffer + The existing buffer to grow + new_size : int + The new total size for the allocation + config : VirtualMemoryResourceOptions, optional + Configuration for the new physical memory chunks. If None, uses current config. 
+ + Returns + ------- + Buffer + The same buffer with updated size and properties, preserving the original pointer + """ + if config is not None: + self.config = config + + # Build allocation properties for new chunks + prop = driver.CUmemAllocationProp() + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(self.config.allocation_type) + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(self.config.location_type) + prop.location.id = self.device.device_id + prop.allocFlags.gpuDirectRDMACapable = 1 if self.config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(self.config.handle_type) + + # Query granularity + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(self.config.granularity) + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + raise_if_driver_error(res) + + # Calculate sizes + additional_size = new_size - buf.size + if additional_size <= 0: + # Same size: only update access policy if needed; avoid zero-sized driver calls + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + raise_if_driver_error(res) + return buf + + aligned_additional_size = VirtualMemoryResource._align_up(additional_size, gran) + total_aligned_size = VirtualMemoryResource._align_up(new_size, gran) + aligned_prev_size = total_aligned_size - aligned_additional_size + addr_align = self.config.addr_align or gran + + # Try to extend the existing VA range first + res, new_ptr = driver.cuMemAddressReserve( + aligned_additional_size, + addr_align, + int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range + 0 + ) + + if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): + # Check for specific errors that are not recoverable with the slow path + if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, 
driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + raise_if_driver_error(res) + res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + raise_if_driver_error(res2) + # Fallback: couldn't extend contiguously, need full remapping + return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + else: + # Success! We can extend the VA range contiguously + return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) + + def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, new_ptr: int) -> Buffer: + """ + Fast path for growing a virtual memory allocation when the new region can be + reserved contiguously after the existing buffer. + + This function creates and maps new physical memory for the additional size, + sets access permissions, and updates the buffer size in place (the pointer + remains unchanged). + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. + new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + + Returns: + Buffer: The same buffer object with its size updated to `new_size`. 
+ """ + with Transaction() as trans: + # Create new physical memory for the additional size + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + # Register undo for creation + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Map the new physical memory to the extended VA range + res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + # Register undo for mapping + trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + + # Set access permissions for the new portion + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Update the buffer size (pointer stays the same) + buf._size = new_size + return buf + + def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, + aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + """ + Slow path for growing a virtual memory allocation when the new region cannot be + reserved contiguously after the existing buffer. + + This function reserves a new, larger virtual address (VA) range, remaps the old + physical memory to the beginning of the new VA range, creates and maps new physical + memory for the additional size, sets access permissions, and updates the buffer's + pointer and size. + + Args: + buf (Buffer): The buffer to grow. + new_size (int): The new total size in bytes. + prop (driver.CUmemAllocationProp): Allocation properties for the new memory. + aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. 
+ total_aligned_size (int): The total new size to reserve, aligned to granularity. + addr_align (int): The required address alignment for the new VA range. + + Returns: + Buffer: The buffer object updated with the new pointer and size. + """ + with Transaction() as trans: + # Reserve a completely new, larger VA range + res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + + # Get the old allocation handle for remapping + result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) + raise_if_driver_error(result) + # Register undo for old_handle + trans.append(lambda h=old_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Unmap the old VA range (aligned previous size) + aligned_prev_size = total_aligned_size - aligned_additional_size + result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + raise_if_driver_error(result) + + def _remap_old(): + # Try to remap the old physical memory back to the original VA range + try: + res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + except Exception: + pass + trans.append(_remap_old) + + # Remap the old physical memory to the new VA range (aligned previous size) + res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda np=new_ptr, s=aligned_prev_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + + # Create new physical memory for the additional size + res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) + raise_if_driver_error(res) + + # Register undo for new physical memory + trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # Map the new physical memory to the extended 
portion (aligned offset) + res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + raise_if_driver_error(res) + + # Register undo for mapping + trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) + + # Set access permissions for the entire new range + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + # All succeeded, cancel undo actions + trans.commit() + + # Free the old VA range (aligned previous size) + res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + raise_if_driver_error(res2) + + # Invalidate the old buffer so its destructor won't try to free again + buf._clear() + + # Return a new Buffer for the new mapping + return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) + + + def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: + """ + Build access descriptors for memory access permissions. + + Returns + ------- + list + List of CUmemAccessDesc objects for setting memory access + """ + descs = [] + + # Owner access + owner_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.self_access) + if owner_flags: + d = driver.CUmemAccessDesc() + d.location.type = prop.location.type + d.location.id = prop.location.id + d.flags = owner_flags + descs.append(d) + + # Peer device access + peer_flags = VirtualMemoryResourceOptions._access_to_flags(self.config.peer_access) + if peer_flags: + for peer_dev in self.config.peers: + d = driver.CUmemAccessDesc() + d.location.type = driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + d.location.id = int(peer_dev) + d.flags = peer_flags + descs.append(d) + + return descs + + + def allocate(self, size: int, stream: Stream = None) -> Buffer: + """ + Allocate a buffer of the given size using CUDA virtual memory. 
+ + Parameters + ---------- + size : int + The size in bytes of the buffer to allocate. + stream : Stream, optional + CUDA stream to associate with the allocation (not currently supported). + + Returns + ------- + Buffer + A Buffer object representing the allocated virtual memory. + + Raises + ------ + NotImplementedError + If a stream is provided or if the location type is not device memory. + CUDAError + If any CUDA driver API call fails during allocation. + + Notes + ----- + This method uses transactional allocation: if any step fails, all resources + allocated so far are automatically cleaned up. The allocation is performed + with the configured granularity, access permissions, and peer access as + specified in the resource's configuration. + """ + if stream is not None: + raise NotImplementedError("Stream is not supported with VirtualMemoryResource") + + config = self.config + # ---- Build allocation properties ---- + prop = driver.CUmemAllocationProp() + prop.type = VirtualMemoryResourceOptions._allocation_type_to_driver(config.allocation_type) + + prop.location.type = VirtualMemoryResourceOptions._location_type_to_driver(config.location_type) + prop.location.id = self.device.device_id if config.location_type == "device" else -1 + prop.allocFlags.gpuDirectRDMACapable = 1 if config.gpu_direct_rdma else 0 + prop.requestedHandleTypes = VirtualMemoryResourceOptions._handle_type_to_driver(config.handle_type) + + # ---- Query and apply granularity ---- + # Choose min vs recommended granularity per config + gran_flag = VirtualMemoryResourceOptions._granularity_to_driver(config.granularity) + res, gran = driver.cuMemGetAllocationGranularity(prop, gran_flag) + raise_if_driver_error(res) + + aligned_size = VirtualMemoryResource._align_up(size, gran) + addr_align = config.addr_align or gran + + # ---- Transactional allocation ---- + with Transaction() as trans: + # ---- Create physical memory ---- + res, handle = driver.cuMemCreate(aligned_size, prop, 0) + 
raise_if_driver_error(res) + # Register undo for physical memory + trans.append(lambda h=handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) + + # ---- Reserve VA space ---- + # Potentially, use a separate size for the VA reservation from the physical allocation size + res, ptr = driver.cuMemAddressReserve(aligned_size, addr_align, config.addr_hint, 0) + raise_if_driver_error(res) + # Register undo for VA reservation + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) + + # ---- Map physical memory into VA ---- + res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) + raise_if_driver_error(res) + + # ---- Set access for owner + peers ---- + descs = self._build_access_descriptors(prop) + if descs: + res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + raise_if_driver_error(res) + + trans.commit() + + # Done — return a Buffer that tracks this VA range + buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) + return buf + + def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None: + """ + Deallocate memory on the device using CUDA VMM APIs. + """ + result, handle = driver.cuMemRetainAllocationHandle(ptr) + raise_if_driver_error(result) + result, = driver.cuMemUnmap(ptr, size) + raise_if_driver_error(result) + result, = driver.cuMemAddressFree(ptr, size) + raise_if_driver_error(result) + result, = driver.cuMemRelease(handle) + raise_if_driver_error(result) + + + @property + def is_device_accessible(self) -> bool: + """ + Indicates whether the allocated memory is accessible from the device. + """ + return self.config.location_type == "device" + + @property + def is_host_accessible(self) -> bool: + """ + Indicates whether the allocated memory is accessible from the host. 
+ """ + return self.config.location_type == "host" + + @property + def device_id(self) -> int: + """ + Get the device ID associated with this memory resource. + + Returns: + int: CUDA device ID. -1 if the memory resource allocates host memory + """ + return self.device.device_id if self.config.location_type == "device" else -1 + + def __repr__(self) -> str: + """ + Return a string representation of the VirtualMemoryResource. + + Returns: + str: A string describing the object + """ + return f"" From c5179bc4afda910e6c6ea4754db095346ef2bc81 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 13:49:23 -0700 Subject: [PATCH 06/30] Weaken dependencies from device to memory module. --- cuda_core/cuda/core/experimental/_device.pyx | 9 ++++++--- cuda_core/cuda/core/experimental/_memory/memory.pyx | 5 +---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx index 1db2adbf8d..582585c6df 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/experimental/_device.pyx @@ -10,12 +10,11 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN import threading -from typing import Optional, Union +from typing import Optional, Union, TYPE_CHECKING from cuda.core.experimental._context import Context, ContextOptions from cuda.core.experimental._event import Event, EventOptions from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._memory import Buffer, DeviceMemoryResource, MemoryResource, _SynchronousMemoryResource from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions from cuda.core.experimental._utils.clear_error_support import assert_type from cuda.core.experimental._utils.cuda_utils import ( @@ -27,7 +26,8 @@ from cuda.core.experimental._utils.cuda_utils import ( ) from cuda.core.experimental._stream cimport default_stream - +if 
TYPE_CHECKING: + from cuda.core.experimental._memory import Buffer, MemoryResource # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... @@ -996,8 +996,10 @@ class Device: ) ) if attr == 1: + from cuda.core.experimental._memory import DeviceMemoryResource device._mr = DeviceMemoryResource(dev_id) else: + from cuda.core.experimental._memory import _SynchronousMemoryResource device._mr = _SynchronousMemoryResource(dev_id) device._has_inited = False @@ -1122,6 +1124,7 @@ class Device: @memory_resource.setter def memory_resource(self, mr): + from cuda.core.experimental._memory import MemoryResource assert_type(mr, MemoryResource) self._mr = mr diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index d25b47fce2..6677d45289 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -34,9 +34,6 @@ if TYPE_CHECKING: import uuid -PyCapsule = TypeVar("PyCapsule") -"""Represent the capsule type.""" - DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" @@ -291,7 +288,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): max_version: tuple[int, int] | None = None, dl_device: tuple[int, int] | None = None, copy: bool | None = None, - ) -> PyCapsule: + ) -> TypeVar("PyCapsule"): # Note: we ignore the stream argument entirely (as if it is -1). # It is the user's responsibility to maintain stream order. if dl_device is not None: From e19274887c7b781bcc98127284ab7ff863415965 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:01:53 -0700 Subject: [PATCH 07/30] Move LegacyPinnedMemoryResource to a submodule. 
--- .../core/experimental/_memory/__init__.py | 1 + .../cuda/core/experimental/_memory/legacy.py | 75 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 62 +-------------- cuda_core/cuda/core/experimental/_stream.pxd | 2 +- cuda_core/cuda/core/experimental/_stream.pyx | 2 +- 5 files changed, 79 insertions(+), 63 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/legacy.py diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 998009f16c..2d1d500b5b 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,4 +1,5 @@ from .memory import * from .memory import _SynchronousMemoryResource +from .legacy import LegacyPinnedMemoryResource from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py new file mode 100644 index 0000000000..060e664924 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Union + +from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from cuda.core.experimental._utils.cuda_utils import ( + driver, + _check_driver_error as raise_if_driver_error, +) + +DevicePointerT = Union[driver.CUdeviceptr, int, None] + + +class LegacyPinnedMemoryResource(MemoryResource): + """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost + APIs. + """ + + # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? + + def allocate(self, size, stream = None) -> Buffer: + """Allocate a buffer of the requested size. 
+ + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + Currently ignored + + Returns + ------- + Buffer + The allocated buffer object, which is accessible on both host and device. + """ + if stream is None: + from cuda.core.experimental._stream import default_stream + stream = default_stream() + err, ptr = driver.cuMemAllocHost(size) + raise_if_driver_error(err) + return Buffer._init(ptr, size, self, stream) + + def deallocate(self, ptr: DevicePointerT, size, stream): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream + The stream on which to perform the deallocation synchronously. + """ + stream.sync() + err, = driver.cuMemFreeHost(ptr) + raise_if_driver_error(err) + + @property + def is_device_accessible(self) -> bool: + """bool: this memory resource provides device-accessible buffers.""" + return True + + @property + def is_host_accessible(self) -> bool: + """bool: this memory resource provides host-accessible buffers.""" + return True + + @property + def device_id(self) -> int: + """This memory resource is not bound to any GPU.""" + raise RuntimeError("a pinned memory resource is not bound to any GPU") + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 6677d45289..f7f17dc74e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -27,7 +27,7 @@ import weakref from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( driver, Transaction, get_binding_version ) +from cuda.core.experimental._utils.cuda_utils import (driver, 
Transaction, get_binding_version) if TYPE_CHECKING: from .._device import Device @@ -967,66 +967,6 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) -class LegacyPinnedMemoryResource(MemoryResource): - """Create a pinned memory resource that uses legacy cuMemAllocHost/cudaMallocHost - APIs. - """ - - # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. - - Parameters - ---------- - size : int - The size of the buffer to allocate, in bytes. - stream : Stream, optional - Currently ignored - - Returns - ------- - Buffer - The allocated buffer object, which is accessible on both host and device. - """ - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAllocHost(size) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self, stream) - - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream): - """Deallocate a buffer previously allocated by this resource. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream - The stream on which to perform the deallocation synchronously. 
- """ - stream.sync() - err, = driver.cuMemFreeHost(ptr) - raise_if_driver_error(err) - - @property - def is_device_accessible(self) -> bool: - """bool: this memory resource provides device-accessible buffers.""" - return True - - @property - def is_host_accessible(self) -> bool: - """bool: this memory resource provides host-accessible buffers.""" - return True - - @property - def device_id(self) -> int: - """This memory resource is not bound to any GPU.""" - raise RuntimeError("a pinned memory resource is not bound to any GPU") - - class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/experimental/_stream.pxd index d992665a14..8f382e5d01 100644 --- a/cuda_core/cuda/core/experimental/_stream.pxd +++ b/cuda_core/cuda/core/experimental/_stream.pxd @@ -24,4 +24,4 @@ cdef class Stream: cdef int _get_device_and_context(self) except?-1 -cdef Stream default_stream() +cpdef Stream default_stream() diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/experimental/_stream.pyx index 82406c5598..146a15e573 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/experimental/_stream.pyx @@ -402,7 +402,7 @@ LEGACY_DEFAULT_STREAM = C_LEGACY_DEFAULT_STREAM PER_THREAD_DEFAULT_STREAM = C_PER_THREAD_DEFAULT_STREAM -cdef Stream default_stream(): +cpdef Stream default_stream(): """Return the default CUDA :obj:`~_stream.Stream`. The type of default stream returned depends on if the environment From 729c9009a3fcdd3eb24df18f071f92d8fdd09888 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:08:38 -0700 Subject: [PATCH 08/30] Move _SynchronousMemoryResource into a submodule. 
--- .../core/experimental/_memory/__init__.py | 3 +- .../cuda/core/experimental/_memory/legacy.py | 33 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 31 ----------------- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 2d1d500b5b..65947675e6 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,5 +1,4 @@ from .memory import * -from .memory import _SynchronousMemoryResource -from .legacy import LegacyPinnedMemoryResource +from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py index 060e664924..d8507967c8 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -73,3 +73,36 @@ def device_id(self) -> int: """This memory resource is not bound to any GPU.""" raise RuntimeError("a pinned memory resource is not bound to any GPU") + +class _SynchronousMemoryResource(MemoryResource): + __slots__ = ("_dev_id",) + + def __init__(self, device_id): + self._dev_id = getattr(device_id, 'device_id', device_id) + + def allocate(self, size, stream=None) -> Buffer: + if stream is None: + from cuda.core.experimental._stream import default_stream + stream = default_stream() + err, ptr = driver.cuMemAlloc(size) + raise_if_driver_error(err) + return Buffer._init(ptr, size, self) + + def deallocate(self, ptr, size, stream): + stream.sync() + err, = driver.cuMemFree(ptr) + raise_if_driver_error(err) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return 
self._dev_id + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index f7f17dc74e..c82444fe3e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -967,34 +967,3 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) -class _SynchronousMemoryResource(MemoryResource): - __slots__ = ("_dev_id",) - - def __init__(self, device_id : int | Device): - self._dev_id = getattr(device_id, 'device_id', device_id) - - def allocate(self, size, stream=None) -> Buffer: - if stream is None: - stream = default_stream() - err, ptr = driver.cuMemAlloc(size) - raise_if_driver_error(err) - return Buffer._init(ptr, size, self) - - def deallocate(self, ptr, size, stream): - stream.sync() - err, = driver.cuMemFree(ptr) - raise_if_driver_error(err) - - @property - def is_device_accessible(self) -> bool: - return True - - @property - def is_host_accessible(self) -> bool: - return False - - @property - def device_id(self) -> int: - return self._dev_id - - From 87354552a1911732cdff4ba1ac90824e4c75f875 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 29 Oct 2025 14:26:57 -0700 Subject: [PATCH 09/30] Partly separates the IPC implementation. 
--- .../core/experimental/_memory/__init__.py | 3 +- .../cuda/core/experimental/_memory/ipc.pxd | 18 ++++ .../cuda/core/experimental/_memory/ipc.pyx | 87 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pxd | 15 +--- .../cuda/core/experimental/_memory/memory.pyx | 76 +--------------- 5 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/ipc.pxd create mode 100644 cuda_core/cuda/core/experimental/_memory/ipc.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 65947675e6..8c6bc13196 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,4 +1,5 @@ -from .memory import * +from .ipc import * from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource +from .memory import * from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd new file mode 100644 index 0000000000..06280c08a4 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -0,0 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cdef class IPCBufferDescriptor: + cdef: + bytes _reserved + size_t _size + + +cdef class IPCAllocationHandle: + cdef: + int _handle + object _uuid + + cpdef close(self) + diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx new file mode 100644 index 0000000000..82d25087e8 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + + +from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union +import multiprocessing +import os + +if TYPE_CHECKING: + import uuid + + +cdef class IPCBufferDescriptor: + """Serializable object describing a buffer that can be shared between processes.""" + + def __init__(self, *arg, **kwargs): + raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, reserved: bytes, size: int): + cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) + self._reserved = reserved + self._size = size + return self + + def __reduce__(self): + return self._init, (self._reserved, self._size) + + @property + def size(self): + return self._size + + +cdef class IPCAllocationHandle: + """Shareable handle to an IPC-enabled device memory pool.""" + + def __init__(self, *arg, **kwargs): + raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. Please use MemoryResource APIs.") + + @classmethod + def _init(cls, handle: int, uuid: uuid.UUID): + cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) + assert handle >= 0 + self._handle = handle + self._uuid = uuid + return self + + cpdef close(self): + """Close the handle.""" + if self._handle >= 0: + try: + os.close(self._handle) + finally: + self._handle = -1 + self._uuid = None + + def __dealloc__(self): + self.close() + + def __int__(self) -> int: + if self._handle < 0: + raise ValueError( + f"Cannot convert IPCAllocationHandle to int: the handle (id={id(self)}) is closed." 
+ ) + return self._handle + + @property + def handle(self) -> int: + return self._handle + + @property + def uuid(self) -> uuid.UUID: + return self._uuid + + +def _reduce_allocation_handle(alloc_handle): + df = multiprocessing.reduction.DupFd(alloc_handle.handle) + return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + +def _reconstruct_allocation_handle(cls, df, uuid): + return cls._init(df.detach(), uuid) + + +multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 858901825b..ce3362fbd0 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -5,6 +5,7 @@ from libc.stdint cimport uintptr_t, intptr_t from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport Stream as _cyStream @@ -30,20 +31,6 @@ cdef class MemoryResource(_cyMemoryResource): cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept -cdef class IPCBufferDescriptor: - cdef: - bytes _reserved - size_t _size - - -cdef class IPCAllocationHandle: - cdef: - int _handle - object _uuid - - cpdef close(self) - - cdef class DeviceMemoryResource(MemoryResource): cdef: int _dev_id diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index c82444fe3e..3a1ee4d300 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -9,6 +9,7 @@ from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor from cuda.core.experimental._stream cimport 
default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -343,6 +344,7 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # TODO: It is better to take a stream for latter deallocation return Buffer._init(ptr, size, mr=mr) + cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -400,80 +402,6 @@ cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocat if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE -cdef class IPCBufferDescriptor: - """Serializable object describing a buffer that can be shared between processes.""" - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCBufferDescriptor objects cannot be instantiated directly. Please use MemoryResource APIs.") - - @classmethod - def _init(cls, reserved: bytes, size: int): - cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) - self._reserved = reserved - self._size = size - return self - - def __reduce__(self): - return self._init, (self._reserved, self._size) - - @property - def size(self): - return self._size - - -cdef class IPCAllocationHandle: - """Shareable handle to an IPC-enabled device memory pool.""" - - def __init__(self, *arg, **kwargs): - raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls, handle: int, uuid: uuid.UUID): - cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) - assert handle >= 0 - self._handle = handle - self._uuid = uuid - return self - - cpdef close(self): - """Close the handle.""" - if self._handle >= 0: - try: - os.close(self._handle) - finally: - self._handle = -1 - self._uuid = None - - def __dealloc__(self): - self.close() - - def __int__(self) -> int: - if self._handle < 0: - raise ValueError( - f"Cannot convert IPCAllocationHandle to int: the handle (id={id(self)}) is closed." - ) - return self._handle - - @property - def handle(self) -> int: - return self._handle - - @property - def uuid(self) -> uuid.UUID: - return self._uuid - - -def _reduce_allocation_handle(alloc_handle): - df = multiprocessing.reduction.DupFd(alloc_handle.handle) - return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) - -def _reconstruct_allocation_handle(cls, df, uuid): - return cls._init(df.detach(), uuid) - - -multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. From b2517f683cd9b770dc649fc6f0996b6e847f78c4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 10:04:08 -0700 Subject: [PATCH 10/30] Move IPC registry to ipc module. 
--- cuda_core/cuda/core/experimental/_memory/ipc.pxd | 8 ++++++++ cuda_core/cuda/core/experimental/_memory/ipc.pyx | 3 +++ .../cuda/core/experimental/_memory/memory.pyx | 16 +++++----------- 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 06280c08a4..e8b67e2455 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,6 +3,13 @@ # SPDX-License-Identifier: Apache-2.0 +# Holds DeviceMemoryResource objects imported by this process. This enables +# buffer serialization, as buffers can reduce to a pair comprising the memory +# resource UUID (the key into this registry) and the serialized buffer +# descriptor. +cdef object registry + + cdef class IPCBufferDescriptor: cdef: bytes _reserved @@ -16,3 +23,4 @@ cdef class IPCAllocationHandle: cpdef close(self) + diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 82d25087e8..579baa6a1a 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -6,11 +6,14 @@ from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union import multiprocessing import os +import weakref if TYPE_CHECKING: import uuid +cdef object registry = weakref.WeakValueDictionary() + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 3a1ee4d300..f20a0d0c2f 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -10,6 +10,7 @@ from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy from cuda.bindings cimport cydriver from 
cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor +from cuda.core.experimental._memory cimport ipc from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -482,13 +483,6 @@ class DeviceMemoryResourceAttributes: del mempool_property -# Holds DeviceMemoryResource objects imported by this process. -# This enables buffer serialization, as buffers can reduce to a pair -# of comprising the memory resource UUID (the key into this registry) -# and the serialized buffer descriptor. -cdef object _ipc_registry = weakref.WeakValueDictionary() - - cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -673,7 +667,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ try: - return _ipc_registry[uuid] + return ipc.registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None @@ -686,11 +680,11 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - existing = _ipc_registry.get(uuid) + existing = ipc.registry.get(uuid) if existing is not None: return existing assert self._uuid is None or self._uuid == uuid - _ipc_registry[uuid] = self + ipc.registry[uuid] = self self._uuid = uuid return self @@ -725,7 +719,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ # Quick exit for registry hits. uuid = getattr(alloc_handle, 'uuid', None) - mr = _ipc_registry.get(uuid) + mr = ipc.registry.get(uuid) if mr is not None: return mr From a61317aa610c54609210ebd910c2fd3e6ab04d81 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 10:32:33 -0700 Subject: [PATCH 11/30] Collect and reorder DeviceMemoryResource properties. 
--- .../cuda/core/experimental/_memory/memory.pyx | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index f20a0d0c2f..32b45af2a2 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -688,14 +688,6 @@ cdef class DeviceMemoryResource(MemoryResource): self._uuid = uuid return self - @property - def uuid(self) -> Optional[uuid.UUID]: - """ - A universally unique identifier for this memory resource. Meaningful - only for IPC-enabled memory resources. - """ - return self._uuid - @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. @@ -850,24 +842,16 @@ cdef class DeviceMemoryResource(MemoryResource): """Handle to the underlying memory pool.""" return driver.CUmemoryPool((self._mempool_handle)) - @property - def is_handle_owned(self) -> bool: - """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" - return self._mempool_owned - - @property - def is_mapped(self) -> bool: - """ - Whether this is a mapping of an IPC-enabled memory resource from - another process. If True, allocation is not permitted. - """ - return self._is_mapped - @property def is_device_accessible(self) -> bool: """Return True. This memory resource provides device-accessible buffers.""" return True + @property + def is_handle_owned(self) -> bool: + """Whether the memory resource handle is owned. If False, ``close`` has no effect.""" + return self._mempool_owned + @property def is_host_accessible(self) -> bool: """Return False. 
This memory resource does not provide host-accessible buffers.""" @@ -878,6 +862,22 @@ cdef class DeviceMemoryResource(MemoryResource): """Whether this memory resource has IPC enabled.""" return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + @property + def is_mapped(self) -> bool: + """ + Whether this is a mapping of an IPC-enabled memory resource from + another process. If True, allocation is not permitted. + """ + return self._is_mapped + + @property + def uuid(self) -> Optional[uuid.UUID]: + """ + A universally unique identifier for this memory resource. Meaningful + only for IPC-enabled memory resources. + """ + return self._uuid + def _deep_reduce_device_memory_resource(mr): from .._device import Device From 0e2d1d8f404dce975580651449a0ba770b9f7762 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 11:28:48 -0700 Subject: [PATCH 12/30] Move more IPC implementation out of DeviceMemoryResource. --- .../cuda/core/experimental/_memory/ipc.pxd | 17 ++++ .../cuda/core/experimental/_memory/ipc.pyx | 81 ++++++++++++++++++- .../cuda/core/experimental/_memory/memory.pyx | 73 ++--------------- 3 files changed, 101 insertions(+), 70 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index e8b67e2455..46a7911d39 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -2,6 +2,11 @@ # # SPDX-License-Identifier: Apache-2.0 +from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.memory cimport DeviceMemoryResource + +import uuid as uuid_module + # Holds DeviceMemoryResource objects imported by this process. This enables # buffer serialization, as buffers can reduce to a pair comprising the memory @@ -9,6 +14,10 @@ # descriptor. cdef object registry +# IPC is currently only supported on Linux. 
On other platforms, the IPC handle +# type is set equal to the no-IPC handle type. +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE + cdef class IPCBufferDescriptor: cdef: @@ -24,3 +33,11 @@ cdef class IPCAllocationHandle: cpdef close(self) +# DeviceMemoryResource IPC Implementation +# ------ +cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self) +cpdef DeviceMemoryResource DMR_from_allocation_handle( + cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle +) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID) +cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 579baa6a1a..0955a43ed8 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -2,18 +2,26 @@ # # SPDX-License-Identifier: Apache-2.0 +from libc.stdint cimport intptr_t -from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils cimport ( + HANDLE_RETURN, +) + +from typing import Iterable, Literal, Optional, TypeVar, Union import multiprocessing import os +import platform +import uuid import weakref -if TYPE_CHECKING: - import uuid - cdef object registry = weakref.WeakValueDictionary() +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -88,3 +96,68 @@ def _reconstruct_allocation_handle(cls, df, uuid): multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) +cpdef IPCAllocationHandle 
DMR_get_allocation_handle(DeviceMemoryResource self): + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + + if self._alloc_handle is None: + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, IPC_HANDLE_TYPE, 0) + ) + try: + assert self._uuid is None + self._uuid = uuid.uuid4() + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise + return self._alloc_handle + + +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle): + # Quick exit for registry hits. + uuid = getattr(alloc_handle, 'uuid', None) + mr = registry.get(uuid) + if mr is not None: + return mr + + device_id = getattr(device_id, 'device_id', device_id) + + cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) + self._dev_id = device_id + self._ipc_handle_type = IPC_HANDLE_TYPE + self._mempool_owned = True + self._is_mapped = True + #self._alloc_handle = None # only used for non-imported + + cdef int handle = int(alloc_handle) + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( + &(self._mempool_handle), (handle), IPC_HANDLE_TYPE, 0) + ) + if uuid is not None: + registered = self.register(uuid) + assert registered is self + return self + + +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID): + existing = registry.get(uuid) + if existing is not None: + return existing + assert self._uuid is None or self._uuid == uuid + registry[uuid] = self + self._uuid = uuid + return self + +cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID): + try: + return registry[uuid] + except KeyError: + raise RuntimeError(f"Memory resource {uuid} was not 
found") from None diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 32b45af2a2..fbba679095 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -397,12 +397,6 @@ cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): ... -# IPC is currently only supported on Linux. On other platforms, the IPC handle -# type is set equal to the no-IPC handle type. -cdef cydriver.CUmemAllocationHandleType _IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. @@ -607,12 +601,12 @@ cdef class DeviceMemoryResource(MemoryResource): )) else: # Create a new memory pool. - if opts.ipc_enabled and _IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError("IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = _IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE properties.location.id = dev_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size @@ -665,11 +659,7 @@ cdef class DeviceMemoryResource(MemoryResource): RuntimeError If no mapped memory resource is found in the registry. 
""" - - try: - return ipc.registry[uuid] - except KeyError: - raise RuntimeError(f"Memory resource {uuid} was not found") from None + return ipc.DMR_from_registry(uuid) def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: """ @@ -680,13 +670,7 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - existing = ipc.registry.get(uuid) - if existing is not None: - return existing - assert self._uuid is None or self._uuid == uuid - ipc.registry[uuid] = self - self._uuid = uuid - return self + return ipc.DMR_register(self, uuid) @classmethod def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: @@ -709,30 +693,8 @@ cdef class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ - # Quick exit for registry hits. - uuid = getattr(alloc_handle, 'uuid', None) - mr = ipc.registry.get(uuid) - if mr is not None: - return mr - - device_id = getattr(device_id, 'device_id', device_id) - - cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) - self._dev_id = device_id - self._ipc_handle_type = _IPC_HANDLE_TYPE - self._mempool_owned = True - self._is_mapped = True - #self._alloc_handle = None # only used for non-imported - - cdef int handle = int(alloc_handle) - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._mempool_handle), (handle), _IPC_HANDLE_TYPE, 0) - ) - if uuid is not None: - registered = self.register(uuid) - assert registered is self - return self + return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) + cpdef IPCAllocationHandle get_allocation_handle(self): """Export the memory pool handle to be shared (requires IPC). @@ -744,28 +706,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. 
""" - # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - - if self._alloc_handle is None: - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, _IPC_HANDLE_TYPE, 0) - ) - try: - assert self._uuid is None - import uuid - self._uuid = uuid.uuid4() - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) - except: - os.close(alloc_handle) - raise - return self._alloc_handle + return ipc.DMR_get_allocation_handle(self) cdef Buffer _allocate(self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle From 538762991052eb8fd75f0332bf3a5b9c2e64aa2e Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 12:53:07 -0700 Subject: [PATCH 13/30] Minor refactoring. --- .../cuda/core/experimental/_memory/ipc.pxd | 12 +- .../cuda/core/experimental/_memory/ipc.pyx | 22 +- .../cuda/core/experimental/_memory/memory.pxd | 1 - .../cuda/core/experimental/_memory/memory.pyx | 218 +++++++----------- 4 files changed, 111 insertions(+), 142 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 46a7911d39..36f7721ed3 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -5,8 +5,6 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory.memory cimport DeviceMemoryResource -import uuid as uuid_module - # Holds DeviceMemoryResource objects imported by this process. 
This enables # buffer serialization, as buffers can reduce to a pair comprising the memory @@ -35,9 +33,7 @@ cdef class IPCAllocationHandle: # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self) -cpdef DeviceMemoryResource DMR_from_allocation_handle( - cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle -) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID) -cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID) +cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cpdef DeviceMemoryResource DMR_from_registry(uuid) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 0955a43ed8..428fede67e 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -50,7 +50,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid: uuid.UUID): + def _init(cls, handle: int, uuid): cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -89,6 +89,7 @@ def _reduce_allocation_handle(alloc_handle): df = multiprocessing.reduction.DupFd(alloc_handle.handle) return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) + def _reconstruct_allocation_handle(cls, df, uuid): return cls._init(df.detach(), uuid) @@ -96,6 +97,19 @@ def _reconstruct_allocation_handle(cls, df, uuid): multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle) +def _deep_reduce_device_memory_resource(mr): + from .._device import Device + device = Device(mr.device_id) + alloc_handle = mr.get_allocation_handle() + return mr.from_allocation_handle, (device, alloc_handle) + + +multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) + + +# DeviceMemoryResource IPC Implementation +# ------ + cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): # Note: This is Linux only (int for file descriptor) cdef int alloc_handle @@ -120,7 +134,7 @@ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): return self._alloc_handle -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle): +cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. 
uuid = getattr(alloc_handle, 'uuid', None) mr = registry.get(uuid) @@ -147,7 +161,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id: int | Devi return self -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UUID): +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing @@ -156,7 +170,7 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid: uuid.UU self._uuid = uuid return self -cpdef DeviceMemoryResource DMR_from_registry(uuid: uuid.UUID): +cpdef DeviceMemoryResource DMR_from_registry(uuid): try: return registry[uuid] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index ce3362fbd0..84018cd30f 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -44,7 +44,6 @@ cdef class DeviceMemoryResource(MemoryResource): object __weakref__ cpdef close(self) - cpdef IPCAllocationHandle get_allocation_handle(self) cdef Buffer _allocate(self, size_t size, _cyStream stream) cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index fbba679095..bc971d1df1 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -22,7 +22,6 @@ import abc import cython from dataclasses import dataclass, field from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union -import multiprocessing import os import platform import weakref @@ -50,36 +49,7 @@ cdef class _cyMemoryResource: raise NotImplementedError -class MemoryResourceAttributes(abc.ABC): - - __slots__ = () - - @property - @abc.abstractmethod - 
def is_device_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the device.""" - ... - - @property - @abc.abstractmethod - def is_host_accessible(self) -> bool: - """bool: True if buffers allocated by this resource can be accessed on the host.""" - ... - - @property - @abc.abstractmethod - def device_id(self) -> int: - """int: The device ordinal for which this memory resource is responsible. - - Raises - ------ - RuntimeError - If the resource is not bound to a specific device. - """ - ... - - -cdef class Buffer(_cyBuffer, MemoryResourceAttributes): +cdef class Buffer(_cyBuffer): """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -122,6 +92,47 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # Must not serialize the parent's stream! return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) + @staticmethod + def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. 
+ + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + # TODO: It is better to take a stream for latter deallocation + return Buffer._init(ptr, size, mr=mr) + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + """Import a buffer that was exported from another process.""" + if not mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if stream is None: + # Note: match this behavior to DeviceMemoryResource.allocate() + stream = default_stream() + cdef cydriver.CUmemPoolPtrExportData data + memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) + + def get_ipc_descriptor(self) -> IPCBufferDescriptor: + """Export a buffer allocated for sharing between processes.""" + if not self._mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) + cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. @@ -150,79 +161,6 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): self._ptr_obj = None self._alloc_stream = None - @property - def handle(self) -> DevicePointerT: - """Return the buffer handle object. - - .. caution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Buffer.handle)``. 
- """ - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 - - @property - def size(self) -> int: - """Return the memory size of this buffer.""" - return self._size - - @property - def memory_resource(self) -> MemoryResource: - """Return the memory resource associated with this buffer.""" - return self._mr - - @property - def is_device_accessible(self) -> bool: - """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def is_host_accessible(self) -> bool: - """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def device_id(self) -> int: - """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - def get_ipc_descriptor(self) -> IPCBufferDescriptor: - """Export a buffer allocated for sharing between processes.""" - if not self._mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - cdef cydriver.CUmemPoolPtrExportData data - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - return IPCBufferDescriptor._init(data_b, self.size) - - @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: - """Import a buffer that was exported from another process.""" - if not 
mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if stream is None: - # Note: match this behavior to DeviceMemoryResource.allocate() - stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -329,24 +267,56 @@ cdef class Buffer(_cyBuffer, MemoryResourceAttributes): # Supporting method paired with __buffer__. raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") - @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: - """Create a new :class:`Buffer` object from a pointer. + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + if self._mr is not None: + return self._mr.device_id + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - Allocated buffer handle object - size : int - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer + @property + def handle(self) -> DevicePointerT: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. 
""" - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr) + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return 0 + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_device_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_host_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") -cdef class MemoryResource(_cyMemoryResource, MemoryResourceAttributes, abc.ABC): + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + return self._mr + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + return self._size + + +cdef class MemoryResource(_cyMemoryResource): """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties @@ -696,7 +666,7 @@ cdef class DeviceMemoryResource(MemoryResource): return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) - cpdef IPCAllocationHandle get_allocation_handle(self): + def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). The handle can be used to share the memory pool with other processes. 
@@ -820,13 +790,3 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid -def _deep_reduce_device_memory_resource(mr): - from .._device import Device - device = Device(mr.device_id) - alloc_handle = mr.get_allocation_handle() - return mr.from_allocation_handle, (device, alloc_handle) - - -multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) - - From 7fa38cad5161f68baf23988fc0cb42133cda6260 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 14:42:46 -0700 Subject: [PATCH 14/30] Move Buffer IPC implementation. --- .../cuda/core/experimental/_memory/ipc.pxd | 8 +++++- .../cuda/core/experimental/_memory/ipc.pyx | 28 +++++++++++++++++++ .../cuda/core/experimental/_memory/memory.pyx | 21 ++------------ 3 files changed, 37 insertions(+), 20 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index 36f7721ed3..d2096a6299 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.memory cimport DeviceMemoryResource +from cuda.core.experimental._memory.memory cimport Buffer, DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. 
This enables @@ -31,6 +31,12 @@ cdef class IPCAllocationHandle: cpdef close(self) +# Buffer IPC Implementation +# ------ +cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) +cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) + + # DeviceMemoryResource IPC Implementation # ------ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 428fede67e..2062a1c06e 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -2,9 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 +cimport cpython from libc.stdint cimport intptr_t +from libc.string cimport memcpy from cuda.bindings cimport cydriver +from cuda.core.experimental._stream cimport default_stream from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -107,6 +110,31 @@ def _deep_reduce_device_memory_resource(mr): multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource) +# Buffer IPC Implementation +# ------ +cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): + if not self._mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + cdef cydriver.CUmemPoolPtrExportData data + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + return IPCBufferDescriptor._init(data_b, self.size) + +cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream): + """Import a buffer that was exported from another process.""" + if not mr.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if stream is None: + # Note: match this behavior to DeviceMemoryResource.allocate() + stream = default_stream() + cdef 
cydriver.CUmemPoolPtrExportData data + memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + cdef cydriver.CUdeviceptr ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) + # DeviceMemoryResource IPC Implementation # ------ diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index bc971d1df1..23c55d881b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -4,7 +4,6 @@ from __future__ import annotations -cimport cpython from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset, memcpy @@ -111,27 +110,11 @@ cdef class Buffer(_cyBuffer): @classmethod def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: """Import a buffer that was exported from another process.""" - if not mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if stream is None: - # Note: match this behavior to DeviceMemoryResource.allocate() - stream = default_stream() - cdef cydriver.CUmemPoolPtrExportData data - memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) - cdef cydriver.CUdeviceptr ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) + return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" - if not self._mr.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - cdef cydriver.CUmemPoolPtrExportData data - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = 
cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) - return IPCBufferDescriptor._init(data_b, self.size) + return ipc.Buffer_get_ipc_descriptor(self) cpdef close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. From f357abdc104159e1e0188e3f9d3ca67e391ce7e5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 14:54:29 -0700 Subject: [PATCH 15/30] Simplify the class hierarchy (remove _cyBuffer and _cyMemoryResource). --- .../cuda/core/experimental/_memory/memory.pxd | 13 +++---------- .../cuda/core/experimental/_memory/memory.pyx | 15 ++------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index 84018cd30f..b582d8de5b 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -9,25 +9,18 @@ from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport Stream as _cyStream -cdef class _cyBuffer: +cdef class Buffer: cdef: intptr_t _ptr size_t _size - _cyMemoryResource _mr + MemoryResource _mr object _ptr_obj _cyStream _alloc_stream - -cdef class _cyMemoryResource: - cdef Buffer _allocate(self, size_t size, _cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept - - -cdef class Buffer(_cyBuffer): cpdef close(self, stream=*) -cdef class MemoryResource(_cyMemoryResource): +cdef class MemoryResource: cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 23c55d881b..2a7c0eca2d 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -37,18 +37,7 @@ if TYPE_CHECKING: DevicePointerT = 
Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" -cdef class _cyMemoryResource: - """ - Internal only. Responsible for offering fast C method access. - """ - cdef Buffer _allocate(self, size_t size, _cyStream stream): - raise NotImplementedError - - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - raise NotImplementedError - - -cdef class Buffer(_cyBuffer): +cdef class Buffer: """Represent a handle to allocated memory. This generic object provides a unified representation for how @@ -299,7 +288,7 @@ cdef class Buffer(_cyBuffer): return self._size -cdef class MemoryResource(_cyMemoryResource): +cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. Subclasses must implement methods for allocating and deallocation, as well as properties From 89057f9efb83b49b22fd3196bd4bd46c8d62f8cb Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:02:09 -0700 Subject: [PATCH 16/30] Refactor to shrink Cython interface. 
--- .../cuda/core/experimental/_memory/memory.pxd | 4 - .../cuda/core/experimental/_memory/memory.pyx | 85 ++++++++++--------- 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index b582d8de5b..c0013b01ac 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -36,7 +36,3 @@ cdef class DeviceMemoryResource(MemoryResource): IPCAllocationHandle _alloc_handle object __weakref__ - cpdef close(self) - cdef Buffer _allocate(self, size_t size, _cyStream stream) - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept - cpdef deallocate(self, ptr, size_t size, stream=*) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 2a7c0eca2d..ac58295156 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -567,26 +567,11 @@ cdef class DeviceMemoryResource(MemoryResource): self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid def __dealloc__(self): - self.close() + DMR_close(self) - cpdef close(self): + def close(self): """Close the device memory resource and destroy the associated memory pool if owned.""" - if self._mempool_handle == NULL: - return - - try: - if self._mempool_owned: - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) - finally: - self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL - self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX - self._mempool_owned = False - self._is_mapped = False - self._uuid = None - self._alloc_handle = None + DMR_close(self) def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @@ -637,7 +622,6 @@ cdef class 
DeviceMemoryResource(MemoryResource): """ return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) - def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). @@ -650,19 +634,6 @@ cdef class DeviceMemoryResource(MemoryResource): """ return ipc.DMR_get_allocation_handle(self) - cdef Buffer _allocate(self, size_t size, _cyStream stream): - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr - with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) - cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) - buf._ptr_obj = None - buf._size = size - buf._mr = self - buf._alloc_stream = stream - return buf - def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -684,15 +655,9 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return self._allocate(size, <_cyStream>stream) - - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - cdef cydriver.CUstream s = stream._handle - cdef cydriver.CUdeviceptr devptr = ptr - with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + return DMR_allocate(self, size, <_cyStream>stream) - cpdef deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. Parameters @@ -706,7 +671,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - self._deallocate(ptr, size, <_cyStream>stream) + DMR_deallocate(self, ptr, size, <_cyStream>stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -762,3 +727,41 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid +cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr + with nogil: + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + cdef Buffer buf = Buffer.__new__(Buffer) + buf._ptr = (devptr) + buf._ptr_obj = None + buf._size = size + buf._mr = self + buf._alloc_stream = stream + return buf + + +cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _cyStream stream) noexcept: + cdef cydriver.CUstream s = stream._handle + cdef cydriver.CUdeviceptr devptr = ptr + with nogil: + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + + +cdef DMR_close(DeviceMemoryResource self): + if self._mempool_handle == NULL: + return + + try: + if self._mempool_owned: + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + finally: + self._dev_id = cydriver.CU_DEVICE_INVALID + self._mempool_handle = NULL + self._attributes = None + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX + self._mempool_owned = False + self._is_mapped = False + self._uuid = None + self._alloc_handle = None From 00b60ebe75175907e38132a034c3796e4d7c9499 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:25:53 -0700 Subject: [PATCH 17/30] Simplify Buffer close. 
--- .../cuda/core/experimental/_memory/memory.pxd | 4 +- .../cuda/core/experimental/_memory/memory.pyx | 40 ++++++++++--------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/memory.pxd index c0013b01ac..0334bba731 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/memory.pxd @@ -17,11 +17,9 @@ cdef class Buffer: object _ptr_obj _cyStream _alloc_stream - cpdef close(self, stream=*) - cdef class MemoryResource: - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept + pass cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index ac58295156..00a7feac78 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -105,7 +105,7 @@ cdef class Buffer: """Export a buffer allocated for sharing between processes.""" return ipc.Buffer_get_ipc_descriptor(self) - cpdef close(self, stream: Stream = None): + def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. This buffer is released back to their memory resource @@ -117,21 +117,7 @@ cdef class Buffer: The stream object to use for asynchronous deallocation. If None, the behavior depends on the underlying memory resource. 
""" - cdef _cyStream s - if self._ptr and self._mr is not None: - if stream is None: - if self._alloc_stream is not None: - s = self._alloc_stream - else: - # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) - else: - s = <_cyStream>stream - self._mr._deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None + Buffer_close(self, stream) def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: """Copy from this buffer to the dst buffer asynchronously on the given stream. @@ -288,6 +274,24 @@ cdef class Buffer: return self._size +cdef Buffer_close(Buffer self, stream): + cdef _cyStream s + if self._ptr and self._mr is not None: + if stream is None: + if self._alloc_stream is not None: + s = self._alloc_stream + else: + # TODO: remove this branch when from_handle takes a stream + s = <_cyStream>(default_stream()) + else: + s = <_cyStream>stream + self._mr.deallocate(self._ptr, self._size, s) + self._ptr = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + cdef class MemoryResource: """Abstract base class for memory resources that manage allocation and deallocation of buffers. @@ -297,8 +301,6 @@ cdef class MemoryResource: hold a reference to self, the buffer properties are retrieved simply by looking up the underlying memory resource's respective property.) 
""" - cdef void _deallocate(self, intptr_t ptr, size_t size, _cyStream stream) noexcept: - self.deallocate(ptr, size, stream) @abc.abstractmethod def allocate(self, size_t size, stream: Stream = None) -> Buffer: @@ -745,7 +747,7 @@ cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _ cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: - HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) + HANDLE_RETURN(cydriver.cuMemFreeAsync(devptr, s)) cdef DMR_close(DeviceMemoryResource self): From ecc9405a7c99003d1088fc5938d24e423a3d7e26 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Thu, 30 Oct 2025 15:43:17 -0700 Subject: [PATCH 18/30] Refactor DeviceMemoryResource.__init__. --- .../cuda/core/experimental/_memory/memory.pyx | 113 ++++++++++-------- 1 file changed, 63 insertions(+), 50 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/memory.pyx index 00a7feac78..55b2fd5d0e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/memory.pyx @@ -515,58 +515,11 @@ cdef class DeviceMemoryResource(MemoryResource): opts = check_or_create_options( DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True ) - cdef cydriver.cuuint64_t current_threshold - cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX - cdef cydriver.CUmemPoolProps properties if opts is None: - # Get the current memory pool. - self._dev_id = dev_id - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._mempool_owned = False - - with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) - - # Set a higher release threshold to improve performance when there are no active allocations. 
- # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. - # Check current release threshold - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) - ) - - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if current_threshold == 0: - HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._mempool_handle, - cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - &max_threshold - )) + DMR_init_current(self, dev_id) else: - # Create a new memory pool. - if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: - raise RuntimeError("IPC is not available on {platform.system()}") - - memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) - properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - properties.location.id = dev_id - properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE - properties.maxSize = opts.max_size - properties.win32SecurityAttributes = NULL - properties.usage = 0 - - self._dev_id = dev_id - self._ipc_handle_type = properties.handleTypes - self._mempool_owned = True - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) - # TODO: should we also set the threshold here? 
- - if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + DMR_init_create(self, dev_id, opts) def __dealloc__(self): DMR_close(self) @@ -602,7 +555,9 @@ cdef class DeviceMemoryResource(MemoryResource): return ipc.DMR_register(self, uuid) @classmethod - def from_allocation_handle(cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle) -> DeviceMemoryResource: + def from_allocation_handle( + cls, device_id: int | Device, alloc_handle: int | IPCAllocationHandle + ) -> DeviceMemoryResource: """Create a device memory resource from an allocation handle. Construct a new `DeviceMemoryResource` instance that imports a memory @@ -729,6 +684,63 @@ cdef class DeviceMemoryResource(MemoryResource): return self._uuid +cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): + # Get the current memory pool. + cdef cydriver.cuuint64_t current_threshold + cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX + + self._dev_id = dev_id + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + self._mempool_owned = False + + with nogil: + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) + + # Set a higher release threshold to improve performance when there are no active allocations. + # By default, the release threshold is 0, which means memory is immediately released back + # to the OS when there are no active suballocations, causing performance issues. 
+ # Check current release threshold + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( + self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + ) + + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if current_threshold == 0: + HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( + self._mempool_handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + &max_threshold + )) + + +cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts): + # Create a new memory pool. + cdef cydriver.CUmemPoolProps properties + + if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + raise RuntimeError("IPC is not available on {platform.system()}") + + memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) + properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED + properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.location.id = dev_id + properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE + properties.maxSize = opts.max_size + properties.win32SecurityAttributes = NULL + properties.usage = 0 + + self._dev_id = dev_id + self._ipc_handle_type = properties.handleTypes + self._mempool_owned = True + + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + # TODO: should we also set the threshold here? 
+ + if opts.ipc_enabled: + self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + + cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr @@ -767,3 +779,4 @@ cdef DMR_close(DeviceMemoryResource self): self._is_mapped = False self._uuid = None self._alloc_handle = None + From 228936bbd6ed1a5ed2ff9971dae33dc84a2a69e4 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 09:40:28 -0700 Subject: [PATCH 19/30] Move Buffer into a separate module. --- .../core/experimental/_memory/__init__.py | 11 +- .../cuda/core/experimental/_memory/buffer.pxd | 22 ++ .../cuda/core/experimental/_memory/buffer.pyx | 331 ++++++++++++++++++ .../_memory/{memory.pxd => dmr.pxd} | 16 +- .../_memory/{memory.pyx => dmr.pyx} | 323 +---------------- .../cuda/core/experimental/_memory/ipc.pxd | 3 +- .../cuda/core/experimental/_memory/legacy.py | 11 +- .../cuda/core/experimental/_memory/vmm.py | 4 +- 8 files changed, 383 insertions(+), 338 deletions(-) create mode 100644 cuda_core/cuda/core/experimental/_memory/buffer.pxd create mode 100644 cuda_core/cuda/core/experimental/_memory/buffer.pyx rename cuda_core/cuda/core/experimental/_memory/{memory.pxd => dmr.pxd} (66%) rename cuda_core/cuda/core/experimental/_memory/{memory.pyx => dmr.pyx} (60%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 8c6bc13196..d5dbccee64 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -1,5 +1,10 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from .buffer import * +from .dmr import * from .ipc import * -from .legacy import LegacyPinnedMemoryResource, _SynchronousMemoryResource -from .memory import * -from .vmm import VirtualMemoryResourceOptions, VirtualMemoryResource +from .legacy import * +from .vmm import * diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pxd b/cuda_core/cuda/core/experimental/_memory/buffer.pxd new file mode 100644 index 0000000000..b6c75f63cc --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pxd @@ -0,0 +1,22 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._stream cimport Stream as _cyStream + + +cdef class Buffer: + cdef: + intptr_t _ptr + size_t _size + MemoryResource _mr + object _ptr_obj + _cyStream _alloc_stream + + +cdef class MemoryResource: + pass + + diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/buffer.pyx new file mode 100644 index 0000000000..6d7c238d7d --- /dev/null +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pyx @@ -0,0 +1,331 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory.ipc cimport IPCBufferDescriptor +from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._utils.cuda_utils cimport ( + _check_driver_error as raise_if_driver_error, +) + +import abc +from typing import TypeVar, Union + +from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule +from cuda.core.experimental._stream import Stream +from cuda.core.experimental._utils.cuda_utils import driver + + +DevicePointerT = Union[driver.CUdeviceptr, int, None] +"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" + +cdef class Buffer: + """Represent a handle to allocated memory. + + This generic object provides a unified representation for how + different memory resources are to give access to their memory + allocations. + + Support for data interchange mechanisms are provided by DLPack. + """ + def __cinit__(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + def _clear(self): + self._ptr = 0 + self._size = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + def __init__(self, *args, **kwargs): + raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") + + @classmethod + def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): + cdef Buffer self = Buffer.__new__(cls) + self._ptr = (int(ptr)) + self._ptr_obj = ptr + self._size = size + self._mr = mr + self._alloc_stream = <_cyStream>(stream) if stream is not None else None + return self + + def __dealloc__(self): + self.close(self._alloc_stream) + + def __reduce__(self): + # Must not serialize the parent's stream! + return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) + + @staticmethod + def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + """Create a new :class:`Buffer` object from a pointer. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + Allocated buffer handle object + size : int + Memory size of the buffer + mr : :obj:`~_memory.MemoryResource`, optional + Memory resource associated with the buffer + """ + # TODO: It is better to take a stream for latter deallocation + return Buffer._init(ptr, size, mr=mr) + + @classmethod + def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + """Import a buffer that was exported from another process.""" + return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) + + def get_ipc_descriptor(self) -> IPCBufferDescriptor: + """Export a buffer allocated for sharing between processes.""" + return ipc.Buffer_get_ipc_descriptor(self) + + def close(self, stream: Stream = None): + """Deallocate this buffer asynchronously on the given stream. + + This buffer is released back to their memory resource + asynchronously on the given stream. + + Parameters + ---------- + stream : Stream, optional + The stream object to use for asynchronous deallocation. If None, + the behavior depends on the underlying memory resource. 
+ """ + Buffer_close(self, stream) + + def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: + """Copy from this buffer to the dst buffer asynchronously on the given stream. + + Copies the data from this buffer to the provided dst buffer. + If the dst buffer is not provided, then a new buffer is first + allocated using the associated memory resource before the copy. + + Parameters + ---------- + dst : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : Stream + Keyword argument specifying the stream for the + asynchronous copy + + """ + if stream is None: + raise ValueError("stream must be provided") + + cdef size_t src_size = self._size + + if dst is None: + if self._mr is None: + raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") + dst = self._mr.allocate(src_size, stream) + + cdef size_t dst_size = dst._size + if dst_size != src_size: + raise ValueError( + f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + ) + err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) + raise_if_driver_error(err) + return dst + + def copy_from(self, src: Buffer, *, stream: Stream): + """Copy from the src buffer to this buffer asynchronously on the given stream. 
+ + Parameters + ---------- + src : :obj:`~_memory.Buffer` + Source buffer to copy data from + stream : Stream + Keyword argument specifying the stream for the + asynchronous copy + + """ + if stream is None: + raise ValueError("stream must be provided") + + cdef size_t dst_size = self._size + cdef size_t src_size = src._size + + if src_size != dst_size: + raise ValueError( + f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + ) + err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) + raise_if_driver_error(err) + + def __dlpack__( + self, + *, + stream: int | None = None, + max_version: tuple[int, int] | None = None, + dl_device: tuple[int, int] | None = None, + copy: bool | None = None, + ) -> TypeVar("PyCapsule"): + # Note: we ignore the stream argument entirely (as if it is -1). + # It is the user's responsibility to maintain stream order. + if dl_device is not None: + raise BufferError("Sorry, not supported: dl_device other than None") + if copy is True: + raise BufferError("Sorry, not supported: copy=True") + if max_version is None: + versioned = False + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") + versioned = max_version >= (1, 0) + capsule = make_py_capsule(self, versioned) + return capsule + + def __dlpack_device__(self) -> tuple[int, int]: + cdef bint d = self.is_device_accessible + cdef bint h = self.is_host_accessible + if d and (not h): + return (DLDeviceType.kDLCUDA, self.device_id) + if d and h: + # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks + return (DLDeviceType.kDLCUDAHost, 0) + if (not d) and h: + return (DLDeviceType.kDLCPU, 0) + raise BufferError("buffer is neither device-accessible nor host-accessible") + + def __buffer__(self, flags: int, /) -> memoryview: + # Support for Python-level buffer protocol as per PEP 688. 
+ # This raises a BufferError unless: + # 1. Python is 3.12+ + # 2. This Buffer object is host accessible + raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.") + + def __release_buffer__(self, buffer: memoryview, /): + # Supporting method paired with __buffer__. + raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") + + @property + def device_id(self) -> int: + """Return the device ordinal of this buffer.""" + if self._mr is not None: + return self._mr.device_id + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def handle(self) -> DevicePointerT: + """Return the buffer handle object. + + .. caution:: + + This handle is a Python object. To get the memory address of the underlying C + handle, call ``int(Buffer.handle)``. + """ + if self._ptr_obj is not None: + return self._ptr_obj + elif self._ptr: + return self._ptr + else: + # contract: Buffer is closed + return 0 + + @property + def is_device_accessible(self) -> bool: + """Return True if this buffer can be accessed by the GPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_device_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def is_host_accessible(self) -> bool: + """Return True if this buffer can be accessed by the CPU, otherwise False.""" + if self._mr is not None: + return self._mr.is_host_accessible + raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") + + @property + def memory_resource(self) -> MemoryResource: + """Return the memory resource associated with this buffer.""" + return self._mr + + @property + def size(self) -> int: + """Return the memory size of this buffer.""" + return self._size + + +cdef Buffer_close(Buffer self, stream): + cdef _cyStream s + if self._ptr and self._mr is not 
None: + if stream is None: + if self._alloc_stream is not None: + s = self._alloc_stream + else: + # TODO: remove this branch when from_handle takes a stream + s = <_cyStream>(default_stream()) + else: + s = <_cyStream>stream + self._mr.deallocate(self._ptr, self._size, s) + self._ptr = 0 + self._mr = None + self._ptr_obj = None + self._alloc_stream = None + + +cdef class MemoryResource: + """Abstract base class for memory resources that manage allocation and deallocation of buffers. + + Subclasses must implement methods for allocating and deallocation, as well as properties + associated with this memory resource from which all allocated buffers will inherit. (Since + all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would + hold a reference to self, the buffer properties are retrieved simply by looking up the underlying + memory resource's respective property.) + """ + + @abc.abstractmethod + def allocate(self, size_t size, stream: Stream = None) -> Buffer: + """Allocate a buffer of the requested size. + + Parameters + ---------- + size : int + The size of the buffer to allocate, in bytes. + stream : Stream, optional + The stream on which to perform the allocation asynchronously. + If None, it is up to each memory resource implementation to decide + and document the behavior. + + Returns + ------- + Buffer + The allocated buffer object, which can be used for device or host operations + depending on the resource's properties. + """ + ... + + @abc.abstractmethod + def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): + """Deallocate a buffer previously allocated by this resource. + + Parameters + ---------- + ptr : :obj:`~_memory.DevicePointerT` + The pointer or handle to the buffer to deallocate. + size : int + The size of the buffer to deallocate, in bytes. + stream : Stream, optional + The stream on which to perform the deallocation asynchronously. 
+ If None, it is up to each memory resource implementation to decide + and document the behavior. + """ + ... + + diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd similarity index 66% rename from cuda_core/cuda/core/experimental/_memory/memory.pxd rename to cuda_core/cuda/core/experimental/_memory/dmr.pxd index 0334bba731..c3572d34b7 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -2,24 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport uintptr_t, intptr_t from cuda.bindings cimport cydriver +from cuda.core.experimental._memory.buffer cimport MemoryResource from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle -from cuda.core.experimental._stream cimport Stream as _cyStream - - -cdef class Buffer: - cdef: - intptr_t _ptr - size_t _size - MemoryResource _mr - object _ptr_obj - _cyStream _alloc_stream - - -cdef class MemoryResource: - pass cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/memory.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx similarity index 60% rename from cuda_core/cuda/core/experimental/_memory/memory.pyx rename to cuda_core/cuda/core/experimental/_memory/dmr.pyx index 55b2fd5d0e..bdfb65f04e 100644 --- a/cuda_core/cuda/core/experimental/_memory/memory.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -6,10 +6,12 @@ from __future__ import annotations from libc.limits cimport ULLONG_MAX from libc.stdint cimport uintptr_t, intptr_t -from libc.string cimport memset, memcpy +from libc.string cimport memset + from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle, IPCBufferDescriptor +from cuda.core.experimental._memory.buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._memory.ipc cimport 
IPCAllocationHandle from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -17,330 +19,21 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) -import abc import cython -from dataclasses import dataclass, field -from typing import Iterable, Literal, Optional, TYPE_CHECKING, TypeVar, Union -import os +from dataclasses import dataclass +from typing import Optional, TYPE_CHECKING import platform import weakref -from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version) +from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device import uuid -DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" - -cdef class Buffer: - """Represent a handle to allocated memory. - - This generic object provides a unified representation for how - different memory resources are to give access to their memory - allocations. - - Support for data interchange mechanisms are provided by DLPack. - """ - def __cinit__(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - def _clear(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - def __init__(self, *args, **kwargs): - raise RuntimeError("Buffer objects cannot be instantiated directly. 
Please use MemoryResource APIs.") - - @classmethod - def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): - cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) - self._ptr_obj = ptr - self._size = size - self._mr = mr - self._alloc_stream = <_cyStream>(stream) if stream is not None else None - return self - - def __dealloc__(self): - self.close(self._alloc_stream) - - def __reduce__(self): - # Must not serialize the parent's stream! - return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) - - @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: - """Create a new :class:`Buffer` object from a pointer. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - Allocated buffer handle object - size : int - Memory size of the buffer - mr : :obj:`~_memory.MemoryResource`, optional - Memory resource associated with the buffer - """ - # TODO: It is better to take a stream for latter deallocation - return Buffer._init(ptr, size, mr=mr) - - @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: - """Import a buffer that was exported from another process.""" - return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) - - def get_ipc_descriptor(self) -> IPCBufferDescriptor: - """Export a buffer allocated for sharing between processes.""" - return ipc.Buffer_get_ipc_descriptor(self) - - def close(self, stream: Stream = None): - """Deallocate this buffer asynchronously on the given stream. - - This buffer is released back to their memory resource - asynchronously on the given stream. - - Parameters - ---------- - stream : Stream, optional - The stream object to use for asynchronous deallocation. If None, - the behavior depends on the underlying memory resource. 
- """ - Buffer_close(self, stream) - - def copy_to(self, dst: Buffer = None, *, stream: Stream) -> Buffer: - """Copy from this buffer to the dst buffer asynchronously on the given stream. - - Copies the data from this buffer to the provided dst buffer. - If the dst buffer is not provided, then a new buffer is first - allocated using the associated memory resource before the copy. - - Parameters - ---------- - dst : :obj:`~_memory.Buffer` - Source buffer to copy data from - stream : Stream - Keyword argument specifying the stream for the - asynchronous copy - - """ - if stream is None: - raise ValueError("stream must be provided") - - cdef size_t src_size = self._size - - if dst is None: - if self._mr is None: - raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") - dst = self._mr.allocate(src_size, stream) - - cdef size_t dst_size = dst._size - if dst_size != src_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" - ) - err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) - raise_if_driver_error(err) - return dst - - def copy_from(self, src: Buffer, *, stream: Stream): - """Copy from the src buffer to this buffer asynchronously on the given stream. 
- - Parameters - ---------- - src : :obj:`~_memory.Buffer` - Source buffer to copy data from - stream : Stream - Keyword argument specifying the stream for the - asynchronous copy - - """ - if stream is None: - raise ValueError("stream must be provided") - - cdef size_t dst_size = self._size - cdef size_t src_size = src._size - - if src_size != dst_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" - ) - err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) - raise_if_driver_error(err) - - def __dlpack__( - self, - *, - stream: int | None = None, - max_version: tuple[int, int] | None = None, - dl_device: tuple[int, int] | None = None, - copy: bool | None = None, - ) -> TypeVar("PyCapsule"): - # Note: we ignore the stream argument entirely (as if it is -1). - # It is the user's responsibility to maintain stream order. - if dl_device is not None: - raise BufferError("Sorry, not supported: dl_device other than None") - if copy is True: - raise BufferError("Sorry, not supported: copy=True") - if max_version is None: - versioned = False - else: - if not isinstance(max_version, tuple) or len(max_version) != 2: - raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") - versioned = max_version >= (1, 0) - capsule = make_py_capsule(self, versioned) - return capsule - - def __dlpack_device__(self) -> tuple[int, int]: - cdef bint d = self.is_device_accessible - cdef bint h = self.is_host_accessible - if d and (not h): - return (DLDeviceType.kDLCUDA, self.device_id) - if d and h: - # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks - return (DLDeviceType.kDLCUDAHost, 0) - if (not d) and h: - return (DLDeviceType.kDLCPU, 0) - raise BufferError("buffer is neither device-accessible nor host-accessible") - - def __buffer__(self, flags: int, /) -> memoryview: - # Support for Python-level buffer protocol as per PEP 688. 
- # This raises a BufferError unless: - # 1. Python is 3.12+ - # 2. This Buffer object is host accessible - raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.") - - def __release_buffer__(self, buffer: memoryview, /): - # Supporting method paired with __buffer__. - raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") - - @property - def device_id(self) -> int: - """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def handle(self) -> DevicePointerT: - """Return the buffer handle object. - - .. caution:: - - This handle is a Python object. To get the memory address of the underlying C - handle, call ``int(Buffer.handle)``. - """ - if self._ptr_obj is not None: - return self._ptr_obj - elif self._ptr: - return self._ptr - else: - # contract: Buffer is closed - return 0 - - @property - def is_device_accessible(self) -> bool: - """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def is_host_accessible(self) -> bool: - """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible - raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") - - @property - def memory_resource(self) -> MemoryResource: - """Return the memory resource associated with this buffer.""" - return self._mr - - @property - def size(self) -> int: - """Return the memory size of this buffer.""" - return self._size - - -cdef Buffer_close(Buffer self, stream): - cdef _cyStream s - if self._ptr and self._mr is not 
None: - if stream is None: - if self._alloc_stream is not None: - s = self._alloc_stream - else: - # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) - else: - s = <_cyStream>stream - self._mr.deallocate(self._ptr, self._size, s) - self._ptr = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None - - -cdef class MemoryResource: - """Abstract base class for memory resources that manage allocation and deallocation of buffers. - - Subclasses must implement methods for allocating and deallocation, as well as properties - associated with this memory resource from which all allocated buffers will inherit. (Since - all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would - hold a reference to self, the buffer properties are retrieved simply by looking up the underlying - memory resource's respective property.) - """ - - @abc.abstractmethod - def allocate(self, size_t size, stream: Stream = None) -> Buffer: - """Allocate a buffer of the requested size. - - Parameters - ---------- - size : int - The size of the buffer to allocate, in bytes. - stream : Stream, optional - The stream on which to perform the allocation asynchronously. - If None, it is up to each memory resource implementation to decide - and document the behavior. - - Returns - ------- - Buffer - The allocated buffer object, which can be used for device or host operations - depending on the resource's properties. - """ - ... - - @abc.abstractmethod - def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): - """Deallocate a buffer previously allocated by this resource. - - Parameters - ---------- - ptr : :obj:`~_memory.DevicePointerT` - The pointer or handle to the buffer to deallocate. - size : int - The size of the buffer to deallocate, in bytes. - stream : Stream, optional - The stream on which to perform the deallocation asynchronously. 
- If None, it is up to each memory resource implementation to decide - and document the behavior. - """ - ... - - @dataclass cdef class DeviceMemoryResourceOptions: """Customizable :obj:`~_memory.DeviceMemoryResource` options. diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index d2096a6299..f3444028e4 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -3,7 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.memory cimport Buffer, DeviceMemoryResource +from cuda.core.experimental._memory.buffer cimport Buffer +from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/legacy.py index d8507967c8..487ddeae5a 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/legacy.py @@ -2,15 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import Union +from __future__ import annotations -from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from typing import TYPE_CHECKING + +from cuda.core.experimental._memory.buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( driver, _check_driver_error as raise_if_driver_error, ) -DevicePointerT = Union[driver.CUdeviceptr, int, None] +if TYPE_CHECKING: + from cuda.core.experimental._memory.buffer import DevicePointerT + +__all__ = ["LegacyPinnedMemoryResource", "_SynchronousMemoryResource"] class LegacyPinnedMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/vmm.py index 60ba8280d8..44c9250de3 100644 --- 
a/cuda_core/cuda/core/experimental/_memory/vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/vmm.py @@ -7,13 +7,15 @@ import platform from cuda.core.experimental._stream import Stream -from cuda.core.experimental._memory.memory import Buffer, MemoryResource +from cuda.core.experimental._memory.buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) from cuda.core.experimental._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, check_or_create_options, ) +__all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"] + VirtualMemoryHandleTypeT = Union[Literal["posix_fd", "generic", "win32", "win32_kmt", "fabric"], None] VirtualMemoryLocationTypeT = Literal["device", "host", "host_numa", "host_numa_current"] VirtualMemoryGranularityT = Literal["minimum", "recommended"] From 9a86bde8bd1fb4ff1cda5fe20cfbfc196d2cb3c7 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 11:27:27 -0700 Subject: [PATCH 20/30] Refactors DeviceMemoryResource IPC implementation. 
--- .../cuda/core/experimental/_memory/dmr.pxd | 5 ++-- .../cuda/core/experimental/_memory/dmr.pyx | 26 +++++++++++++----- .../cuda/core/experimental/_memory/ipc.pyx | 27 +++++-------------- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd index c3572d34b7..b34c08f287 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -13,10 +13,11 @@ cdef class DeviceMemoryResource(MemoryResource): int _dev_id cydriver.CUmemoryPool _mempool_handle object _attributes - cydriver.CUmemAllocationHandleType _ipc_handle_type bint _mempool_owned + object __weakref__ + + cydriver.CUmemAllocationHandleType _ipc_handle_type bint _is_mapped object _uuid IPCAllocationHandle _alloc_handle - object __weakref__ diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index bdfb65f04e..5193a2d7a8 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -22,7 +22,9 @@ from cuda.core.experimental._utils.cuda_utils cimport ( import cython from dataclasses import dataclass from typing import Optional, TYPE_CHECKING +import os import platform +import uuid import weakref from cuda.core.experimental._stream import Stream @@ -31,7 +33,6 @@ from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device - import uuid @dataclass @@ -197,8 +198,8 @@ cdef class DeviceMemoryResource(MemoryResource): self._dev_id = cydriver.CU_DEVICE_INVALID self._mempool_handle = NULL self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = 
False self._uuid = None self._alloc_handle = None @@ -383,8 +384,8 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX self._dev_id = dev_id - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE with nogil: HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) @@ -423,15 +424,28 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes properties.usage = 0 self._dev_id = dev_id - self._ipc_handle_type = properties.handleTypes self._mempool_owned = True with nogil: HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) # TODO: should we also set the threshold here? + # Note: This is Linux only (int for file descriptor) + cdef int alloc_handle + if opts.ipc_enabled: - self.get_allocation_handle() # enables Buffer.get_ipc_descriptor, sets uuid + self._ipc_handle_type = ipc.IPC_HANDLE_TYPE + self._is_mapped = False + self._uuid = uuid.uuid4() + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) + ) + try: + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + except: + os.close(alloc_handle) + raise cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): @@ -467,8 +481,8 @@ cdef DMR_close(DeviceMemoryResource self): self._dev_id = cydriver.CU_DEVICE_INVALID self._mempool_handle = NULL self._attributes = None - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX self._mempool_owned = False + self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False self._uuid = None self._alloc_handle = None diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx 
b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 2062a1c06e..a3a9f03a18 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -139,26 +139,11 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD # ------ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): - # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - - if self._alloc_handle is None: - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, IPC_HANDLE_TYPE, 0) - ) - try: - assert self._uuid is None - self._uuid = uuid.uuid4() - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) - except: - os.close(alloc_handle) - raise + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self._is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + assert self._alloc_handle is not None return self._alloc_handle @@ -173,8 +158,8 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = device_id - self._ipc_handle_type = IPC_HANDLE_TYPE self._mempool_owned = True + self._ipc_handle_type = IPC_HANDLE_TYPE self._is_mapped = True #self._alloc_handle = None # only used for non-imported From c7f6cdee48ae8d6f9d238d99501c8149b2a27ef7 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 12:37:28 -0700 Subject: [PATCH 21/30] Removes superfluous _uuid member of DeviceMemoryResource. 
--- cuda_core/cuda/core/experimental/_memory/dmr.pxd | 1 - cuda_core/cuda/core/experimental/_memory/dmr.pyx | 8 +++----- cuda_core/cuda/core/experimental/_memory/ipc.pyx | 11 +++++++---- cuda_core/tests/memory_ipc/test_serialize.py | 1 - 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/dmr.pxd index b34c08f287..d8e3a2622a 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pxd @@ -18,6 +18,5 @@ cdef class DeviceMemoryResource(MemoryResource): cydriver.CUmemAllocationHandleType _ipc_handle_type bint _is_mapped - object _uuid IPCAllocationHandle _alloc_handle diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index 5193a2d7a8..8870f7677b 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -201,7 +201,6 @@ cdef class DeviceMemoryResource(MemoryResource): self._mempool_owned = False self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False - self._uuid = None self._alloc_handle = None def __init__(self, device_id: int | Device, options=None): @@ -375,7 +374,8 @@ cdef class DeviceMemoryResource(MemoryResource): A universally unique identifier for this memory resource. Meaningful only for IPC-enabled memory resources. 
""" - return self._uuid + if self._alloc_handle is not None: + return self._alloc_handle._uuid cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): @@ -436,13 +436,12 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes if opts.ipc_enabled: self._ipc_handle_type = ipc.IPC_HANDLE_TYPE self._is_mapped = False - self._uuid = uuid.uuid4() with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) ) try: - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, self._uuid) + self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) except: os.close(alloc_handle) raise @@ -484,6 +483,5 @@ cdef DMR_close(DeviceMemoryResource self): self._mempool_owned = False self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE self._is_mapped = False - self._uuid = None self._alloc_handle = None diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index a3a9f03a18..8aee686ebb 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -67,7 +67,6 @@ cdef class IPCAllocationHandle: os.close(self._handle) finally: self._handle = -1 - self._uuid = None def __dealloc__(self): self.close() @@ -149,6 +148,8 @@ cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. 
+ if isinstance(alloc_handle, int): + alloc_handle = IPCAllocationHandle._init(alloc_handle, None) uuid = getattr(alloc_handle, 'uuid', None) mr = registry.get(uuid) if mr is not None: @@ -161,7 +162,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand self._mempool_owned = True self._ipc_handle_type = IPC_HANDLE_TYPE self._is_mapped = True - #self._alloc_handle = None # only used for non-imported + self._alloc_handle = alloc_handle cdef int handle = int(alloc_handle) with nogil: @@ -171,6 +172,7 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand if uuid is not None: registered = self.register(uuid) assert registered is self + self._alloc_handle.close() return self @@ -178,9 +180,10 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing - assert self._uuid is None or self._uuid == uuid + assert self._alloc_handle is not None + assert self._alloc_handle._uuid is None or self._alloc_handle._uuid == uuid registry[uuid] = self - self._uuid = uuid + self._alloc_handle._uuid = uuid return self cpdef DeviceMemoryResource DMR_from_registry(uuid): diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index ceac50e502..0be8513d58 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -60,7 +60,6 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) mr = DeviceMemoryResource.from_allocation_handle(device, handle) - os.close(handle) # Receive the buffers. buffer1 = conn.recv() # directly From 216b4fb357da5186f0f74ccb1a76580347520ee5 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 12:48:22 -0700 Subject: [PATCH 22/30] Adds __all__ lists. 
--- cuda_core/cuda/core/experimental/_memory/buffer.pyx | 2 ++ cuda_core/cuda/core/experimental/_memory/dmr.pyx | 2 ++ cuda_core/cuda/core/experimental/_memory/ipc.pxd | 1 + cuda_core/cuda/core/experimental/_memory/ipc.pyx | 2 ++ 4 files changed, 7 insertions(+) diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/buffer.pyx index 6d7c238d7d..4e3896c0fd 100644 --- a/cuda_core/cuda/core/experimental/_memory/buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/buffer.pyx @@ -21,6 +21,8 @@ from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver +__all__ = ['Buffer', 'MemoryResource'] + DevicePointerT = Union[driver.CUdeviceptr, int, None] """A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/dmr.pyx index 8870f7677b..dbe3a065c6 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/dmr.pyx @@ -34,6 +34,8 @@ if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT from .._device import Device +__all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] + @dataclass cdef class DeviceMemoryResourceOptions: diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/ipc.pxd index f3444028e4..810b88cad0 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pxd @@ -13,6 +13,7 @@ from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource # descriptor. cdef object registry + # IPC is currently only supported on Linux. On other platforms, the IPC handle # type is set equal to the no-IPC handle type. 
cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/ipc.pyx index 8aee686ebb..e1ff68fde7 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/ipc.pyx @@ -19,6 +19,8 @@ import platform import uuid import weakref +__all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] + cdef object registry = weakref.WeakValueDictionary() From 6a30a39949cf70208856e8eff5fd1f12cb9e2e0c Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 13:07:22 -0700 Subject: [PATCH 23/30] Prepend underscore to submodules, add a test for package contents. --- .../core/experimental/_memory/__init__.py | 10 ++++----- .../_memory/{buffer.pxd => _buffer.pxd} | 0 .../_memory/{buffer.pyx => _buffer.pyx} | 10 ++++----- .../_memory/{dmr.pxd => _dmr.pxd} | 4 ++-- .../_memory/{dmr.pyx => _dmr.pyx} | 22 +++++++++---------- .../_memory/{ipc.pxd => _ipc.pxd} | 4 ++-- .../_memory/{ipc.pyx => _ipc.pyx} | 0 .../_memory/{legacy.py => _legacy.py} | 2 +- .../experimental/_memory/{vmm.py => _vmm.py} | 2 +- cuda_core/tests/test_memory.py | 19 +++++++++++++++- 10 files changed, 45 insertions(+), 28 deletions(-) rename cuda_core/cuda/core/experimental/_memory/{buffer.pxd => _buffer.pxd} (100%) rename cuda_core/cuda/core/experimental/_memory/{buffer.pyx => _buffer.pyx} (97%) rename cuda_core/cuda/core/experimental/_memory/{dmr.pxd => _dmr.pxd} (79%) rename cuda_core/cuda/core/experimental/_memory/{dmr.pyx => _dmr.pyx} (95%) rename cuda_core/cuda/core/experimental/_memory/{ipc.pxd => _ipc.pxd} (91%) rename cuda_core/cuda/core/experimental/_memory/{ipc.pyx => _ipc.pyx} (100%) rename cuda_core/cuda/core/experimental/_memory/{legacy.py => _legacy.py} (97%) rename cuda_core/cuda/core/experimental/_memory/{vmm.py => _vmm.py} (99%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 
d5dbccee64..f9d528ac64 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -2,9 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from .buffer import * -from .dmr import * -from .ipc import * -from .legacy import * -from .vmm import * +from ._buffer import * +from ._dmr import * +from ._ipc import * +from ._legacy import * +from ._vmm import * diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/buffer.pxd rename to cuda_core/cuda/core/experimental/_memory/_buffer.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/buffer.pyx rename to cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 4e3896c0fd..a3a5fa48b8 100644 --- a/cuda_core/cuda/core/experimental/_memory/buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -6,9 +6,9 @@ from __future__ import annotations from libc.stdint cimport intptr_t -from cuda.core.experimental._memory.dmr cimport DeviceMemoryResource -from cuda.core.experimental._memory.ipc cimport IPCBufferDescriptor -from cuda.core.experimental._memory cimport ipc +from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor +from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -89,11 +89,11 @@ cdef class Buffer: @classmethod def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: """Import a buffer that was exported from another process.""" - 
return ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) + return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) def get_ipc_descriptor(self) -> IPCBufferDescriptor: """Export a buffer allocated for sharing between processes.""" - return ipc.Buffer_get_ipc_descriptor(self) + return _ipc.Buffer_get_ipc_descriptor(self) def close(self, stream: Stream = None): """Deallocate this buffer asynchronously on the given stream. diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd similarity index 79% rename from cuda_core/cuda/core/experimental/_memory/dmr.pxd rename to cuda_core/cuda/core/experimental/_memory/_dmr.pxd index d8e3a2622a..5a64a4cdac 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -4,8 +4,8 @@ from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport MemoryResource -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx similarity index 95% rename from cuda_core/cuda/core/experimental/_memory/dmr.pyx rename to cuda_core/cuda/core/experimental/_memory/_dmr.pyx index dbe3a065c6..daf91bf77d 100644 --- a/cuda_core/cuda/core/experimental/_memory/dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -9,9 +9,9 @@ from libc.stdint cimport uintptr_t, intptr_t from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport Buffer, MemoryResource -from cuda.core.experimental._memory cimport ipc -from cuda.core.experimental._memory.ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._buffer 
cimport Buffer, MemoryResource +from cuda.core.experimental._memory cimport _ipc +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -236,7 +236,7 @@ cdef class DeviceMemoryResource(MemoryResource): RuntimeError If no mapped memory resource is found in the registry. """ - return ipc.DMR_from_registry(uuid) + return _ipc.DMR_from_registry(uuid) def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: """ @@ -247,7 +247,7 @@ cdef class DeviceMemoryResource(MemoryResource): The registered mapped memory resource. If one was previously registered with the given key, it is returned. """ - return ipc.DMR_register(self, uuid) + return _ipc.DMR_register(self, uuid) @classmethod def from_allocation_handle( @@ -272,7 +272,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- A new device memory resource instance with the imported handle. """ - return ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) + return _ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle) def get_allocation_handle(self) -> IPCAllocationHandle: """Export the memory pool handle to be shared (requires IPC). @@ -284,7 +284,7 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. """ - return ipc.DMR_get_allocation_handle(self) + return _ipc.DMR_get_allocation_handle(self) def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -413,12 +413,12 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes # Create a new memory pool. 
cdef cydriver.CUmemPoolProps properties - if opts.ipc_enabled and ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and _ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: raise RuntimeError(f"IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED - properties.handleTypes = ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + properties.handleTypes = _ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE properties.location.id = dev_id properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE properties.maxSize = opts.max_size @@ -436,11 +436,11 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes cdef int alloc_handle if opts.ipc_enabled: - self._ipc_handle_type = ipc.IPC_HANDLE_TYPE + self._ipc_handle_type = _ipc.IPC_HANDLE_TYPE self._is_mapped = False with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, ipc.IPC_HANDLE_TYPE, 0) + &alloc_handle, self._mempool_handle, _ipc.IPC_HANDLE_TYPE, 0) ) try: self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd similarity index 91% rename from cuda_core/cuda/core/experimental/_memory/ipc.pxd rename to cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 810b88cad0..006d835320 100644 --- a/cuda_core/cuda/core/experimental/_memory/ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory.buffer cimport Buffer -from
cuda.core.experimental._memory.dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/ipc.pyx rename to cuda_core/cuda/core/experimental/_memory/_ipc.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/legacy.py rename to cuda_core/cuda/core/experimental/_memory/_legacy.py index 487ddeae5a..6af415433a 100644 --- a/cuda_core/cuda/core/experimental/_memory/legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING -from cuda.core.experimental._memory.buffer import Buffer, MemoryResource +from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( driver, _check_driver_error as raise_if_driver_error, diff --git a/cuda_core/cuda/core/experimental/_memory/vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py similarity index 99% rename from cuda_core/cuda/core/experimental/_memory/vmm.py rename to cuda_core/cuda/core/experimental/_memory/_vmm.py index 44c9250de3..ebf7895076 100644 --- a/cuda_core/cuda/core/experimental/_memory/vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -7,7 +7,7 @@ import platform from cuda.core.experimental._stream import Stream -from cuda.core.experimental._memory.buffer import Buffer, MemoryResource +from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) from 
cuda.core.experimental._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 261454bf59..8879d2dee1 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -24,7 +24,8 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor +from cuda.core.experimental._dlpack import DLDeviceType +from cuda.core.experimental._memory import IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return from cuda.core.experimental.utils import StridedMemoryView from helpers.buffers import DummyUnifiedMemoryResource @@ -125,6 +126,22 @@ class NullMemoryResource(DummyHostMemoryResource): def is_host_accessible(self) -> bool: return False +def test_package_contents(): + expected = [ + 'Buffer', + 'MemoryResource', + 'DeviceMemoryResource', + 'DeviceMemoryResourceOptions', + 'IPCBufferDescriptor', + 'IPCAllocationHandle', + 'LegacyPinnedMemoryResource', + 'VirtualMemoryResourceOptions', + 'VirtualMemoryResource' + ] + d = {} + exec("from cuda.core.experimental._memory import *", d) + d = {k:v for k,v in d.items() if not k.startswith("__")} + assert sorted(expected) == sorted(d.keys()) def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) From 229ddc6ca56288d959c2438cda00c73754cefaba Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 13:46:29 -0700 Subject: [PATCH 24/30] Refactor IPC data of DMR into IPCData class. 
--- .../cuda/core/experimental/_memory/_dmr.pxd | 11 +-- .../cuda/core/experimental/_memory/_dmr.pyx | 62 ++++++------- .../cuda/core/experimental/_memory/_ipc.pxd | 10 ++- .../cuda/core/experimental/_memory/_ipc.pyx | 86 +++++++++++++------ 4 files changed, 96 insertions(+), 73 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 5a64a4cdac..2d1420dd49 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -5,18 +5,15 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData cdef class DeviceMemoryResource(MemoryResource): cdef: + object __weakref__ int _dev_id - cydriver.CUmemoryPool _mempool_handle + cydriver.CUmemoryPool _handle object _attributes bint _mempool_owned - object __weakref__ - - cydriver.CUmemAllocationHandleType _ipc_handle_type - bint _is_mapped - IPCAllocationHandle _alloc_handle + IPCData _ipc_data diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index daf91bf77d..0601e93ea6 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -11,7 +11,7 @@ from libc.string cimport memset from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle +from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, @@ -198,12 +198,10 
@@ cdef class DeviceMemoryResource(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL + self._handle = NULL self._attributes = None self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._is_mapped = False - self._alloc_handle = None + self._ipc_data = None def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) @@ -284,7 +282,11 @@ cdef class DeviceMemoryResource(MemoryResource): ------- The shareable handle for the memory pool. """ - return _ipc.DMR_get_allocation_handle(self) + if not self.is_ipc_enabled: + raise RuntimeError("Memory resource is not IPC-enabled") + if self.is_mapped: + raise RuntimeError("Imported memory resource cannot be exported") + return self._ipc_data._alloc_handle def allocate(self, size_t size, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. @@ -303,7 +305,7 @@ cdef class DeviceMemoryResource(MemoryResource): The allocated buffer object, which is accessible on the device that this memory resource was created for. 
""" - if self._is_mapped: + if self.is_mapped: raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() @@ -340,7 +342,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def handle(self) -> driver.CUmemoryPool: """Handle to the underlying memory pool.""" - return driver.CUmemoryPool((self._mempool_handle)) + return driver.CUmemoryPool((self._handle)) @property def is_device_accessible(self) -> bool: @@ -360,7 +362,7 @@ cdef class DeviceMemoryResource(MemoryResource): @property def is_ipc_enabled(self) -> bool: """Whether this memory resource has IPC enabled.""" - return self._ipc_handle_type != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + return self._ipc_data is not None @property def is_mapped(self) -> bool: @@ -368,7 +370,7 @@ cdef class DeviceMemoryResource(MemoryResource): Whether this is a mapping of an IPC-enabled memory resource from another process. If True, allocation is not permitted. """ - return self._is_mapped + return self._ipc_data is not None and self._ipc_data._is_mapped @property def uuid(self) -> Optional[uuid.UUID]: @@ -376,8 +378,7 @@ cdef class DeviceMemoryResource(MemoryResource): A universally unique identifier for this memory resource. Meaningful only for IPC-enabled memory resources. """ - if self._alloc_handle is not None: - return self._alloc_handle._uuid + return getattr(self._ipc_data, 'uuid', None) cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): @@ -387,23 +388,22 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): self._dev_id = dev_id self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE with nogil: - HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._mempool_handle), dev_id)) + HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) # Set a higher release threshold to improve performance when there are no active allocations. 
# By default, the release threshold is 0, which means memory is immediately released back # to the OS when there are no active suballocations, causing performance issues. # Check current release threshold HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._mempool_handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) ) # If threshold is 0 (default), set it to maximum to retain memory in the pool if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( - self._mempool_handle, + self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &max_threshold )) @@ -429,31 +429,19 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes self._mempool_owned = True with nogil: - HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._mempool_handle), &properties)) + HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties)) # TODO: should we also set the threshold here? 
- # Note: This is Linux only (int for file descriptor) - cdef int alloc_handle - if opts.ipc_enabled: - self._ipc_handle_type = _ipc.IPC_HANDLE_TYPE - self._is_mapped = False - with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( - &alloc_handle, self._mempool_handle, _ipc.IPC_HANDLE_TYPE, 0) - ) - try: - self._alloc_handle = IPCAllocationHandle._init(alloc_handle, uuid.uuid4()) - except: - os.close(alloc_handle) - raise + alloc_handle = _ipc.DMR_export_mempool(self) + self._ipc_data = IPCData(alloc_handle, mapped=False) cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: - HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._mempool_handle, s)) + HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) buf._ptr = (devptr) buf._ptr_obj = None @@ -471,19 +459,17 @@ cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _ cdef DMR_close(DeviceMemoryResource self): - if self._mempool_handle == NULL: + if self._handle == NULL: return try: if self._mempool_owned: with nogil: - HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._mempool_handle)) + HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle)) finally: self._dev_id = cydriver.CU_DEVICE_INVALID - self._mempool_handle = NULL + self._handle = NULL self._attributes = None self._mempool_owned = False - self._ipc_handle_type = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE - self._is_mapped = False - self._alloc_handle = None + self._ipc_data = None diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 006d835320..c81fcc532a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -19,6 +19,12 @@ cdef object registry cdef 
cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE +cdef class IPCData: + cdef: + bint _is_mapped + IPCAllocationHandle _alloc_handle + + cdef class IPCBufferDescriptor: cdef: bytes _reserved @@ -41,7 +47,7 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDesc # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource) cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) cpdef DeviceMemoryResource DMR_from_registry(uuid) +cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index e1ff68fde7..93fa6d0dcb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -27,6 +27,30 @@ cdef object registry = weakref.WeakValueDictionary() cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + +cdef class IPCData: + """Data members related to sharing memory pools via IPC.""" + def __cinit__(self): + self._is_mapped = False + self._alloc_handle = None + + def __init__(self, IPCAllocationHandle alloc_handle, bint mapped): + self._is_mapped = mapped + self._alloc_handle = alloc_handle + + @property + def alloc_handle(self): + return self._alloc_handle + + @property + def is_mapped(self): + return self._is_mapped + + @property + def 
uuid(self): + return getattr(self._alloc_handle, 'uuid', None) + + cdef class IPCBufferDescriptor: """Serializable object describing a buffer that can be shared between processes.""" @@ -133,21 +157,12 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) cdef cydriver.CUdeviceptr ptr with nogil: - HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._mempool_handle, &data)) + HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) return Buffer._init(ptr, ipc_buffer.size, mr, stream) # DeviceMemoryResource IPC Implementation # ------ -cpdef IPCAllocationHandle DMR_get_allocation_handle(DeviceMemoryResource self): - if not self.is_ipc_enabled: - raise RuntimeError("Memory resource is not IPC-enabled") - if self._is_mapped: - raise RuntimeError("Imported memory resource cannot be exported") - assert self._alloc_handle is not None - return self._alloc_handle - - cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. if isinstance(alloc_handle, int): @@ -157,39 +172,58 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand if mr is not None: return mr - device_id = getattr(device_id, 'device_id', device_id) - + # Construct a new DMR. cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) - self._dev_id = device_id + self._dev_id = getattr(device_id, 'device_id', device_id) self._mempool_owned = True - self._ipc_handle_type = IPC_HANDLE_TYPE - self._is_mapped = True - self._alloc_handle = alloc_handle + self._ipc_data = IPCData(alloc_handle, mapped=True) + # Map the mempool into this process. cdef int handle = int(alloc_handle) with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._mempool_handle), (handle), IPC_HANDLE_TYPE, 0) + &(self._handle), (handle), IPC_HANDLE_TYPE, 0) ) + + # Register it. 
if uuid is not None: registered = self.register(uuid) assert registered is self - self._alloc_handle.close() + + # Always close the file handle (caller can dup it, if needed). + alloc_handle.close() + return self +cpdef DeviceMemoryResource DMR_from_registry(uuid): + try: + return registry[uuid] + except KeyError: + raise RuntimeError(f"Memory resource {uuid} was not found") from None + + cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing - assert self._alloc_handle is not None - assert self._alloc_handle._uuid is None or self._alloc_handle._uuid == uuid + assert self.uuid is None or self.uuid == uuid registry[uuid] = self - self._alloc_handle._uuid = uuid + self._ipc_data._alloc_handle._uuid = uuid return self -cpdef DeviceMemoryResource DMR_from_registry(uuid): + +cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): + # Note: This is Linux only (int for file descriptor) + cdef int fd + cdef IPCAllocationHandle alloc_handle + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( + &fd, self._handle, IPC_HANDLE_TYPE, 0) + ) try: - return registry[uuid] - except KeyError: - raise RuntimeError(f"Memory resource {uuid} was not found") from None + return IPCAllocationHandle._init(fd, uuid.uuid4()) + except: + os.close(fd) + raise + From 0fd3ca9a519e1fb211bf934efaff50a63e605975 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 14:47:48 -0700 Subject: [PATCH 25/30] General clean up. 
--- .../core/experimental/_memory/__init__.py | 11 +- .../core/experimental/_memory/_buffer.pxd | 12 +- .../core/experimental/_memory/_buffer.pyx | 64 +++++--- .../cuda/core/experimental/_memory/_dmr.pxd | 11 +- .../cuda/core/experimental/_memory/_dmr.pyx | 60 ++++--- .../cuda/core/experimental/_memory/_ipc.pxd | 31 ++-- .../cuda/core/experimental/_memory/_ipc.pyx | 61 ++++--- .../cuda/core/experimental/_memory/_legacy.py | 16 +- .../cuda/core/experimental/_memory/_vmm.py | 149 ++++++++++++------ cuda_core/tests/memory_ipc/test_serialize.py | 1 - cuda_core/tests/test_memory.py | 24 +-- 11 files changed, 271 insertions(+), 169 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index f9d528ac64..9781935cdc 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -2,9 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from ._buffer import * -from ._dmr import * -from ._ipc import * -from ._legacy import * -from ._vmm import * - +from ._buffer import * # noqa: F403 +from ._dmr import * # noqa: F403 +from ._ipc import * # noqa: F403 +from ._legacy import * # noqa: F403 +from ._vmm import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index b6c75f63cc..a684c97f98 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -4,19 +4,17 @@ from libc.stdint cimport intptr_t -from cuda.core.experimental._stream cimport Stream as _cyStream +from cuda.core.experimental._stream cimport Stream cdef class Buffer: cdef: - intptr_t _ptr - size_t _size + intptr_t _ptr + size_t _size MemoryResource _mr - object _ptr_obj - _cyStream _alloc_stream + object _ptr_obj + Stream _alloc_stream cdef class MemoryResource: pass - - diff --git 
a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index a3a5fa48b8..69910c9869 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -9,7 +9,7 @@ from libc.stdint cimport intptr_t from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, ) @@ -18,14 +18,16 @@ import abc from typing import TypeVar, Union from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver __all__ = ['Buffer', 'MemoryResource'] DevicePointerT = Union[driver.CUdeviceptr, int, None] -"""A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting :attr:`Buffer.handle`.""" +""" +A type union of :obj:`~driver.CUdeviceptr`, `int` and `None` for hinting +:attr:`Buffer.handle`. +""" cdef class Buffer: """Represent a handle to allocated memory. @@ -51,16 +53,20 @@ cdef class Buffer: self._alloc_stream = None def __init__(self, *args, **kwargs): - raise RuntimeError("Buffer objects cannot be instantiated directly. Please use MemoryResource APIs.") + raise RuntimeError("Buffer objects cannot be instantiated directly. 
" + "Please use MemoryResource APIs.") @classmethod - def _init(cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, stream: Stream | None = None): + def _init( + cls, ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None, + stream: Stream | None = None + ): cdef Buffer self = Buffer.__new__(cls) self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size self._mr = mr - self._alloc_stream = <_cyStream>(stream) if stream is not None else None + self._alloc_stream = (stream) if stream is not None else None return self def __dealloc__(self): @@ -71,7 +77,9 @@ cdef class Buffer: return Buffer.from_ipc_descriptor, (self.memory_resource, self.get_ipc_descriptor()) @staticmethod - def from_handle(ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None) -> Buffer: + def from_handle( + ptr: DevicePointerT, size_t size, mr: MemoryResource | None = None + ) -> Buffer: """Create a new :class:`Buffer` object from a pointer. Parameters @@ -87,7 +95,10 @@ cdef class Buffer: return Buffer._init(ptr, size, mr=mr) @classmethod - def from_ipc_descriptor(cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, stream: Stream = None) -> Buffer: + def from_ipc_descriptor( + cls, mr: DeviceMemoryResource, ipc_buffer: IPCBufferDescriptor, + stream: Stream = None + ) -> Buffer: """Import a buffer that was exported from another process.""" return _ipc.Buffer_from_ipc_descriptor(cls, mr, ipc_buffer, stream) @@ -132,13 +143,14 @@ cdef class Buffer: if dst is None: if self._mr is None: - raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)") + raise ValueError("a destination buffer must be provided (this " + "buffer does not have a memory_resource)") dst = self._mr.allocate(src_size, stream) cdef size_t dst_size = dst._size if dst_size != src_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + raise ValueError( "buffer sizes 
mismatch between src and dst (sizes " + f"are: src={src_size}, dst={dst_size})" ) err, = driver.cuMemcpyAsync(dst._ptr, self._ptr, src_size, stream.handle) raise_if_driver_error(err) @@ -163,8 +175,8 @@ cdef class Buffer: cdef size_t src_size = src._size if src_size != dst_size: - raise ValueError( - f"buffer sizes mismatch between src and dst (sizes are: src={src_size}, dst={dst_size})" + raise ValueError( "buffer sizes mismatch between src and dst (sizes " + f"are: src={src_size}, dst={dst_size})" ) err, = driver.cuMemcpyAsync(self._ptr, src._ptr, dst_size, stream.handle) raise_if_driver_error(err) @@ -264,17 +276,19 @@ cdef class Buffer: return self._size +# Buffer Implementation +# --------------------- cdef Buffer_close(Buffer self, stream): - cdef _cyStream s + cdef Stream s if self._ptr and self._mr is not None: if stream is None: if self._alloc_stream is not None: s = self._alloc_stream else: # TODO: remove this branch when from_handle takes a stream - s = <_cyStream>(default_stream()) + s = (default_stream()) else: - s = <_cyStream>stream + s = stream self._mr.deallocate(self._ptr, self._size, s) self._ptr = 0 self._mr = None @@ -283,13 +297,15 @@ cdef Buffer_close(Buffer self, stream): cdef class MemoryResource: - """Abstract base class for memory resources that manage allocation and deallocation of buffers. - - Subclasses must implement methods for allocating and deallocation, as well as properties - associated with this memory resource from which all allocated buffers will inherit. (Since - all :class:`Buffer` instances allocated and returned by the :meth:`allocate` method would - hold a reference to self, the buffer properties are retrieved simply by looking up the underlying - memory resource's respective property.) + """Abstract base class for memory resources that manage allocation and + deallocation of buffers. 
+ + Subclasses must implement methods for allocating and deallocation, as well + as properties associated with this memory resource from which all allocated + buffers will inherit. (Since all :class:`Buffer` instances allocated and + returned by the :meth:`allocate` method would hold a reference to self, the + buffer properties are retrieved simply by looking up the underlying memory + resource's respective property.) """ @abc.abstractmethod @@ -329,5 +345,3 @@ cdef class MemoryResource: and document the behavior. """ ... - - diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 2d1420dd49..945291b6e4 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -10,10 +10,9 @@ from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData cdef class DeviceMemoryResource(MemoryResource): cdef: - object __weakref__ - int _dev_id + int _dev_id cydriver.CUmemoryPool _handle - object _attributes - bint _mempool_owned - IPCData _ipc_data - + bint _mempool_owned + IPCData _ipc_data + object _attributes + object __weakref__ diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index 0601e93ea6..fc541406a0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -12,7 +12,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData -from cuda.core.experimental._stream cimport default_stream, Stream as _cyStream +from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( _check_driver_error as raise_if_driver_error, check_or_create_options, @@ -22,12 +22,10 @@ from 
cuda.core.experimental._utils.cuda_utils cimport ( import cython from dataclasses import dataclass from typing import Optional, TYPE_CHECKING -import os -import platform +import platform # no-cython-lint import uuid import weakref -from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import driver if TYPE_CHECKING: @@ -69,7 +67,9 @@ class DeviceMemoryResourceAttributes: def mempool_property(property_type: type): def decorator(stub): - attr_enum = getattr(driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}") + attr_enum = getattr( + driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" + ) def fget(self) -> property_type: mr = self._mr() @@ -206,7 +206,8 @@ cdef class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) opts = check_or_create_options( - DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", keep_none=True + DeviceMemoryResourceOptions, options, "DeviceMemoryResource options", + keep_none=True ) if opts is None: @@ -218,14 +219,17 @@ cdef class DeviceMemoryResource(MemoryResource): DMR_close(self) def close(self): - """Close the device memory resource and destroy the associated memory pool if owned.""" + """ + Close the device memory resource and destroy the associated memory pool + if owned. + """ DMR_close(self) def __reduce__(self): return DeviceMemoryResource.from_registry, (self.uuid,) @staticmethod - def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: + def from_registry(uuid: uuid.UUID) -> DeviceMemoryResource: # no-cython-lint """ Obtain a registered mapped memory resource. 
@@ -236,7 +240,7 @@ cdef class DeviceMemoryResource(MemoryResource): """ return _ipc.DMR_from_registry(uuid) - def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: + def register(self, uuid: uuid.UUID) -> DeviceMemoryResource: # no-cython-lint """ Register a mapped memory resource. @@ -309,7 +313,7 @@ cdef class DeviceMemoryResource(MemoryResource): raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource") if stream is None: stream = default_stream() - return DMR_allocate(self, size, <_cyStream>stream) + return DMR_allocate(self, size, stream) def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream = None): """Deallocate a buffer previously allocated by this resource. @@ -325,7 +329,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. """ - DMR_deallocate(self, ptr, size, <_cyStream>stream) + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -381,6 +385,9 @@ cdef class DeviceMemoryResource(MemoryResource): return getattr(self._ipc_data, 'uuid', None) +# DeviceMemoryResource Implementation +# ----------------------------------- + cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): # Get the current memory pool. cdef cydriver.cuuint64_t current_threshold @@ -392,15 +399,19 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): with nogil: HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id)) - # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back - # to the OS when there are no active suballocations, causing performance issues. 
- # Check current release threshold - HANDLE_RETURN(cydriver.cuMemPoolGetAttribute( - self._handle, cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, ¤t_threshold) + # Set a higher release threshold to improve performance when there are + # no active allocations. By default, the release threshold is 0, which + # means memory is immediately released back to the OS when there are no + # active suballocations, causing performance issues. + HANDLE_RETURN( + cydriver.cuMemPoolGetAttribute( + self._handle, + cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + ¤t_threshold + ) ) - # If threshold is 0 (default), set it to maximum to retain memory in the pool + # If threshold is 0 (default), set it to maximum to retain memory in the pool. if current_threshold == 0: HANDLE_RETURN(cydriver.cuMemPoolSetAttribute( self._handle, @@ -409,11 +420,13 @@ cdef void DMR_init_current(DeviceMemoryResource self, int dev_id): )) -cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts): +cdef void DMR_init_create( + DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts +): # Create a new memory pool. 
cdef cydriver.CUmemPoolProps properties - if opts.ipc_enabled and _ipc.IPC_HANDLE_TYPE == cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE: + if opts.ipc_enabled and not _ipc.is_supported(): raise RuntimeError("IPC is not available on {platform.system()}") memset(&properties, 0, sizeof(cydriver.CUmemPoolProps)) @@ -437,7 +450,7 @@ cdef void DMR_init_create(DeviceMemoryResource self, int dev_id, DeviceMemoryRes self._ipc_data = IPCData(alloc_handle, mapped=False) -cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream stream): +cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr with nogil: @@ -451,7 +464,9 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, _cyStream strea return buf -cdef void DMR_deallocate(DeviceMemoryResource self, intptr_t ptr, size_t size, _cyStream stream) noexcept: +cdef void DMR_deallocate( + DeviceMemoryResource self, intptr_t ptr, size_t size, Stream stream +) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr with nogil: @@ -472,4 +487,3 @@ cdef DMR_close(DeviceMemoryResource self): self._attributes = None self._mempool_owned = False self._ipc_data = None - diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index c81fcc532a..6480f32619 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -14,40 +14,45 @@ from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource cdef object registry -# IPC is currently only supported on Linux. On other platforms, the IPC handle -# type is set equal to the no-IPC handle type. +# The IPC handle type for this platform. IPC is currently only supported on +# Linux. On other platforms, the IPC handle type is set equal to the no-IPC +# handle type. 
cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE +# Whether IPC is supported on this platform. +cdef is_supported() + + cdef class IPCData: cdef: - bint _is_mapped IPCAllocationHandle _alloc_handle + bint _is_mapped cdef class IPCBufferDescriptor: cdef: - bytes _reserved + bytes _payload size_t _size cdef class IPCAllocationHandle: cdef: - int _handle + int _handle object _uuid cpdef close(self) # Buffer IPC Implementation -# ------ -cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) -cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) +# ------------------------- +cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer) +cdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream) # DeviceMemoryResource IPC Implementation -# ------ -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) -cpdef DeviceMemoryResource DMR_from_registry(uuid) -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) -cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) +# --------------------------------------- +cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle) +cdef DeviceMemoryResource DMR_from_registry(uuid) +cdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid) +cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource) diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 93fa6d0dcb..4856ce5546 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t, uintptr_t +from libc.stdint cimport intptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -12,7 +12,6 @@ from cuda.core.experimental._utils.cuda_utils cimport ( HANDLE_RETURN, ) 
-from typing import Iterable, Literal, Optional, TypeVar, Union import multiprocessing import os import platform @@ -24,19 +23,24 @@ __all__ = ['IPCBufferDescriptor', 'IPCAllocationHandle'] cdef object registry = weakref.WeakValueDictionary() -cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ - if platform.system() == "Linux" else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE +cdef cydriver.CUmemAllocationHandleType IPC_HANDLE_TYPE = \ + cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR \ + if platform.system() == "Linux" else \ + cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE + +cdef is_supported(): + return IPC_HANDLE_TYPE != cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE cdef class IPCData: """Data members related to sharing memory pools via IPC.""" def __cinit__(self): - self._is_mapped = False self._alloc_handle = None + self._is_mapped = False def __init__(self, IPCAllocationHandle alloc_handle, bint mapped): - self._is_mapped = mapped self._alloc_handle = alloc_handle + self._is_mapped = mapped @property def alloc_handle(self): @@ -60,12 +64,12 @@ cdef class IPCBufferDescriptor: @classmethod def _init(cls, reserved: bytes, size: int): cdef IPCBufferDescriptor self = IPCBufferDescriptor.__new__(cls) - self._reserved = reserved + self._payload = reserved self._size = size return self def __reduce__(self): - return self._init, (self._reserved, self._size) + return self._init, (self._payload, self._size) @property def size(self): @@ -79,7 +83,7 @@ cdef class IPCAllocationHandle: raise RuntimeError("IPCAllocationHandle objects cannot be instantiated directly. 
Please use MemoryResource APIs.") @classmethod - def _init(cls, handle: int, uuid): + def _init(cls, handle: int, uuid): # no-cython-lint cdef IPCAllocationHandle self = IPCAllocationHandle.__new__(cls) assert handle >= 0 self._handle = handle @@ -118,7 +122,7 @@ def _reduce_allocation_handle(alloc_handle): return _reconstruct_allocation_handle, (type(alloc_handle), df, alloc_handle.uuid) -def _reconstruct_allocation_handle(cls, df, uuid): +def _reconstruct_allocation_handle(cls, df, uuid): # no-cython-lint return cls._init(df.detach(), uuid) @@ -136,17 +140,23 @@ multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_mem # Buffer IPC Implementation -# ------ -cpdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): +# ------------------------- +cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): if not self._mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") cdef cydriver.CUmemPoolPtrExportData data with nogil: - HANDLE_RETURN(cydriver.cuMemPoolExportPointer(&data, (self._ptr))) - cdef bytes data_b = cpython.PyBytes_FromStringAndSize((data.reserved), sizeof(data.reserved)) + HANDLE_RETURN( + cydriver.cuMemPoolExportPointer(&data, (self._ptr)) + ) + cdef bytes data_b = cpython.PyBytes_FromStringAndSize( + (data.reserved), sizeof(data.reserved) + ) return IPCBufferDescriptor._init(data_b, self.size) -cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream): +cdef Buffer Buffer_from_ipc_descriptor( + cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_buffer, stream +): """Import a buffer that was exported from another process.""" if not mr.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") @@ -154,20 +164,25 @@ cpdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource mr, IPCBufferD # Note: match this behavior to DeviceMemoryResource.allocate() stream = default_stream() cdef cydriver.CUmemPoolPtrExportData data - 
memcpy(data.reserved, (ipc_buffer._reserved), sizeof(data.reserved)) + memcpy( + data.reserved, + (ipc_buffer._payload), + sizeof(data.reserved) + ) cdef cydriver.CUdeviceptr ptr with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) return Buffer._init(ptr, ipc_buffer.size, mr, stream) + # DeviceMemoryResource IPC Implementation -# ------ +# --------------------------------------- -cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): +cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. if isinstance(alloc_handle, int): alloc_handle = IPCAllocationHandle._init(alloc_handle, None) - uuid = getattr(alloc_handle, 'uuid', None) + uuid = getattr(alloc_handle, 'uuid', None) # no-cython-lint mr = registry.get(uuid) if mr is not None: return mr @@ -196,14 +211,14 @@ cpdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_hand return self -cpdef DeviceMemoryResource DMR_from_registry(uuid): +cdef DeviceMemoryResource DMR_from_registry(uuid): try: return registry[uuid] except KeyError: raise RuntimeError(f"Memory resource {uuid} was not found") from None -cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): +cdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): existing = registry.get(uuid) if existing is not None: return existing @@ -213,10 +228,9 @@ cpdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid): return self -cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): +cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): # Note: This is Linux only (int for file descriptor) cdef int fd - cdef IPCAllocationHandle alloc_handle with nogil: HANDLE_RETURN(cydriver.cuMemPoolExportToShareableHandle( &fd, self._handle, IPC_HANDLE_TYPE, 0) @@ -226,4 +240,3 @@ cpdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self): except: 
os.close(fd) raise - diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/experimental/_memory/_legacy.py index 6af415433a..523835a79d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/experimental/_memory/_legacy.py @@ -8,9 +8,11 @@ from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._utils.cuda_utils import ( - driver, _check_driver_error as raise_if_driver_error, ) +from cuda.core.experimental._utils.cuda_utils import ( + driver, +) if TYPE_CHECKING: from cuda.core.experimental._memory.buffer import DevicePointerT @@ -25,7 +27,7 @@ class LegacyPinnedMemoryResource(MemoryResource): # TODO: support creating this MR with flags that are later passed to cuMemHostAlloc? - def allocate(self, size, stream = None) -> Buffer: + def allocate(self, size, stream=None) -> Buffer: """Allocate a buffer of the requested size. Parameters @@ -42,6 +44,7 @@ def allocate(self, size, stream = None) -> Buffer: """ if stream is None: from cuda.core.experimental._stream import default_stream + stream = default_stream() err, ptr = driver.cuMemAllocHost(size) raise_if_driver_error(err) @@ -60,7 +63,7 @@ def deallocate(self, ptr: DevicePointerT, size, stream): The stream on which to perform the deallocation synchronously. 
""" stream.sync() - err, = driver.cuMemFreeHost(ptr) + (err,) = driver.cuMemFreeHost(ptr) raise_if_driver_error(err) @property @@ -83,11 +86,12 @@ class _SynchronousMemoryResource(MemoryResource): __slots__ = ("_dev_id",) def __init__(self, device_id): - self._dev_id = getattr(device_id, 'device_id', device_id) + self._dev_id = getattr(device_id, "device_id", device_id) def allocate(self, size, stream=None) -> Buffer: if stream is None: from cuda.core.experimental._stream import default_stream + stream = default_stream() err, ptr = driver.cuMemAlloc(size) raise_if_driver_error(err) @@ -95,7 +99,7 @@ def allocate(self, size, stream=None) -> Buffer: def deallocate(self, ptr, size, stream): stream.sync() - err, = driver.cuMemFree(ptr) + (err,) = driver.cuMemFree(ptr) raise_if_driver_error(err) @property @@ -109,5 +113,3 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: return self._dev_id - - diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py index ebf7895076..3f55614839 100644 --- a/cuda_core/cuda/core/experimental/_memory/_vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -2,16 +2,20 @@ # # SPDX-License-Identifier: Apache-2.0 +import platform from dataclasses import dataclass, field from typing import Iterable, Literal, Optional, Union -import platform -from cuda.core.experimental._stream import Stream from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import (driver, Transaction, get_binding_version ) +from cuda.core.experimental._stream import Stream from cuda.core.experimental._utils.cuda_utils import ( - _check_driver_error as raise_if_driver_error, + Transaction, check_or_create_options, + driver, + get_binding_version, +) +from cuda.core.experimental._utils.cuda_utils import ( + _check_driver_error as raise_if_driver_error, ) __all__ = ["VirtualMemoryResourceOptions", 
"VirtualMemoryResource"] @@ -54,6 +58,7 @@ class VirtualMemoryResourceOptions: peer_access: :obj:`~_memory.VirtualMemoryAccessTypeT` Access flags for peers. """ + # Human-friendly strings; normalized in __post_init__ allocation_type: VirtualMemoryAllocationTypeT = "pinned" location_type: VirtualMemoryLocationTypeT = "device" @@ -69,11 +74,25 @@ class VirtualMemoryResourceOptions: _a = driver.CUmemAccess_flags _access_flags = {"rw": _a.CU_MEM_ACCESS_FLAGS_PROT_READWRITE, "r": _a.CU_MEM_ACCESS_FLAGS_PROT_READ, None: 0} _h = driver.CUmemAllocationHandleType - _handle_types = {None: _h.CU_MEM_HANDLE_TYPE_NONE, "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC} + _handle_types = { + None: _h.CU_MEM_HANDLE_TYPE_NONE, + "posix_fd": _h.CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, + "win32": _h.CU_MEM_HANDLE_TYPE_WIN32, + "win32_kmt": _h.CU_MEM_HANDLE_TYPE_WIN32_KMT, + "fabric": _h.CU_MEM_HANDLE_TYPE_FABRIC, + } _g = driver.CUmemAllocationGranularity_flags - _granularity = {"recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM} + _granularity = { + "recommended": _g.CU_MEM_ALLOC_GRANULARITY_RECOMMENDED, + "minimum": _g.CU_MEM_ALLOC_GRANULARITY_MINIMUM, + } _l = driver.CUmemLocationType - _location_type = {"device": _l.CU_MEM_LOCATION_TYPE_DEVICE, "host": _l.CU_MEM_LOCATION_TYPE_HOST, "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT} + _location_type = { + "device": _l.CU_MEM_LOCATION_TYPE_DEVICE, + "host": _l.CU_MEM_LOCATION_TYPE_HOST, + "host_numa": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA, + "host_numa_current": _l.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT, + } # CUDA 13+ exposes MANAGED in CUmemAllocationType; older 12.x does not _a = driver.CUmemAllocationType _allocation_type = {"pinned": _a.CU_MEM_ALLOCATION_TYPE_PINNED} @@ -128,6 +147,7 @@ 
class VirtualMemoryResource(MemoryResource): config : VirtualMemoryResourceOptions A configuration object for the VirtualMemoryResource """ + def __init__(self, device, config: VirtualMemoryResourceOptions = None): self.device = device self.config = check_or_create_options( @@ -139,9 +159,12 @@ def __init__(self, device, config: VirtualMemoryResourceOptions = None): raise NotImplementedError("VirtualMemoryResource is not supported on Windows") # Validate RDMA support if requested - if self.config.gpu_direct_rdma and self.device is not None: - if not self.device.properties.gpu_direct_rdma_supported: - raise RuntimeError("GPU Direct RDMA is not supported on this device") + if ( + self.config.gpu_direct_rdma + and self.device is not None + and not self.device.properties.gpu_direct_rdma_supported + ): + raise RuntimeError("GPU Direct RDMA is not supported on this device") @staticmethod def _align_up(size: int, gran: int) -> int: @@ -196,7 +219,7 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes # Same size: only update access policy if needed; avoid zero-sized driver calls descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(int(buf.handle), buf.size, descs, len(descs)) raise_if_driver_error(res) return buf @@ -210,23 +233,31 @@ def modify_allocation(self, buf: Buffer, new_size: int, config: VirtualMemoryRes aligned_additional_size, addr_align, int(buf.handle) + aligned_prev_size, # fixedAddr hint - aligned end of current range - 0 + 0, ) if res != driver.CUresult.CUDA_SUCCESS or new_ptr != (int(buf.handle) + aligned_prev_size): # Check for specific errors that are not recoverable with the slow path - if res in (driver.CUresult.CUDA_ERROR_INVALID_VALUE, driver.CUresult.CUDA_ERROR_NOT_PERMITTED, driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, driver.CUresult.CUDA_ERROR_NOT_SUPPORTED): + if res in ( + 
driver.CUresult.CUDA_ERROR_INVALID_VALUE, + driver.CUresult.CUDA_ERROR_NOT_PERMITTED, + driver.CUresult.CUDA_ERROR_NOT_INITIALIZED, + driver.CUresult.CUDA_ERROR_NOT_SUPPORTED, + ): raise_if_driver_error(res) - res2, = driver.cuMemAddressFree(new_ptr, aligned_additional_size) + (res2,) = driver.cuMemAddressFree(new_ptr, aligned_additional_size) raise_if_driver_error(res2) # Fallback: couldn't extend contiguously, need full remapping - return self._grow_allocation_slow_path(buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align) + return self._grow_allocation_slow_path( + buf, new_size, prop, aligned_additional_size, total_aligned_size, addr_align + ) else: # Success! We can extend the VA range contiguously return self._grow_allocation_fast_path(buf, new_size, prop, aligned_additional_size, new_ptr) - def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, new_ptr: int) -> Buffer: + def _grow_allocation_fast_path( + self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, aligned_additional_size: int, new_ptr: int + ) -> Buffer: """ Fast path for growing a virtual memory allocation when the new region can be reserved contiguously after the existing buffer. @@ -236,33 +267,47 @@ def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CU remains unchanged). Args: - buf (Buffer): The buffer to grow. - new_size (int): The new total size in bytes. - prop (driver.CUmemAllocationProp): Allocation properties for the new memory. - aligned_additional_size (int): The size of the new region to allocate, aligned to granularity. - new_ptr (int): The address of the newly reserved contiguous VA region (should be at the end of the current buffer). + buf (Buffer): + The buffer to grow. + + new_size (int): + The new total size in bytes. + + prop (driver.CUmemAllocationProp): + Allocation properties for the new memory. 
+ + aligned_additional_size (int): + The size of the new region to allocate, aligned to granularity. + + new_ptr (int): + The address of the newly reserved contiguous VA region (should + be at the end of the current buffer). Returns: Buffer: The same buffer object with its size updated to `new_size`. """ with Transaction() as trans: # Create new physical memory for the additional size - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + trans.append( + lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0]) + ) res, new_handle = driver.cuMemCreate(aligned_additional_size, prop, 0) raise_if_driver_error(res) # Register undo for creation trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended VA range - res, = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) + (res,) = driver.cuMemMap(new_ptr, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0])) + trans.append( + lambda np=new_ptr, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(np, s)[0]) + ) # Set access permissions for the new portion descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(new_ptr, aligned_additional_size, descs, len(descs)) raise_if_driver_error(res) # All succeeded, cancel undo actions @@ -272,8 +317,15 @@ def _grow_allocation_fast_path(self, buf: Buffer, new_size: int, prop: driver.CU buf._size = new_size return buf - def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CUmemAllocationProp, - aligned_additional_size: int, total_aligned_size: int, addr_align: int) -> Buffer: + 
def _grow_allocation_slow_path( + self, + buf: Buffer, + new_size: int, + prop: driver.CUmemAllocationProp, + aligned_additional_size: int, + total_aligned_size: int, + addr_align: int, + ) -> Buffer: """ Slow path for growing a virtual memory allocation when the new region cannot be reserved contiguously after the existing buffer. @@ -299,7 +351,9 @@ def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CU res, new_ptr = driver.cuMemAddressReserve(total_aligned_size, addr_align, 0, 0) raise_if_driver_error(res) # Register undo for VA reservation - trans.append(lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0])) + trans.append( + lambda np=new_ptr, s=total_aligned_size: raise_if_driver_error(driver.cuMemAddressFree(np, s)[0]) + ) # Get the old allocation handle for remapping result, old_handle = driver.cuMemRetainAllocationHandle(buf.handle) @@ -309,20 +363,22 @@ def _grow_allocation_slow_path(self, buf: Buffer, new_size: int, prop: driver.CU # Unmap the old VA range (aligned previous size) aligned_prev_size = total_aligned_size - aligned_additional_size - result, = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) + (result,) = driver.cuMemUnmap(int(buf.handle), aligned_prev_size) raise_if_driver_error(result) def _remap_old(): # Try to remap the old physical memory back to the original VA range try: - res, = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) + (res,) = driver.cuMemMap(int(buf.handle), aligned_prev_size, 0, old_handle, 0) raise_if_driver_error(res) - except Exception: + except Exception: # noqa: S110 + # TODO: consider logging this exception pass + trans.append(_remap_old) # Remap the old physical memory to the new VA range (aligned previous size) - res, = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) + (res,) = driver.cuMemMap(int(new_ptr), aligned_prev_size, 0, old_handle, 0) raise_if_driver_error(res) # Register undo for mapping @@ 
-336,23 +392,27 @@ def _remap_old(): trans.append(lambda h=new_handle: raise_if_driver_error(driver.cuMemRelease(h)[0])) # Map the new physical memory to the extended portion (aligned offset) - res, = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) + (res,) = driver.cuMemMap(int(new_ptr) + aligned_prev_size, aligned_additional_size, 0, new_handle, 0) raise_if_driver_error(res) # Register undo for mapping - trans.append(lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error(driver.cuMemUnmap(base + offs, s)[0])) + trans.append( + lambda base=int(new_ptr), offs=aligned_prev_size, s=aligned_additional_size: raise_if_driver_error( + driver.cuMemUnmap(base + offs, s)[0] + ) + ) # Set access permissions for the entire new range descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(new_ptr, total_aligned_size, descs, len(descs)) raise_if_driver_error(res) # All succeeded, cancel undo actions trans.commit() # Free the old VA range (aligned previous size) - res2, = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) + (res2,) = driver.cuMemAddressFree(int(buf.handle), aligned_prev_size) raise_if_driver_error(res2) # Invalidate the old buffer so its destructor won't try to free again @@ -361,7 +421,6 @@ def _remap_old(): # Return a new Buffer for the new mapping return Buffer.from_handle(ptr=new_ptr, size=new_size, mr=self) - def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: """ Build access descriptors for memory access permissions. @@ -394,7 +453,6 @@ def _build_access_descriptors(self, prop: driver.CUmemAllocationProp) -> list: return descs - def allocate(self, size: int, stream: Stream = None) -> Buffer: """ Allocate a buffer of the given size using CUDA virtual memory. 
@@ -463,14 +521,14 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer: trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemAddressFree(p, s)[0])) # ---- Map physical memory into VA ---- - res, = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) + (res,) = driver.cuMemMap(ptr, aligned_size, 0, handle, 0) trans.append(lambda p=ptr, s=aligned_size: raise_if_driver_error(driver.cuMemUnmap(p, s)[0])) raise_if_driver_error(res) # ---- Set access for owner + peers ---- descs = self._build_access_descriptors(prop) if descs: - res, = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) + (res,) = driver.cuMemSetAccess(ptr, aligned_size, descs, len(descs)) raise_if_driver_error(res) trans.commit() @@ -479,20 +537,19 @@ def allocate(self, size: int, stream: Stream = None) -> Buffer: buf = Buffer.from_handle(ptr=ptr, size=aligned_size, mr=self) return buf - def deallocate(self, ptr: int, size: int, stream: Stream=None) -> None: + def deallocate(self, ptr: int, size: int, stream: Stream = None) -> None: """ Deallocate memory on the device using CUDA VMM APIs. 
""" result, handle = driver.cuMemRetainAllocationHandle(ptr) raise_if_driver_error(result) - result, = driver.cuMemUnmap(ptr, size) + (result,) = driver.cuMemUnmap(ptr, size) raise_if_driver_error(result) - result, = driver.cuMemAddressFree(ptr, size) + (result,) = driver.cuMemAddressFree(ptr, size) raise_if_driver_error(result) - result, = driver.cuMemRelease(handle) + (result,) = driver.cuMemRelease(handle) raise_if_driver_error(result) - @property def is_device_accessible(self) -> bool: """ diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 0be8513d58..df6865c915 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -3,7 +3,6 @@ import multiprocessing as mp import multiprocessing.reduction -import os from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8879d2dee1..a261ec7a3d 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -126,23 +126,25 @@ class NullMemoryResource(DummyHostMemoryResource): def is_host_accessible(self) -> bool: return False + def test_package_contents(): expected = [ - 'Buffer', - 'MemoryResource', - 'DeviceMemoryResource', - 'DeviceMemoryResourceOptions', - 'IPCBufferDescriptor', - 'IPCAllocationHandle', - 'LegacyPinnedMemoryResource', - 'VirtualMemoryResourceOptions', - 'VirtualMemoryResource' + "Buffer", + "MemoryResource", + "DeviceMemoryResource", + "DeviceMemoryResourceOptions", + "IPCBufferDescriptor", + "IPCAllocationHandle", + "LegacyPinnedMemoryResource", + "VirtualMemoryResourceOptions", + "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) - d = {k:v for k,v in d.items() if not k.startswith("__")} + exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + d = {k: v for k, v in d.items() if 
not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) + def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) assert buffer.handle != 0 From 44f7587af494c8ab0261396c8c71c51d2465625b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 31 Oct 2025 15:53:37 -0700 Subject: [PATCH 26/30] Touch-ups --- .../cuda/core/experimental/_memory/_buffer.pyx | 6 +----- .../cuda/core/experimental/_memory/_dmr.pxd | 3 +-- .../cuda/core/experimental/_memory/_dmr.pyx | 17 ++++++++++++----- .../cuda/core/experimental/_memory/_vmm.py | 6 +++--- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 69910c9869..94aa2ee871 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -39,11 +39,7 @@ cdef class Buffer: Support for data interchange mechanisms are provided by DLPack. 
""" def __cinit__(self): - self._ptr = 0 - self._size = 0 - self._mr = None - self._ptr_obj = None - self._alloc_stream = None + self._clear() def _clear(self): self._ptr = 0 diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd index 945291b6e4..cdd00de067 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pxd @@ -3,9 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver - from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData +from cuda.core.experimental._memory._ipc cimport IPCData cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index fc541406a0..e2de36088e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -14,7 +14,6 @@ from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCData from cuda.core.experimental._stream cimport default_stream, Stream from cuda.core.experimental._utils.cuda_utils cimport ( - _check_driver_error as raise_if_driver_error, check_or_create_options, HANDLE_RETURN, ) @@ -75,9 +74,8 @@ class DeviceMemoryResourceAttributes: mr = self._mr() if mr is None: raise RuntimeError("DeviceMemoryResource is expired") - # TODO: this implementation does not allow lowering to Cython + nogil - err, value = driver.cuMemPoolGetAttribute(mr.handle, attr_enum) - raise_if_driver_error(err) + value = DMRA_getattribute( mr.handle, + attr_enum) return property_type(value) return property(fget=fget, doc=stub.__doc__) return decorator @@ -117,6 +115,15 @@ class DeviceMemoryResourceAttributes: del mempool_property +cdef int DMRA_getattribute( + cydriver.CUmemoryPool 
pool_handle, cydriver.CUmemPool_attribute attr_enum +): + cdef int value + with nogil: + HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, &value)) + return value + + cdef class DeviceMemoryResource(MemoryResource): """ Create a device memory resource managing a stream-ordered memory pool. @@ -199,9 +206,9 @@ cdef class DeviceMemoryResource(MemoryResource): def __cinit__(self): self._dev_id = cydriver.CU_DEVICE_INVALID self._handle = NULL - self._attributes = None self._mempool_owned = False self._ipc_data = None + self._attributes = None def __init__(self, device_id: int | Device, options=None): cdef int dev_id = getattr(device_id, 'device_id', device_id) diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_vmm.py index 3f55614839..ab742e4273 100644 --- a/cuda_core/cuda/core/experimental/_memory/_vmm.py +++ b/cuda_core/cuda/core/experimental/_memory/_vmm.py @@ -4,7 +4,7 @@ import platform from dataclasses import dataclass, field -from typing import Iterable, Literal, Optional, Union +from typing import Iterable, Literal, Union from cuda.core.experimental._memory._buffer import Buffer, MemoryResource from cuda.core.experimental._stream import Stream @@ -65,8 +65,8 @@ class VirtualMemoryResourceOptions: handle_type: VirtualMemoryHandleTypeT = "posix_fd" granularity: VirtualMemoryGranularityT = "recommended" gpu_direct_rdma: bool = False - addr_hint: Optional[int] = 0 - addr_align: Optional[int] = None + addr_hint: int | None = 0 + addr_align: int | None = None peers: Iterable[int] = field(default_factory=tuple) self_access: VirtualMemoryAccessTypeT = "rw" peer_access: VirtualMemoryAccessTypeT = "rw" From 0fac80086a75b3e17ba39ff7f1217db2b78be000 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Mon, 3 Nov 2025 17:24:53 -0800 Subject: [PATCH 27/30] Cythonize DeviceMemoryResourceAttributes. 
--- .../cuda/core/experimental/_memory/_dmr.pyx | 61 ++++++++++--------- 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index e2de36088e..b64b6a6842 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -53,66 +53,67 @@ cdef class DeviceMemoryResourceOptions: max_size : cython.size_t = 0 -# TODO: cythonize this? -class DeviceMemoryResourceAttributes: +cdef class DeviceMemoryResourceAttributes: + cdef: + object _mr_weakref + def __init__(self, *args, **kwargs): raise RuntimeError("DeviceMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.") @classmethod - def _init(cls, mr : DeviceMemoryReference): - self = DeviceMemoryResourceAttributes.__new__(cls) - self._mr = mr + def _init(cls, mr): + cdef DeviceMemoryResourceAttributes self = DeviceMemoryResourceAttributes.__new__(cls) + self._mr_weakref = mr return self - def mempool_property(property_type: type): - def decorator(stub): - attr_enum = getattr( - driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" - ) - - def fget(self) -> property_type: - mr = self._mr() - if mr is None: - raise RuntimeError("DeviceMemoryResource is expired") - value = DMRA_getattribute( mr.handle, - attr_enum) - return property_type(value) - return property(fget=fget, doc=stub.__doc__) - return decorator - - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_follow_event_dependencies(self): """Allow memory to be reused when there are event dependencies between streams.""" - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_allow_opportunistic(self): """Allow reuse of completed frees without dependencies.""" - @mempool_property(bool) + @DMRA_mempool_attribute(bool) def reuse_allow_internal_dependencies(self): """Allow insertion of new stream dependencies for memory reuse.""" 
- @mempool_property(int) + @DMRA_mempool_attribute(int) def release_threshold(self): """Amount of reserved memory to hold before OS release.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def reserved_mem_current(self): """Current amount of backing memory allocated.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def reserved_mem_high(self): """High watermark of backing memory allocated.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def used_mem_current(self): """Current amount of memory in use.""" - @mempool_property(int) + @DMRA_mempool_attribute(int) def used_mem_high(self): """High watermark of memory in use.""" - del mempool_property + +cdef DMRA_mempool_attribute(property_type: type): + def decorator(stub): + attr_enum = getattr( + driver.CUmemPool_attribute, f"CU_MEMPOOL_ATTR_{stub.__name__.upper()}" + ) + + def fget(DeviceMemoryResourceAttributes self) -> property_type: + cdef DeviceMemoryResource mr = self._mr_weakref() + if mr is None: + raise RuntimeError("DeviceMemoryResource is expired") + value = DMRA_getattribute( mr.handle, + attr_enum) + return property_type(value) + return property(fget=fget, doc=stub.__doc__) + return decorator cdef int DMRA_getattribute( From 0d5f08b9be79f69bb5ec3ba404903287dd58f04b Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Tue, 4 Nov 2025 09:30:27 -0800 Subject: [PATCH 28/30] Restore previous behavior for DMR.from_allocation_handle when passed a file descriptor (caller closes the fd). 
--- cuda_core/cuda/core/experimental/_memory/_dmr.pyx | 4 +++- cuda_core/cuda/core/experimental/_memory/_ipc.pyx | 14 +++++++++++--- cuda_core/tests/memory_ipc/test_serialize.py | 2 ++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index b64b6a6842..db631bfb3d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -276,7 +276,9 @@ cdef class DeviceMemoryResource(MemoryResource): resource is created. alloc_handle : int | IPCAllocationHandle - The shareable handle of the device memory resource to import. + The shareable handle of the device memory resource to import. If an + integer is supplied, it must represent a valid platform-specific + handle. It is the caller's responsibility to close that handle. Returns ------- diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 4856ce5546..706119c3fb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -180,13 +180,21 @@ cdef Buffer Buffer_from_ipc_descriptor( cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle): # Quick exit for registry hits. - if isinstance(alloc_handle, int): - alloc_handle = IPCAllocationHandle._init(alloc_handle, None) uuid = getattr(alloc_handle, 'uuid', None) # no-cython-lint mr = registry.get(uuid) if mr is not None: return mr + # Ensure we have an allocation handle. Duplicate the file descriptor, if + # necessary. + if isinstance(alloc_handle, int): + fd = os.dup(alloc_handle) + try: + alloc_handle = IPCAllocationHandle._init(fd, None) + except: + os.close(fd) + raise + # Construct a new DMR. 
cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls) self._dev_id = getattr(device_id, 'device_id', device_id) @@ -205,7 +213,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl registered = self.register(uuid) assert registered is self - # Always close the file handle (caller can dup it, if needed). + # Always close the file handle. alloc_handle.close() return self diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index df6865c915..ceac50e502 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -3,6 +3,7 @@ import multiprocessing as mp import multiprocessing.reduction +import os from cuda.core.experimental import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen @@ -59,6 +60,7 @@ def child_main(self, conn): # Receive the memory resource. handle = mp.reduction.recv_handle(conn) mr = DeviceMemoryResource.from_allocation_handle(device, handle) + os.close(handle) # Receive the buffers. buffer1 = conn.recv() # directly From 7315e285dce18cca74634eb60942794c8eb9b01d Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Fri, 7 Nov 2025 12:42:35 -0800 Subject: [PATCH 29/30] Rename _mr to _memory_resource. Change pointer types from intptr_t to uintptr_t. 
--- .../core/experimental/_memory/_buffer.pxd | 6 ++-- .../core/experimental/_memory/_buffer.pyx | 32 +++++++++---------- .../cuda/core/experimental/_memory/_dmr.pyx | 10 +++--- .../cuda/core/experimental/_memory/_ipc.pyx | 8 ++--- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd index a684c97f98..12da84b2bd 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pxd @@ -2,16 +2,16 @@ # # SPDX-License-Identifier: Apache-2.0 -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from cuda.core.experimental._stream cimport Stream cdef class Buffer: cdef: - intptr_t _ptr + uintptr_t _ptr size_t _size - MemoryResource _mr + MemoryResource _memory_resource object _ptr_obj Stream _alloc_stream diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 94aa2ee871..61d4f191d0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -4,7 +4,7 @@ from __future__ import annotations -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor @@ -44,7 +44,7 @@ cdef class Buffer: def _clear(self): self._ptr = 0 self._size = 0 - self._mr = None + self._memory_resource = None self._ptr_obj = None self._alloc_stream = None @@ -58,10 +58,10 @@ cdef class Buffer: stream: Stream | None = None ): cdef Buffer self = Buffer.__new__(cls) - self._ptr = (int(ptr)) + self._ptr = (int(ptr)) self._ptr_obj = ptr self._size = size - self._mr = mr + self._memory_resource = mr self._alloc_stream = (stream) if stream is not None else None return self @@ -138,10 +138,10 @@ cdef class Buffer: cdef size_t src_size = 
self._size if dst is None: - if self._mr is None: + if self._memory_resource is None: raise ValueError("a destination buffer must be provided (this " "buffer does not have a memory_resource)") - dst = self._mr.allocate(src_size, stream) + dst = self._memory_resource.allocate(src_size, stream) cdef size_t dst_size = dst._size if dst_size != src_size: @@ -226,8 +226,8 @@ cdef class Buffer: @property def device_id(self) -> int: """Return the device ordinal of this buffer.""" - if self._mr is not None: - return self._mr.device_id + if self._memory_resource is not None: + return self._memory_resource.device_id raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property @@ -250,21 +250,21 @@ cdef class Buffer: @property def is_device_accessible(self) -> bool: """Return True if this buffer can be accessed by the GPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_device_accessible + if self._memory_resource is not None: + return self._memory_resource.is_device_accessible raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property def is_host_accessible(self) -> bool: """Return True if this buffer can be accessed by the CPU, otherwise False.""" - if self._mr is not None: - return self._mr.is_host_accessible + if self._memory_resource is not None: + return self._memory_resource.is_host_accessible raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource") @property def memory_resource(self) -> MemoryResource: """Return the memory resource associated with this buffer.""" - return self._mr + return self._memory_resource @property def size(self) -> int: @@ -276,7 +276,7 @@ cdef class Buffer: # --------------------- cdef Buffer_close(Buffer self, stream): cdef Stream s - if self._ptr and self._mr is not None: + if self._ptr and self._memory_resource is not None: if stream is None: if 
self._alloc_stream is not None: s = self._alloc_stream @@ -285,9 +285,9 @@ cdef Buffer_close(Buffer self, stream): s = (default_stream()) else: s = stream - self._mr.deallocate(self._ptr, self._size, s) + self._memory_resource.deallocate(self._ptr, self._size, s) self._ptr = 0 - self._mr = None + self._memory_resource = None self._ptr_obj = None self._alloc_stream = None diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx index db631bfb3d..47b6fd114e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_dmr.pyx @@ -5,7 +5,7 @@ from __future__ import annotations from libc.limits cimport ULLONG_MAX -from libc.stdint cimport uintptr_t, intptr_t +from libc.stdint cimport uintptr_t from libc.string cimport memset from cuda.bindings cimport cydriver @@ -339,7 +339,7 @@ cdef class DeviceMemoryResource(MemoryResource): If the buffer is deallocated without an explicit stream, the allocation stream is used. 
""" - DMR_deallocate(self, ptr, size, stream) + DMR_deallocate(self, ptr, size, stream) @property def attributes(self) -> DeviceMemoryResourceAttributes: @@ -466,16 +466,16 @@ cdef Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream): with nogil: HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s)) cdef Buffer buf = Buffer.__new__(Buffer) - buf._ptr = (devptr) + buf._ptr = (devptr) buf._ptr_obj = None buf._size = size - buf._mr = self + buf._memory_resource = self buf._alloc_stream = stream return buf cdef void DMR_deallocate( - DeviceMemoryResource self, intptr_t ptr, size_t size, Stream stream + DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream ) noexcept: cdef cydriver.CUstream s = stream._handle cdef cydriver.CUdeviceptr devptr = ptr diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx index 706119c3fb..5aa13af8fb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython -from libc.stdint cimport intptr_t +from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver @@ -142,7 +142,7 @@ multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_mem # Buffer IPC Implementation # ------------------------- cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self): - if not self._mr.is_ipc_enabled: + if not self.memory_resource.is_ipc_enabled: raise RuntimeError("Memory resource is not IPC-enabled") cdef cydriver.CUmemPoolPtrExportData data with nogil: @@ -172,7 +172,7 @@ cdef Buffer Buffer_from_ipc_descriptor( cdef cydriver.CUdeviceptr ptr with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportPointer(&ptr, mr._handle, &data)) - return Buffer._init(ptr, ipc_buffer.size, mr, stream) + return Buffer._init(ptr, ipc_buffer.size, mr, stream) # 
DeviceMemoryResource IPC Implementation @@ -205,7 +205,7 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl cdef int handle = int(alloc_handle) with nogil: HANDLE_RETURN(cydriver.cuMemPoolImportFromShareableHandle( - &(self._handle), (handle), IPC_HANDLE_TYPE, 0) + &(self._handle), (handle), IPC_HANDLE_TYPE, 0) ) # Register it. From cce7f6c2958f52ad985c5d78d3db1f921e90eb59 Mon Sep 17 00:00:00 2001 From: Andy Jost Date: Wed, 12 Nov 2025 10:30:03 -0800 Subject: [PATCH 30/30] Rename files _dmr.* and _vmm.py to avoid abbreviations. --- cuda_core/cuda/core/experimental/_memory/__init__.py | 4 ++-- cuda_core/cuda/core/experimental/_memory/_buffer.pyx | 2 +- .../_memory/{_dmr.pxd => _device_memory_resource.pxd} | 0 .../_memory/{_dmr.pyx => _device_memory_resource.pyx} | 0 cuda_core/cuda/core/experimental/_memory/_ipc.pxd | 2 +- .../_memory/{_vmm.py => _virtual_memory_resource.py} | 0 6 files changed, 4 insertions(+), 4 deletions(-) rename cuda_core/cuda/core/experimental/_memory/{_dmr.pxd => _device_memory_resource.pxd} (100%) rename cuda_core/cuda/core/experimental/_memory/{_dmr.pyx => _device_memory_resource.pyx} (100%) rename cuda_core/cuda/core/experimental/_memory/{_vmm.py => _virtual_memory_resource.py} (100%) diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py index 9781935cdc..3c07fbdde6 100644 --- a/cuda_core/cuda/core/experimental/_memory/__init__.py +++ b/cuda_core/cuda/core/experimental/_memory/__init__.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from ._buffer import * # noqa: F403 -from ._dmr import * # noqa: F403 +from ._device_memory_resource import * # noqa: F403 from ._ipc import * # noqa: F403 from ._legacy import * # noqa: F403 -from ._vmm import * # noqa: F403 +from ._virtual_memory_resource import * # noqa: F403 diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx index 
61d4f191d0..2251272742 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx @@ -6,7 +6,7 @@ from __future__ import annotations from libc.stdint cimport uintptr_t -from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor from cuda.core.experimental._memory cimport _ipc from cuda.core.experimental._stream cimport default_stream, Stream diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_dmr.pxd rename to cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/_dmr.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_dmr.pyx rename to cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd index 6480f32619..2b9c80290d 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd @@ -4,7 +4,7 @@ from cuda.bindings cimport cydriver from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._memory._dmr cimport DeviceMemoryResource +from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. 
This enables diff --git a/cuda_core/cuda/core/experimental/_memory/_vmm.py b/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/_vmm.py rename to cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py