From 2d8c99a89e487942ec14ae440d4c68bd250b69c6 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 12 Jan 2026 13:54:50 -0800
Subject: [PATCH 01/20] initial localized test

---
 .../cuda/bindings/_internal/nvfatbin.pxd      |  22 ++
 .../bindings/_internal/nvfatbin_linux.pyx     | 242 ++++++++++++++++++
 .../bindings/_internal/nvfatbin_windows.pyx   | 233 +++++++++++++++++
 cuda_bindings/cuda/bindings/cynvfatbin.pxd    |  53 ++++
 cuda_bindings/cuda/bindings/cynvfatbin.pyx    |  38 +++
 cuda_bindings/cuda/bindings/nvfatbin.pxd      |  37 +++
 cuda_bindings/cuda/bindings/nvfatbin.pyx      | 194 ++++++++++++++
 cuda_bindings/tests/test_nvfatbin.py          |  89 +++++++
 8 files changed, 908 insertions(+)
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
 create mode 100644 cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
 create mode 100644 cuda_bindings/cuda/bindings/cynvfatbin.pxd
 create mode 100644 cuda_bindings/cuda/bindings/cynvfatbin.pyx
 create mode 100644 cuda_bindings/cuda/bindings/nvfatbin.pxd
 create mode 100644 cuda_bindings/cuda/bindings/nvfatbin.pyx
 create mode 100644 cuda_bindings/tests/test_nvfatbin.py

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
new file mode 100644
index 0000000000..14a8a6d608
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from ..cynvfatbin cimport *
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+
+
+
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
new file mode 100644
index 0000000000..06143d9031
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -0,0 +1,242 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uintptr_t
+
+import threading
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.pathfinder import load_nvidia_dynamic_lib
+
+
+###############################################################################
+# Extern
+###############################################################################
+
+# You must 'from .utils import NotSupportedError' before using this template
+
+cdef extern from "<dlfcn.h>" nogil:
+    void* dlopen(const char*, int)
+    char* dlerror()
+    void* dlsym(void*, const char*)
+    int dlclose(void*)
+
+    enum:
+        RTLD_LAZY
+        RTLD_NOW
+        RTLD_GLOBAL
+        RTLD_LOCAL
+
+    const void* RTLD_DEFAULT 'RTLD_DEFAULT'
+
+cdef int get_cuda_version():
+    cdef void* handle = NULL
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = dlopen('libcuda.so.1', RTLD_NOW | RTLD_GLOBAL)
+    if handle == NULL:
+        err_msg = dlerror()
+        raise NotSupportedError(f'CUDA driver is not found ({err_msg.decode()})')
+    cuDriverGetVersion = dlsym(handle, "cuDriverGetVersion")
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in libcuda.so.1')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef object __symbol_lock = threading.Lock()
+cdef bint __py_nvfatbin_init = False
+
+cdef void* __nvFatbinCreate = NULL
+cdef void* __nvFatbinDestroy = NULL
+cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinSize = NULL
+cdef void* __nvFatbinGet = NULL
+cdef void* __nvFatbinVersion = NULL
+
+
+cdef void* load_library() except* with gil:
+    cdef uintptr_t handle = load_nvidia_dynamic_lib("nvfatbin")._handle_uint
+    return <void*>handle
+
+
+cdef int _init_nvfatbin() except -1 nogil:
+    global __py_nvfatbin_init
+
+    cdef void* handle = NULL
+
+    with gil, __symbol_lock:
+        # Recheck the flag after obtaining the locks
+        if __py_nvfatbin_init:
+            return 0
+
+        # Load function
+        global __nvFatbinCreate
+        __nvFatbinCreate = dlsym(RTLD_DEFAULT, 'nvFatbinCreate')
+        if __nvFatbinCreate == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinCreate = dlsym(handle, 'nvFatbinCreate')
+
+        global __nvFatbinDestroy
+        __nvFatbinDestroy = dlsym(RTLD_DEFAULT, 'nvFatbinDestroy')
+        if __nvFatbinDestroy == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinDestroy = dlsym(handle, 'nvFatbinDestroy')
+
+        global __nvFatbinAddPTX
+        __nvFatbinAddPTX = dlsym(RTLD_DEFAULT, 'nvFatbinAddPTX')
+        if __nvFatbinAddPTX == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddPTX = dlsym(handle, 'nvFatbinAddPTX')
+
+        global __nvFatbinSize
+        __nvFatbinSize = dlsym(RTLD_DEFAULT, 'nvFatbinSize')
+        if __nvFatbinSize == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinSize = dlsym(handle, 'nvFatbinSize')
+
+        global __nvFatbinGet
+        __nvFatbinGet = dlsym(RTLD_DEFAULT, 'nvFatbinGet')
+        if __nvFatbinGet == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinGet = dlsym(handle, 'nvFatbinGet')
+
+        global __nvFatbinVersion
+        __nvFatbinVersion = dlsym(RTLD_DEFAULT, 'nvFatbinVersion')
+        if __nvFatbinVersion == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinVersion = dlsym(handle, 'nvFatbinVersion')
+
+        __py_nvfatbin_init = True
+        return 0
+
+
+cdef inline int _check_or_init_nvfatbin() except -1 nogil:
+    if __py_nvfatbin_init:
+        return 0
+
+    return _init_nvfatbin()
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs
+
+    _check_or_init_nvfatbin()
+    cdef dict data = {}
+
+    global __nvFatbinCreate
+    data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
+
+    global __nvFatbinDestroy
+    data["__nvFatbinDestroy"] = <intptr_t>__nvFatbinDestroy
+
+    global __nvFatbinAddPTX
+    data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
+
+    global __nvFatbinSize
+    data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
+
+    global __nvFatbinGet
+    data["__nvFatbinGet"] = <intptr_t>__nvFatbinGet
+
+    global __nvFatbinVersion
+    data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
+
+    func_ptrs = data
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    global func_ptrs
+    if func_ptrs is None:
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinCreate
+    _check_or_init_nvfatbin()
+    if __nvFatbinCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinCreate is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*, const char**, size_t) noexcept nogil>__nvFatbinCreate)(
+        handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinDestroy
+    _check_or_init_nvfatbin()
+    if __nvFatbinDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinDestroy is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*) noexcept nogil>__nvFatbinDestroy)(
+        handle_indirect)
+
+
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddPTX
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddPTX == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddPTX is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const char*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddPTX)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinSize
+    _check_or_init_nvfatbin()
+    if __nvFatbinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinSize is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, size_t*) noexcept nogil>__nvFatbinSize)(
+        handle, size)
+
+
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinGet
+    _check_or_init_nvfatbin()
+    if __nvFatbinGet == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGet is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, void*) noexcept nogil>__nvFatbinGet)(
+        handle, buffer)
+
+
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinVersion
+    _check_or_init_nvfatbin()
+    if __nvFatbinVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinVersion is not found")
+    return (<nvFatbinResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvFatbinVersion)(
+        major, minor)
+
+
+
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
new file mode 100644
index 0000000000..cc1824bf43
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -0,0 +1,233 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t
+
+import threading
+from .utils import FunctionNotFoundError, NotSupportedError
+
+from cuda.pathfinder import load_nvidia_dynamic_lib
+
+from libc.stddef cimport wchar_t
+from libc.stdint cimport uintptr_t
+from cpython cimport PyUnicode_AsWideCharString, PyMem_Free
+
+# You must 'from .utils import NotSupportedError' before using this template
+
+cdef extern from "windows.h" nogil:
+    ctypedef void* HMODULE
+    ctypedef void* HANDLE
+    ctypedef void* FARPROC
+    ctypedef unsigned long DWORD
+    ctypedef const wchar_t *LPCWSTR
+    ctypedef const char *LPCSTR
+
+    cdef DWORD LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
+    cdef DWORD LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000
+    cdef DWORD LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100
+
+    HMODULE _LoadLibraryExW "LoadLibraryExW"(
+        LPCWSTR lpLibFileName,
+        HANDLE hFile,
+        DWORD dwFlags
+    )
+
+    FARPROC _GetProcAddress "GetProcAddress"(HMODULE hModule, LPCSTR lpProcName)
+
+cdef inline uintptr_t LoadLibraryExW(str path, HANDLE hFile, DWORD dwFlags):
+    cdef uintptr_t result
+    cdef wchar_t* wpath = PyUnicode_AsWideCharString(path, NULL)
+    with nogil:
+        result = <uintptr_t>_LoadLibraryExW(
+            wpath,
+            hFile,
+            dwFlags
+        )
+    PyMem_Free(wpath)
+    return result
+
+cdef inline void *GetProcAddress(uintptr_t hModule, const char* lpProcName) nogil:
+    return _GetProcAddress(<HMODULE>hModule, lpProcName)
+
+cdef int get_cuda_version():
+    cdef int err, driver_ver = 0
+
+    # Load driver to check version
+    handle = LoadLibraryExW("nvcuda.dll", NULL, LOAD_LIBRARY_SEARCH_SYSTEM32)
+    if handle == 0:
+        raise NotSupportedError('CUDA driver is not found')
+    cuDriverGetVersion = GetProcAddress(handle, 'cuDriverGetVersion')
+    if cuDriverGetVersion == NULL:
+        raise RuntimeError('Did not find cuDriverGetVersion symbol in nvcuda.dll')
+    err = (<int (*)(int*) noexcept nogil>cuDriverGetVersion)(&driver_ver)
+    if err != 0:
+        raise RuntimeError(f'cuDriverGetVersion returned error code {err}')
+
+    return driver_ver
+
+
+
+###############################################################################
+# Wrapper init
+###############################################################################
+
+cdef object __symbol_lock = threading.Lock()
+cdef bint __py_nvfatbin_init = False
+
+cdef void* __nvFatbinCreate = NULL
+cdef void* __nvFatbinDestroy = NULL
+cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinSize = NULL
+cdef void* __nvFatbinGet = NULL
+cdef void* __nvFatbinVersion = NULL
+
+
+cdef int _init_nvfatbin() except -1 nogil:
+    global __py_nvfatbin_init
+
+    with gil, __symbol_lock:
+        # Recheck the flag after obtaining the locks
+        if __py_nvfatbin_init:
+            return 0
+
+        # Load library
+        handle = load_nvidia_dynamic_lib("nvfatbin")._handle_uint
+
+        # Load function
+        global __nvFatbinCreate
+        __nvFatbinCreate = GetProcAddress(handle, 'nvFatbinCreate')
+
+        global __nvFatbinDestroy
+        __nvFatbinDestroy = GetProcAddress(handle, 'nvFatbinDestroy')
+
+        global __nvFatbinAddPTX
+        __nvFatbinAddPTX = GetProcAddress(handle, 'nvFatbinAddPTX')
+
+        global __nvFatbinSize
+        __nvFatbinSize = GetProcAddress(handle, 'nvFatbinSize')
+
+        global __nvFatbinGet
+        __nvFatbinGet = GetProcAddress(handle, 'nvFatbinGet')
+
+        global __nvFatbinVersion
+        __nvFatbinVersion = GetProcAddress(handle, 'nvFatbinVersion')
+
+        __py_nvfatbin_init = True
+        return 0
+
+
+cdef inline int _check_or_init_nvfatbin() except -1 nogil:
+    if __py_nvfatbin_init:
+        return 0
+
+    return _init_nvfatbin()
+
+
+cdef dict func_ptrs = None
+
+
+cpdef dict _inspect_function_pointers():
+    global func_ptrs
+    if func_ptrs is not None:
+        return func_ptrs
+
+    _check_or_init_nvfatbin()
+    cdef dict data = {}
+
+    global __nvFatbinCreate
+    data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
+
+    global __nvFatbinDestroy
+    data["__nvFatbinDestroy"] = <intptr_t>__nvFatbinDestroy
+
+    global __nvFatbinAddPTX
+    data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
+
+    global __nvFatbinSize
+    data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
+
+    global __nvFatbinGet
+    data["__nvFatbinGet"] = <intptr_t>__nvFatbinGet
+
+    global __nvFatbinVersion
+    data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
+
+    func_ptrs = data
+    return data
+
+
+cpdef _inspect_function_pointer(str name):
+    global func_ptrs
+    if func_ptrs is None:
+        func_ptrs = _inspect_function_pointers()
+    return func_ptrs[name]
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinCreate
+    _check_or_init_nvfatbin()
+    if __nvFatbinCreate == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinCreate is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*, const char**, size_t) noexcept nogil>__nvFatbinCreate)(
+        handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinDestroy
+    _check_or_init_nvfatbin()
+    if __nvFatbinDestroy == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinDestroy is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle*) noexcept nogil>__nvFatbinDestroy)(
+        handle_indirect)
+
+
+cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddPTX
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddPTX == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddPTX is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const char*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddPTX)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinSize
+    _check_or_init_nvfatbin()
+    if __nvFatbinSize == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinSize is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, size_t*) noexcept nogil>__nvFatbinSize)(
+        handle, size)
+
+
+cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinGet
+    _check_or_init_nvfatbin()
+    if __nvFatbinGet == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGet is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, void*) noexcept nogil>__nvFatbinGet)(
+        handle, buffer)
+
+
+cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinVersion
+    _check_or_init_nvfatbin()
+    if __nvFatbinVersion == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinVersion is not found")
+    return (<nvFatbinResult (*)(unsigned int*, unsigned int*) noexcept nogil>__nvFatbinVersion)(
+        major, minor)
+
+
+
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
new file mode 100644
index 0000000000..651aa27152
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uint32_t
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+# enums
+ctypedef enum nvFatbinResult "nvFatbinResult":
+    NVFATBIN_SUCCESS "NVFATBIN_SUCCESS" = 0
+    NVFATBIN_ERROR_INTERNAL "NVFATBIN_ERROR_INTERNAL"
+    NVFATBIN_ERROR_ELF_ARCH_MISMATCH "NVFATBIN_ERROR_ELF_ARCH_MISMATCH"
+    NVFATBIN_ERROR_ELF_SIZE_MISMATCH "NVFATBIN_ERROR_ELF_SIZE_MISMATCH"
+    NVFATBIN_ERROR_MISSING_PTX_VERSION "NVFATBIN_ERROR_MISSING_PTX_VERSION"
+    NVFATBIN_ERROR_NULL_POINTER "NVFATBIN_ERROR_NULL_POINTER"
+    NVFATBIN_ERROR_COMPRESSION_FAILED "NVFATBIN_ERROR_COMPRESSION_FAILED"
+    NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED "NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED"
+    NVFATBIN_ERROR_UNRECOGNIZED_OPTION "NVFATBIN_ERROR_UNRECOGNIZED_OPTION"
+    NVFATBIN_ERROR_INVALID_ARCH "NVFATBIN_ERROR_INVALID_ARCH"
+    NVFATBIN_ERROR_INVALID_NVVM "NVFATBIN_ERROR_INVALID_NVVM"
+    NVFATBIN_ERROR_EMPTY_INPUT "NVFATBIN_ERROR_EMPTY_INPUT"
+    NVFATBIN_ERROR_MISSING_PTX_ARCH "NVFATBIN_ERROR_MISSING_PTX_ARCH"
+    NVFATBIN_ERROR_PTX_ARCH_MISMATCH "NVFATBIN_ERROR_PTX_ARCH_MISMATCH"
+    NVFATBIN_ERROR_MISSING_FATBIN "NVFATBIN_ERROR_MISSING_FATBIN"
+    NVFATBIN_ERROR_INVALID_INDEX "NVFATBIN_ERROR_INVALID_INDEX"
+    NVFATBIN_ERROR_IDENTIFIER_REUSE "NVFATBIN_ERROR_IDENTIFIER_REUSE"
+    NVFATBIN_ERROR_INTERNAL_PTX_OPTION "NVFATBIN_ERROR_INTERNAL_PTX_OPTION"
+    _NVFATBINRESULT_INTERNAL_LOADING_ERROR "_NVFATBINRESULT_INTERNAL_LOADING_ERROR" = -42
+
+
+# types
+ctypedef void* nvFatbinHandle 'nvFatbinHandle'
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+
+
+
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
new file mode 100644
index 0000000000..13c9ac2cc1
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from ._internal cimport nvfatbin as _nvfatbin
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinCreate(handle_indirect, options, optionsCount)
+
+
+cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinDestroy(handle_indirect)
+
+
+cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddPTX(handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinSize(handle, size)
+
+
+cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinGet(handle, buffer)
+
+
+cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinVersion(major, minor)
+
+
+
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
new file mode 100644
index 0000000000..1350d0ed52
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+from libc.stdint cimport intptr_t, uint32_t
+
+from .cynvfatbin cimport *
+
+
+###############################################################################
+# Types
+###############################################################################
+
+ctypedef nvFatbinHandle Handle
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+ctypedef nvFatbinResult _Result
+
+
+###############################################################################
+# Functions
+###############################################################################
+
+cpdef intptr_t create(options, size_t options_count) except -1
+cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
+cpdef size_t size(intptr_t handle) except? 0
+cpdef get(intptr_t handle, buffer)
+cpdef tuple version()
+
+
+
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
new file mode 100644
index 0000000000..dcc669797e
--- /dev/null
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -0,0 +1,194 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+#
+# This code was automatically generated with version 13.0.0. Do not modify it directly.
+
+cimport cython  # NOQA
+
+from ._internal.utils cimport (get_resource_ptr, get_nested_resource_ptr, nested_resource, nullable_unique_ptr,
+                               get_buffer_pointer, get_resource_ptrs)
+
+from enum import IntEnum as _IntEnum
+from libcpp.vector cimport vector
+
+
+###############################################################################
+# Enum
+###############################################################################
+
+class Result(_IntEnum):
+    """See `nvFatbinResult`."""
+    SUCCESS = NVFATBIN_SUCCESS
+    ERROR_INTERNAL = NVFATBIN_ERROR_INTERNAL
+    ERROR_ELF_ARCH_MISMATCH = NVFATBIN_ERROR_ELF_ARCH_MISMATCH
+    ERROR_ELF_SIZE_MISMATCH = NVFATBIN_ERROR_ELF_SIZE_MISMATCH
+    ERROR_MISSING_PTX_VERSION = NVFATBIN_ERROR_MISSING_PTX_VERSION
+    ERROR_NULL_POINTER = NVFATBIN_ERROR_NULL_POINTER
+    ERROR_COMPRESSION_FAILED = NVFATBIN_ERROR_COMPRESSION_FAILED
+    ERROR_COMPRESSED_SIZE_EXCEEDED = NVFATBIN_ERROR_COMPRESSED_SIZE_EXCEEDED
+    ERROR_UNRECOGNIZED_OPTION = NVFATBIN_ERROR_UNRECOGNIZED_OPTION
+    ERROR_INVALID_ARCH = NVFATBIN_ERROR_INVALID_ARCH
+    ERROR_INVALID_NVVM = NVFATBIN_ERROR_INVALID_NVVM
+    ERROR_EMPTY_INPUT = NVFATBIN_ERROR_EMPTY_INPUT
+    ERROR_MISSING_PTX_ARCH = NVFATBIN_ERROR_MISSING_PTX_ARCH
+    ERROR_PTX_ARCH_MISMATCH = NVFATBIN_ERROR_PTX_ARCH_MISMATCH
+    ERROR_MISSING_FATBIN = NVFATBIN_ERROR_MISSING_FATBIN
+    ERROR_INVALID_INDEX = NVFATBIN_ERROR_INVALID_INDEX
+    ERROR_IDENTIFIER_REUSE = NVFATBIN_ERROR_IDENTIFIER_REUSE
+    ERROR_INTERNAL_PTX_OPTION = NVFATBIN_ERROR_INTERNAL_PTX_OPTION
+
+
+###############################################################################
+# Error handling
+###############################################################################
+
+class nvfatbinError(Exception):
+
+    def __init__(self, status):
+        self.status = status
+        s = Result(status)
+        cdef str err = f"{s.name} ({s.value})"
+        super(nvfatbinError, self).__init__(err)
+
+    def __reduce__(self):
+        return (type(self), (self.status,))
+
+
+@cython.profile(False)
+cdef int check_status(int status) except 1 nogil:
+    if status != 0:
+        with gil:
+            raise nvfatbinError(status)
+    return status
+
+
+###############################################################################
+# Wrapper functions
+###############################################################################
+
+cpdef destroy(intptr_t handle):
+    """nvFatbinDestroy frees the memory associated with the given handle.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+
+    .. seealso:: `nvFatbinDestroy`
+    """
+    cdef Handle h = <Handle>handle
+    with nogil:
+        status = nvFatbinDestroy(&h)
+    check_status(status)
+
+
+cpdef intptr_t create(options, size_t options_count) except -1:
+    """nvFatbinCreate creates a new handle.
+
+    Args:
+        options (object): An array of strings, each containing a single option. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence of 'char', or
+            - a nested Python sequence of ``str``.
+
+        options_count (size_t): Number of options.
+
+    Returns:
+        intptr_t: Address of nvFatbin handle.
+
+    .. seealso:: `nvFatbinCreate`
+    """
+    cdef nested_resource[ char ] _options_
+    get_nested_resource_ptr[char](_options_, options, <char*>NULL)
+    cdef Handle handle_indirect
+    with nogil:
+        __status__ = nvFatbinCreate(&handle_indirect, <const char**>(_options_.ptrs.data()), options_count)
+    check_status(__status__)
+    return <intptr_t>handle_indirect
+
+
+cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line):
+    """nvFatbinAddPTX adds PTX to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The PTX code.
+        size (size_t): The size of the PTX code.
+        arch (str): The numerical architecture that this PTX is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the PTX, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddPTX`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddPTX(<Handle>handle, <const char*>_code_, size, <const char*>_arch_, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
+
+
+cpdef size_t size(intptr_t handle) except? 0:
+    """nvFatbinSize returns the fatbinary's size.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+
+    Returns:
+        size_t: The fatbinary's size.
+
+    .. seealso:: `nvFatbinSize`
+    """
+    cdef size_t size
+    with nogil:
+        __status__ = nvFatbinSize(<Handle>handle, &size)
+    check_status(__status__)
+    return size
+
+
+cpdef get(intptr_t handle, buffer):
+    """nvFatbinGet returns the completed fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        buffer (bytes): memory to store fatbinary.
+
+    .. seealso:: `nvFatbinGet`
+    """
+    cdef void* _buffer_ = get_buffer_pointer(buffer, -1, readonly=False)
+    with nogil:
+        __status__ = nvFatbinGet(<Handle>handle, <void*>_buffer_)
+    check_status(__status__)
+
+
+cpdef tuple version():
+    """nvFatbinVersion returns the current version of nvFatbin.
+
+    Returns:
+        A 2-tuple containing:
+
+        - unsigned int: The major version.
+        - unsigned int: The minor version.
+
+    .. seealso:: `nvFatbinVersion`
+    """
+    cdef unsigned int major
+    cdef unsigned int minor
+    with nogil:
+        __status__ = nvFatbinVersion(&major, &minor)
+    check_status(__status__)
+    return (major, minor)
+
+
+
diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
new file mode 100644
index 0000000000..627bd300e4
--- /dev/null
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+from cuda.bindings import nvfatbin
+
+import pytest
+
+ARCHITECTURES = ["sm_75", "sm_80", "sm_90", "sm_100"]
+PTX_VERSIONS = ["6.4", "7.0", "8.5", "8.8"]
+
+PTX_TEMPLATE = """
+.version {PTX_VERSION}
+.target {ARCH}
+.address_size 64
+
+        // .globl       _Z6kernelPi
+
+.visible .entry _Z6kernelPi(
+        .param .u64 _Z6kernelPi_param_0
+)
+{{
+        .reg .b32       %r<7>;
+        .reg .b64       %rd<5>;
+
+
+        ld.param.u64    %rd1, [_Z6kernelPi_param_0];
+        cvta.to.global.u64      %rd2, %rd1;
+        mov.u32         %r1, %tid.x;
+        mov.u32         %r2, %ctaid.x;
+        mov.u32         %r3, %ntid.x;
+        mad.lo.s32      %r4, %r2, %r3, %r1;
+        mul.wide.s32    %rd3, %r4, 4;
+        add.s64         %rd4, %rd2, %rd3;
+        ld.global.u32   %r5, [%rd4];
+        add.s32         %r6, %r5, 1;
+        st.global.u32   [%rd4], %r6;
+        ret;
+
+}}
+"""
+
+@pytest.fixture(params=ARCHITECTURES)
+def arch(request):
+    return request.param
+
+@pytest.fixture(params=PTX_VERSIONS)
+def ptx_version(request):
+    return request.param
+
+@pytest.fixture
+def PTX(arch, ptx_version):
+    return PTX_TEMPLATE.format(PTX_VERSION=ptx_version, ARCH=arch)
+
+def test_nvfatbin_get_version():
+    major, minor = nvfatbin.version()
+    assert major is not None
+    assert minor is not None
+
+def test_nvfatbin_empty_create_and_destroy():
+    handle = nvfatbin.create([], 0)
+    assert handle is not None
+    nvfatbin.destroy(handle)
+
+def test_nvfatbin_invalid_input_create():
+    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_UNRECOGNIZED_OPTION"):
+        nvfatbin.create(["--unsupported_option"], 1)
+
+
+def test_nvfatbin_get_empty():
+    handle = nvfatbin.create([], 0)
+    size = nvfatbin.size(handle)
+
+    buffer = bytearray(size)
+    nvfatbin.get(handle, buffer)
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_ptx(PTX, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_ptx(handle, PTX.encode(), len(PTX), arch_numeric, "add", f"-arch={arch}")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+

From 9b1a5590a1b4f34e91cbd78991154707615a7171 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Mon, 12 Jan 2026 20:08:30 -0800
Subject: [PATCH 02/20] add rest of APIs

---
 .../cuda/bindings/_internal/nvfatbin.pxd      |   5 +
 .../bindings/_internal/nvfatbin_linux.pyx     |  65 +++++++++
 .../bindings/_internal/nvfatbin_windows.pyx   |  53 ++++++++
 cuda_bindings/cuda/bindings/cynvfatbin.pxd    |   5 +
 cuda_bindings/cuda/bindings/cynvfatbin.pyx    |  14 ++
 cuda_bindings/cuda/bindings/nvfatbin.pxd      |   5 +
 cuda_bindings/cuda/bindings/nvfatbin.pyx      |  75 ++++++++++
 cuda_bindings/tests/test_nvfatbin.py          | 128 +++++++++++++++++-
 8 files changed, 349 insertions(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index 14a8a6d608..d421e8c21e 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -14,9 +14,14 @@ from ..cynvfatbin cimport *
 cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index 06143d9031..097043f69a 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -62,6 +62,9 @@ cdef bint __py_nvfatbin_init = False
 cdef void* __nvFatbinCreate = NULL
 cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinAddCubin = NULL
+cdef void* __nvFatbinAddLTOIR = NULL
+cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
@@ -104,6 +107,27 @@ cdef int _init_nvfatbin() except -1 nogil:
                 handle = load_library()
             __nvFatbinAddPTX = dlsym(handle, 'nvFatbinAddPTX')
 
+        global __nvFatbinAddCubin
+        __nvFatbinAddCubin = dlsym(RTLD_DEFAULT, 'nvFatbinAddCubin')
+        if __nvFatbinAddCubin == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddCubin = dlsym(handle, 'nvFatbinAddCubin')
+
+        global __nvFatbinAddLTOIR
+        __nvFatbinAddLTOIR = dlsym(RTLD_DEFAULT, 'nvFatbinAddLTOIR')
+        if __nvFatbinAddLTOIR == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddLTOIR = dlsym(handle, 'nvFatbinAddLTOIR')
+
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = dlsym(RTLD_DEFAULT, 'nvFatbinAddReloc')
+        if __nvFatbinAddReloc == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddReloc = dlsym(handle, 'nvFatbinAddReloc')
+
         global __nvFatbinSize
         __nvFatbinSize = dlsym(RTLD_DEFAULT, 'nvFatbinSize')
         if __nvFatbinSize == NULL:
@@ -155,6 +179,15 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinAddPTX
     data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
 
+    global __nvFatbinAddCubin
+    data["__nvFatbinAddCubin"] = <intptr_t>__nvFatbinAddCubin
+
+    global __nvFatbinAddLTOIR
+    data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
+
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
     global __nvFatbinSize
     data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
 
@@ -209,6 +242,36 @@ cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, siz
         handle, code, size, arch, identifier, optionsCmdLine)
 
 
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddCubin
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddCubin is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddCubin)(
+        handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddLTOIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddLTOIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddLTOIR)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinSize
     _check_or_init_nvfatbin()
@@ -240,3 +303,5 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index cc1824bf43..a499637f0d 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -80,6 +80,9 @@ cdef bint __py_nvfatbin_init = False
 cdef void* __nvFatbinCreate = NULL
 cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
+cdef void* __nvFatbinAddCubin = NULL
+cdef void* __nvFatbinAddLTOIR = NULL
+cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
@@ -106,6 +109,15 @@ cdef int _init_nvfatbin() except -1 nogil:
         global __nvFatbinAddPTX
         __nvFatbinAddPTX = GetProcAddress(handle, 'nvFatbinAddPTX')
 
+        global __nvFatbinAddCubin
+        __nvFatbinAddCubin = GetProcAddress(handle, 'nvFatbinAddCubin')
+
+        global __nvFatbinAddLTOIR
+        __nvFatbinAddLTOIR = GetProcAddress(handle, 'nvFatbinAddLTOIR')
+
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = GetProcAddress(handle, 'nvFatbinAddReloc')
+
         global __nvFatbinSize
         __nvFatbinSize = GetProcAddress(handle, 'nvFatbinSize')
 
@@ -146,6 +158,15 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinAddPTX
     data["__nvFatbinAddPTX"] = <intptr_t>__nvFatbinAddPTX
 
+    global __nvFatbinAddCubin
+    data["__nvFatbinAddCubin"] = <intptr_t>__nvFatbinAddCubin
+
+    global __nvFatbinAddLTOIR
+    data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
+
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
     global __nvFatbinSize
     data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
 
@@ -200,6 +221,36 @@ cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, siz
         handle, code, size, arch, identifier, optionsCmdLine)
 
 
+cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddCubin
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddCubin == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddCubin is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddCubin)(
+        handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddLTOIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddLTOIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddLTOIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*, const char*) noexcept nogil>__nvFatbinAddLTOIR)(
+        handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinSize
     _check_or_init_nvfatbin()
@@ -231,3 +282,5 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index 651aa27152..55d8c83c1a 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -45,9 +45,14 @@ ctypedef void* nvFatbinHandle 'nvFatbinHandle'
 cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index 13c9ac2cc1..142f374c1b 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -23,6 +23,18 @@ cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size
     return _nvfatbin._nvFatbinAddPTX(handle, code, size, arch, identifier, optionsCmdLine)
 
 
+cdef nvFatbinResult nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddCubin(handle, code, size, arch, identifier)
+
+
+cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddLTOIR(handle, code, size, arch, identifier, optionsCmdLine)
+
+
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddReloc(handle, code, size)
+
+
 cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     return _nvfatbin._nvFatbinSize(handle, size)
 
@@ -36,3 +48,5 @@ cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) ex
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index 1350d0ed52..54c793962b 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -29,9 +29,14 @@ ctypedef nvFatbinResult _Result
 
 cpdef intptr_t create(options, size_t options_count) except -1
 cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
+cpdef add_cubin(intptr_t handle, code, size_t size, arch, identifier)
+cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
+cpdef add_reloc(intptr_t handle, code, size_t size)
 cpdef size_t size(intptr_t handle) except? 0
 cpdef get(intptr_t handle, buffer)
 cpdef tuple version()
 
 
 
+
+
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index dcc669797e..92db285f8a 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -139,6 +139,79 @@ cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_
     check_status(__status__)
 
 
+cpdef add_cubin(intptr_t handle, code, size_t size, arch, identifier):
+    """nvFatbinAddCubin adds a CUDA binary to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The cubin.
+        size (size_t): The size of the cubin.
+        arch (str): The numerical architecture that this cubin is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the cubin, useful when extracting the fatbin with tools like cuobjdump.
+
+    .. seealso:: `nvFatbinAddCubin`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    with nogil:
+        __status__ = nvFatbinAddCubin(<Handle>handle, <const void*>_code_, size, <const char*>_arch_, <const char*>_identifier_)
+    check_status(__status__)
+
+
+cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line):
+    """nvFatbinAddLTOIR adds LTOIR to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The LTOIR code.
+        size (size_t): The size of the LTOIR code.
+        arch (str): The numerical architecture that this LTOIR is for (the XX of any sm_XX, lto_XX, or compute_XX).
+        identifier (str): Name of the LTOIR, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddLTOIR`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(arch, str):
+        raise TypeError("arch must be a Python str")
+    cdef bytes _temp_arch_ = (<str>arch).encode()
+    cdef char* _arch_ = _temp_arch_
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddLTOIR(<Handle>handle, <const void*>_code_, size, <const char*>_arch_, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
+
+
+cpdef add_reloc(intptr_t handle, code, size_t size):
+    """nvFatbinAddReloc adds relocatable PTX entries from a host object to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The host object image.
+        size (size_t): The size of the host object image code.
+
+    .. seealso:: `nvFatbinAddReloc`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    with nogil:
+        __status__ = nvFatbinAddReloc(<Handle>handle, <const void*>_code_, size)
+    check_status(__status__)
+
+
 cpdef size_t size(intptr_t handle) except? 0:
     """nvFatbinSize returns the fatbinary's size.
 
@@ -192,3 +265,5 @@ cpdef tuple version():
 
 
 
+
+
diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 627bd300e4..3e893852b5 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -1,7 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-from cuda.bindings import nvfatbin
+import subprocess
+
+from cuda.bindings import nvfatbin, nvrtc
 
 import pytest
 
@@ -39,6 +41,12 @@
 }}
 """
 
+CODE = """
+int __device__ inc(int x) {
+    return x + 1;
+}
+"""
+
 @pytest.fixture(params=ARCHITECTURES)
 def arch(request):
     return request.param
@@ -51,6 +59,63 @@ def ptx_version(request):
 def PTX(arch, ptx_version):
     return PTX_TEMPLATE.format(PTX_VERSION=ptx_version, ARCH=arch)
 
+@pytest.fixture
+def CUBIN(arch):
+    def CHECK_NVRTC(err):
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError(repr(err))
+
+    err, program_handle = nvrtc.nvrtcCreateProgram(CODE.encode(), b"", 0, [], [])
+    CHECK_NVRTC(err)
+    err = nvrtc.nvrtcCompileProgram(program_handle, 1, [f"-arch={arch}".encode()])[0]
+    CHECK_NVRTC(err)
+    err, size = nvrtc.nvrtcGetCUBINSize(program_handle)
+    CHECK_NVRTC(err)
+    cubin = b" " * size
+    (err,) = nvrtc.nvrtcGetCUBIN(program_handle, cubin)
+    CHECK_NVRTC(err)
+    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
+    CHECK_NVRTC(err)
+    return cubin
+
+# create a valid LTOIR input for testing
+@pytest.fixture
+def LTOIR(arch):
+    arch = arch.replace("sm", "compute")
+    def CHECK_NVRTC(err):
+        if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
+            raise RuntimeError(repr(err))
+
+    empty_cplusplus_kernel = "__global__ void A() {}"
+    err, program_handle = nvrtc.nvrtcCreateProgram(empty_cplusplus_kernel.encode(), b"", 0, [], [])
+    CHECK_NVRTC(err)
+    err = nvrtc.nvrtcCompileProgram(program_handle, 1, [b"-dlto", f"-arch={arch}".encode()])[0]
+    CHECK_NVRTC(err)
+    err, size = nvrtc.nvrtcGetLTOIRSize(program_handle)
+    CHECK_NVRTC(err)
+    empty_kernel_ltoir = b" " * size
+    (err,) = nvrtc.nvrtcGetLTOIR(program_handle, empty_kernel_ltoir)
+    CHECK_NVRTC(err)
+    (err,) = nvrtc.nvrtcDestroyProgram(program_handle)
+    CHECK_NVRTC(err)
+    return empty_kernel_ltoir
+
+@pytest.fixture
+def OBJECT(arch, tmpdir):
+    if arch == "sm_100":
+        pytest.skip("sm_100 is not supported on local system.")
+
+    empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
+    with open(tmpdir / "object.cu", "w") as f:
+        f.write(empty_cplusplus_kernel)
+
+    subprocess.check_output(["nvcc", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")])
+    with open(tmpdir / "object.o", "rb") as f:
+        object = f.read()
+
+    return object
+
+
 def test_nvfatbin_get_version():
     major, minor = nvfatbin.version()
     assert major is not None
@@ -87,3 +152,64 @@ def test_nvfatbin_add_ptx(PTX, arch):
     nvfatbin.get(handle, buffer)
     nvfatbin.destroy(handle)
 
+
+@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
+def test_nvfatbin_add_cubin_ELF_SIZE_MISMATCH(CUBIN, arch):
+    handle = nvfatbin.create([], 0)
+    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+        nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_cubin(CUBIN, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), arch_numeric, "inc")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
+def test_nvfatbin_add_cubin_ELF_ARCH_MISMATCH(CUBIN, arch):
+    handle = nvfatbin.create([], 0)
+    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+        nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvdfatbin_add_ltoir(LTOIR, arch):
+    arch_numeric = arch.split("_")[1]
+
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), arch_numeric, "inc", "")
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
+
+
+@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
+def test_nvdfatbin_add_ltoir_ELF_ARCH_MISMATCH(LTOIR, arch):
+    pytest.skip()
+    handle = nvfatbin.create([], 0)
+    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+        nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), "75", "inc", "")
+
+    nvfatbin.destroy(handle)
+
+
+def test_nvfatbin_add_reloc(OBJECT):
+    handle = nvfatbin.create([], 0)
+    nvfatbin.add_reloc(handle, OBJECT, len(OBJECT))
+
+    buffer = bytearray(nvfatbin.size(handle))
+
+    nvfatbin.get(handle, buffer)
+    nvfatbin.destroy(handle)
\ No newline at end of file

From a979dd08988ab86efff816e139903c50e8081a11 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 10:30:23 -0800
Subject: [PATCH 03/20] remove local skips

---
 cuda_bindings/tests/test_nvfatbin.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 3e893852b5..ae29b3c5ae 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -102,9 +102,6 @@ def CHECK_NVRTC(err):
 
 @pytest.fixture
 def OBJECT(arch, tmpdir):
-    if arch == "sm_100":
-        pytest.skip("sm_100 is not supported on local system.")
-
     empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
     with open(tmpdir / "object.cu", "w") as f:
         f.write(empty_cplusplus_kernel)
@@ -197,7 +194,6 @@ def test_nvdfatbin_add_ltoir(LTOIR, arch):
 
 @pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
 def test_nvdfatbin_add_ltoir_ELF_ARCH_MISMATCH(LTOIR, arch):
-    pytest.skip()
     handle = nvfatbin.create([], 0)
     with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
         nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), "75", "inc", "")

From fc4203ec0060257bb9d136866f31db2f3fdeeb35 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 10:55:37 -0800
Subject: [PATCH 04/20] regenerate for CUDA 13.1 and with tile IR API

---
 .../cuda/bindings/_internal/nvfatbin.pxd      |  6 +-
 .../bindings/_internal/nvfatbin_linux.pyx     | 66 ++++++++++++-------
 .../bindings/_internal/nvfatbin_windows.pyx   | 54 ++++++++++-----
 cuda_bindings/cuda/bindings/cynvfatbin.pxd    |  6 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pyx    | 15 +++--
 cuda_bindings/cuda/bindings/nvfatbin.pxd      |  6 +-
 cuda_bindings/cuda/bindings/nvfatbin.pyx      | 61 ++++++++++++-----
 7 files changed, 146 insertions(+), 68 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index d421e8c21e..fed9968e39 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from ..cynvfatbin cimport *
 
@@ -16,10 +16,12 @@ cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NV
 cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
-cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index 097043f69a..b78f15e73b 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
@@ -64,10 +64,11 @@ cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
 cdef void* __nvFatbinAddCubin = NULL
 cdef void* __nvFatbinAddLTOIR = NULL
-cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddReloc = NULL
+cdef void* __nvFatbinAddTileIR = NULL
 
 
 cdef void* load_library() except* with gil:
@@ -121,13 +122,6 @@ cdef int _init_nvfatbin() except -1 nogil:
                 handle = load_library()
             __nvFatbinAddLTOIR = dlsym(handle, 'nvFatbinAddLTOIR')
 
-        global __nvFatbinAddReloc
-        __nvFatbinAddReloc = dlsym(RTLD_DEFAULT, 'nvFatbinAddReloc')
-        if __nvFatbinAddReloc == NULL:
-            if handle == NULL:
-                handle = load_library()
-            __nvFatbinAddReloc = dlsym(handle, 'nvFatbinAddReloc')
-
         global __nvFatbinSize
         __nvFatbinSize = dlsym(RTLD_DEFAULT, 'nvFatbinSize')
         if __nvFatbinSize == NULL:
@@ -149,6 +143,20 @@ cdef int _init_nvfatbin() except -1 nogil:
                 handle = load_library()
             __nvFatbinVersion = dlsym(handle, 'nvFatbinVersion')
 
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = dlsym(RTLD_DEFAULT, 'nvFatbinAddReloc')
+        if __nvFatbinAddReloc == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddReloc = dlsym(handle, 'nvFatbinAddReloc')
+
+        global __nvFatbinAddTileIR
+        __nvFatbinAddTileIR = dlsym(RTLD_DEFAULT, 'nvFatbinAddTileIR')
+        if __nvFatbinAddTileIR == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinAddTileIR = dlsym(handle, 'nvFatbinAddTileIR')
+
         __py_nvfatbin_init = True
         return 0
 
@@ -185,9 +193,6 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinAddLTOIR
     data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
 
-    global __nvFatbinAddReloc
-    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
-
     global __nvFatbinSize
     data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
 
@@ -197,6 +202,12 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinVersion
     data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
 
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
+    global __nvFatbinAddTileIR
+    data["__nvFatbinAddTileIR"] = <intptr_t>__nvFatbinAddTileIR
+
     func_ptrs = data
     return data
 
@@ -262,16 +273,6 @@ cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, s
         handle, code, size, arch, identifier, optionsCmdLine)
 
 
-cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
-    global __nvFatbinAddReloc
-    _check_or_init_nvfatbin()
-    if __nvFatbinAddReloc == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
-    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
-        handle, code, size)
-
-
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinSize
     _check_or_init_nvfatbin()
@@ -302,6 +303,27 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
         major, minor)
 
 
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddTileIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddTileIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
+        handle, code, size, identifier, optionsCmdLine)
+
+
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index a499637f0d..2a4ff825ab 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -82,10 +82,11 @@ cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
 cdef void* __nvFatbinAddCubin = NULL
 cdef void* __nvFatbinAddLTOIR = NULL
-cdef void* __nvFatbinAddReloc = NULL
 cdef void* __nvFatbinSize = NULL
 cdef void* __nvFatbinGet = NULL
 cdef void* __nvFatbinVersion = NULL
+cdef void* __nvFatbinAddReloc = NULL
+cdef void* __nvFatbinAddTileIR = NULL
 
 
 cdef int _init_nvfatbin() except -1 nogil:
@@ -115,9 +116,6 @@ cdef int _init_nvfatbin() except -1 nogil:
         global __nvFatbinAddLTOIR
         __nvFatbinAddLTOIR = GetProcAddress(handle, 'nvFatbinAddLTOIR')
 
-        global __nvFatbinAddReloc
-        __nvFatbinAddReloc = GetProcAddress(handle, 'nvFatbinAddReloc')
-
         global __nvFatbinSize
         __nvFatbinSize = GetProcAddress(handle, 'nvFatbinSize')
 
@@ -127,6 +125,12 @@ cdef int _init_nvfatbin() except -1 nogil:
         global __nvFatbinVersion
         __nvFatbinVersion = GetProcAddress(handle, 'nvFatbinVersion')
 
+        global __nvFatbinAddReloc
+        __nvFatbinAddReloc = GetProcAddress(handle, 'nvFatbinAddReloc')
+
+        global __nvFatbinAddTileIR
+        __nvFatbinAddTileIR = GetProcAddress(handle, 'nvFatbinAddTileIR')
+
         __py_nvfatbin_init = True
         return 0
 
@@ -164,9 +168,6 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinAddLTOIR
     data["__nvFatbinAddLTOIR"] = <intptr_t>__nvFatbinAddLTOIR
 
-    global __nvFatbinAddReloc
-    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
-
     global __nvFatbinSize
     data["__nvFatbinSize"] = <intptr_t>__nvFatbinSize
 
@@ -176,6 +177,12 @@ cpdef dict _inspect_function_pointers():
     global __nvFatbinVersion
     data["__nvFatbinVersion"] = <intptr_t>__nvFatbinVersion
 
+    global __nvFatbinAddReloc
+    data["__nvFatbinAddReloc"] = <intptr_t>__nvFatbinAddReloc
+
+    global __nvFatbinAddTileIR
+    data["__nvFatbinAddTileIR"] = <intptr_t>__nvFatbinAddTileIR
+
     func_ptrs = data
     return data
 
@@ -241,16 +248,6 @@ cdef nvFatbinResult _nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, s
         handle, code, size, arch, identifier, optionsCmdLine)
 
 
-cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
-    global __nvFatbinAddReloc
-    _check_or_init_nvfatbin()
-    if __nvFatbinAddReloc == NULL:
-        with gil:
-            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
-    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
-        handle, code, size)
-
-
 cdef nvFatbinResult _nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinSize
     _check_or_init_nvfatbin()
@@ -281,6 +278,27 @@ cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) e
         major, minor)
 
 
+cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddReloc
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddReloc == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddReloc is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t) noexcept nogil>__nvFatbinAddReloc)(
+        handle, code, size)
+
+
+cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    global __nvFatbinAddTileIR
+    _check_or_init_nvfatbin()
+    if __nvFatbinAddTileIR == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
+    return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
+        handle, code, size, identifier, optionsCmdLine)
+
+
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index 55d8c83c1a..03262af56b 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
@@ -47,10 +47,12 @@ cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVF
 cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddCubin(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
-cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index 142f374c1b..98a2c63412 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from ._internal cimport nvfatbin as _nvfatbin
 
@@ -31,10 +31,6 @@ cdef nvFatbinResult nvFatbinAddLTOIR(nvFatbinHandle handle, const void* code, si
     return _nvfatbin._nvFatbinAddLTOIR(handle, code, size, arch, identifier, optionsCmdLine)
 
 
-cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
-    return _nvfatbin._nvFatbinAddReloc(handle, code, size)
-
-
 cdef nvFatbinResult nvFatbinSize(nvFatbinHandle handle, size_t* size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     return _nvfatbin._nvFatbinSize(handle, size)
 
@@ -47,6 +43,15 @@ cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) ex
     return _nvfatbin._nvFatbinVersion(major, minor)
 
 
+cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddReloc(handle, code, size)
+
+
+cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
+    return _nvfatbin._nvFatbinAddTileIR(handle, code, size, identifier, optionsCmdLine)
+
+
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index 54c793962b..ae987e3a7d 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uint32_t
 
@@ -31,10 +31,12 @@ cpdef intptr_t create(options, size_t options_count) except -1
 cpdef add_ptx(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
 cpdef add_cubin(intptr_t handle, code, size_t size, arch, identifier)
 cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cmd_line)
-cpdef add_reloc(intptr_t handle, code, size_t size)
 cpdef size_t size(intptr_t handle) except? 0
 cpdef get(intptr_t handle, buffer)
 cpdef tuple version()
+cpdef add_reloc(intptr_t handle, code, size_t size)
+cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line)
+
 
 
 
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 92db285f8a..9470745dee 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.0.0. Do not modify it directly.
+# This code was automatically generated across versions from 12.4.1 to 13.1.0. Do not modify it directly.
 
 cimport cython  # NOQA
 
@@ -196,22 +196,6 @@ cpdef add_ltoir(intptr_t handle, code, size_t size, arch, identifier, options_cm
     check_status(__status__)
 
 
-cpdef add_reloc(intptr_t handle, code, size_t size):
-    """nvFatbinAddReloc adds relocatable PTX entries from a host object to the fatbinary.
-
-    Args:
-        handle (intptr_t): nvFatbin handle.
-        code (bytes): The host object image.
-        size (size_t): The size of the host object image code.
-
-    .. seealso:: `nvFatbinAddReloc`
-    """
-    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
-    with nogil:
-        __status__ = nvFatbinAddReloc(<Handle>handle, <const void*>_code_, size)
-    check_status(__status__)
-
-
 cpdef size_t size(intptr_t handle) except? 0:
     """nvFatbinSize returns the fatbinary's size.
 
@@ -264,6 +248,49 @@ cpdef tuple version():
     return (major, minor)
 
 
+cpdef add_reloc(intptr_t handle, code, size_t size):
+    """nvFatbinAddReloc adds relocatable PTX entries from a host object to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The host object image.
+        size (size_t): The size of the host object image code.
+
+    .. seealso:: `nvFatbinAddReloc`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    with nogil:
+        __status__ = nvFatbinAddReloc(<Handle>handle, <const void*>_code_, size)
+    check_status(__status__)
+
+
+cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line):
+    """nvFatbinAddTileIR adds Tile IR to the fatbinary.
+
+    Args:
+        handle (intptr_t): nvFatbin handle.
+        code (bytes): The Tile IR.
+        size (size_t): The size of the Tile IR.
+        identifier (str): Name of the Tile IR, useful when extracting the fatbin with tools like cuobjdump.
+        options_cmd_line (str): Options used during JIT compilation.
+
+    .. seealso:: `nvFatbinAddTileIR`
+    """
+    cdef void* _code_ = get_buffer_pointer(code, size, readonly=True)
+    if not isinstance(identifier, str):
+        raise TypeError("identifier must be a Python str")
+    cdef bytes _temp_identifier_ = (<str>identifier).encode()
+    cdef char* _identifier_ = _temp_identifier_
+    if not isinstance(options_cmd_line, str):
+        raise TypeError("options_cmd_line must be a Python str")
+    cdef bytes _temp_options_cmd_line_ = (<str>options_cmd_line).encode()
+    cdef char* _options_cmd_line_ = _temp_options_cmd_line_
+    with nogil:
+        __status__ = nvFatbinAddTileIR(<Handle>handle, <const void*>_code_, size, <const char*>_identifier_, <const char*>_options_cmd_line_)
+    check_status(__status__)
+
+
+
 
 
 

From e7ace35c50105a8e2029ebd41de90df6a0947f10 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Tue, 13 Jan 2026 13:21:34 -0800
Subject: [PATCH 05/20] nvfatbinError -> nvFatbinError

---
 cuda_bindings/cuda/bindings/nvfatbin.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 9470745dee..527b7846b0 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -43,13 +43,13 @@ class Result(_IntEnum):
 # Error handling
 ###############################################################################
 
-class nvfatbinError(Exception):
+class nvFatbinError(Exception):
 
     def __init__(self, status):
         self.status = status
         s = Result(status)
         cdef str err = f"{s.name} ({s.value})"
-        super(nvfatbinError, self).__init__(err)
+        super(nvFatbinError, self).__init__(err)
 
     def __reduce__(self):
         return (type(self), (self.status,))
@@ -59,7 +59,7 @@ class nvfatbinError(Exception):
 cdef int check_status(int status) except 1 nogil:
     if status != 0:
         with gil:
-            raise nvfatbinError(status)
+            raise nvFatbinError(status)
     return status
 
 

From d962f612da2d55d26f5d434d1baf14b96862ddb2 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 14 Jan 2026 11:55:15 -0800
Subject: [PATCH 06/20] regenerate for get_error_string

---
 .../cuda/bindings/_internal/nvfatbin.pxd      |  7 +--
 .../bindings/_internal/nvfatbin_linux.pyx     | 27 +++++++---
 .../bindings/_internal/nvfatbin_windows.pyx   | 23 +++++---
 cuda_bindings/cuda/bindings/cynvfatbin.pxd    |  7 +--
 cuda_bindings/cuda/bindings/cynvfatbin.pyx    | 10 ++--
 cuda_bindings/cuda/bindings/nvfatbin.pxd      |  6 ---
 cuda_bindings/cuda/bindings/nvfatbin.pyx      | 25 ++++++---
 cuda_bindings/tests/test_nvfatbin.py          | 52 +++++++++++++------
 8 files changed, 98 insertions(+), 59 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index fed9968e39..843759b32c 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -11,6 +11,7 @@ from ..cynvfatbin cimport *
 # Wrapper functions
 ###############################################################################
 
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil
 cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
@@ -21,9 +22,3 @@ cdef nvFatbinResult _nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NV
 cdef nvFatbinResult _nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index b78f15e73b..b50efe02b7 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -59,6 +59,7 @@ cdef int get_cuda_version():
 cdef object __symbol_lock = threading.Lock()
 cdef bint __py_nvfatbin_init = False
 
+cdef void* __nvFatbinGetErrorString = NULL
 cdef void* __nvFatbinCreate = NULL
 cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
@@ -87,6 +88,13 @@ cdef int _init_nvfatbin() except -1 nogil:
             return 0
 
         # Load function
+        global __nvFatbinGetErrorString
+        __nvFatbinGetErrorString = dlsym(RTLD_DEFAULT, 'nvFatbinGetErrorString')
+        if __nvFatbinGetErrorString == NULL:
+            if handle == NULL:
+                handle = load_library()
+            __nvFatbinGetErrorString = dlsym(handle, 'nvFatbinGetErrorString')
+
         global __nvFatbinCreate
         __nvFatbinCreate = dlsym(RTLD_DEFAULT, 'nvFatbinCreate')
         if __nvFatbinCreate == NULL:
@@ -178,6 +186,9 @@ cpdef dict _inspect_function_pointers():
     _check_or_init_nvfatbin()
     cdef dict data = {}
 
+    global __nvFatbinGetErrorString
+    data["__nvFatbinGetErrorString"] = <intptr_t>__nvFatbinGetErrorString
+
     global __nvFatbinCreate
     data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
 
@@ -223,6 +234,16 @@ cpdef _inspect_function_pointer(str name):
 # Wrapper functions
 ###############################################################################
 
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    global __nvFatbinGetErrorString
+    _check_or_init_nvfatbin()
+    if __nvFatbinGetErrorString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGetErrorString is not found")
+    return (<const char* (*)(nvFatbinResult) noexcept nogil>__nvFatbinGetErrorString)(
+        result)
+
+
 cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinCreate
     _check_or_init_nvfatbin()
@@ -321,9 +342,3 @@ cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code,
             raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
     return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
         handle, code, size, identifier, optionsCmdLine)
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index 2a4ff825ab..e40777a7e4 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -77,6 +77,7 @@ cdef int get_cuda_version():
 cdef object __symbol_lock = threading.Lock()
 cdef bint __py_nvfatbin_init = False
 
+cdef void* __nvFatbinGetErrorString = NULL
 cdef void* __nvFatbinCreate = NULL
 cdef void* __nvFatbinDestroy = NULL
 cdef void* __nvFatbinAddPTX = NULL
@@ -101,6 +102,9 @@ cdef int _init_nvfatbin() except -1 nogil:
         handle = load_nvidia_dynamic_lib("nvfatbin")._handle_uint
 
         # Load function
+        global __nvFatbinGetErrorString
+        __nvFatbinGetErrorString = GetProcAddress(handle, 'nvFatbinGetErrorString')
+
         global __nvFatbinCreate
         __nvFatbinCreate = GetProcAddress(handle, 'nvFatbinCreate')
 
@@ -153,6 +157,9 @@ cpdef dict _inspect_function_pointers():
     _check_or_init_nvfatbin()
     cdef dict data = {}
 
+    global __nvFatbinGetErrorString
+    data["__nvFatbinGetErrorString"] = <intptr_t>__nvFatbinGetErrorString
+
     global __nvFatbinCreate
     data["__nvFatbinCreate"] = <intptr_t>__nvFatbinCreate
 
@@ -198,6 +205,16 @@ cpdef _inspect_function_pointer(str name):
 # Wrapper functions
 ###############################################################################
 
+cdef const char* _nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    global __nvFatbinGetErrorString
+    _check_or_init_nvfatbin()
+    if __nvFatbinGetErrorString == NULL:
+        with gil:
+            raise FunctionNotFoundError("function nvFatbinGetErrorString is not found")
+    return (<const char* (*)(nvFatbinResult) noexcept nogil>__nvFatbinGetErrorString)(
+        result)
+
+
 cdef nvFatbinResult _nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     global __nvFatbinCreate
     _check_or_init_nvfatbin()
@@ -296,9 +313,3 @@ cdef nvFatbinResult _nvFatbinAddTileIR(nvFatbinHandle handle, const void* code,
             raise FunctionNotFoundError("function nvFatbinAddTileIR is not found")
     return (<nvFatbinResult (*)(nvFatbinHandle, const void*, size_t, const char*, const char*) noexcept nogil>__nvFatbinAddTileIR)(
         handle, code, size, identifier, optionsCmdLine)
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index 03262af56b..a01a3ea1d9 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -42,6 +42,7 @@ ctypedef void* nvFatbinHandle 'nvFatbinHandle'
 # Functions
 ###############################################################################
 
+cdef const char* nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil
 cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinDestroy(nvFatbinHandle* handle_indirect) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddPTX(nvFatbinHandle handle, const char* code, size_t size, const char* arch, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
@@ -52,9 +53,3 @@ cdef nvFatbinResult nvFatbinGet(nvFatbinHandle handle, void* buffer) except?_NVF
 cdef nvFatbinResult nvFatbinVersion(unsigned int* major, unsigned int* minor) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, size_t size) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
 cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index 98a2c63412..7cd77c2d75 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -11,6 +11,10 @@ from ._internal cimport nvfatbin as _nvfatbin
 # Wrapper functions
 ###############################################################################
 
+cdef const char* nvFatbinGetErrorString(nvFatbinResult result) except?NULL nogil:
+    return _nvfatbin._nvFatbinGetErrorString(result)
+
+
 cdef nvFatbinResult nvFatbinCreate(nvFatbinHandle* handle_indirect, const char** options, size_t optionsCount) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     return _nvfatbin._nvFatbinCreate(handle_indirect, options, optionsCount)
 
@@ -49,9 +53,3 @@ cdef nvFatbinResult nvFatbinAddReloc(nvFatbinHandle handle, const void* code, si
 
 cdef nvFatbinResult nvFatbinAddTileIR(nvFatbinHandle handle, const void* code, size_t size, const char* identifier, const char* optionsCmdLine) except?_NVFATBINRESULT_INTERNAL_LOADING_ERROR nogil:
     return _nvfatbin._nvFatbinAddTileIR(handle, code, size, identifier, optionsCmdLine)
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index ae987e3a7d..545d30dc0e 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -36,9 +36,3 @@ cpdef get(intptr_t handle, buffer)
 cpdef tuple version()
 cpdef add_reloc(intptr_t handle, code, size_t size)
 cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_line)
-
-
-
-
-
-
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 527b7846b0..701d2dd502 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -81,6 +81,25 @@ cpdef destroy(intptr_t handle):
     check_status(status)
 
 
+cpdef str get_error_string(int result):
+    """nvFatbinGetErrorString returns an error description string for each error code.
+
+    Args:
+        result (Result): error code.
+
+    .. seealso:: `nvFatbinGetErrorString`
+    """
+    cdef char* _output_
+    cdef bytes _output_bytes_
+    _output_ = nvFatbinGetErrorString(<_Result>result)
+
+    if _output_ == NULL:
+        return ""
+
+    _output_bytes_ = <bytes>_output_
+    return _output_bytes_.decode()
+
+
 cpdef intptr_t create(options, size_t options_count) except -1:
     """nvFatbinCreate creates a new handle.
 
@@ -288,9 +307,3 @@ cpdef add_tile_ir(intptr_t handle, code, size_t size, identifier, options_cmd_li
     with nogil:
         __status__ = nvFatbinAddTileIR(<Handle>handle, <const void*>_code_, size, <const char*>_identifier_, <const char*>_options_cmd_line_)
     check_status(__status__)
-
-
-
-
-
-
diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index ae29b3c5ae..440305d51b 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -1,11 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-import subprocess
-
-from cuda.bindings import nvfatbin, nvrtc
 
 import pytest
+from cuda.bindings import nvfatbin, nvrtc
 
 ARCHITECTURES = ["sm_75", "sm_80", "sm_90", "sm_100"]
 PTX_VERSIONS = ["6.4", "7.0", "8.5", "8.8"]
@@ -47,18 +45,22 @@
 }
 """
 
+
 @pytest.fixture(params=ARCHITECTURES)
 def arch(request):
     return request.param
 
+
 @pytest.fixture(params=PTX_VERSIONS)
 def ptx_version(request):
     return request.param
 
+
 @pytest.fixture
 def PTX(arch, ptx_version):
     return PTX_TEMPLATE.format(PTX_VERSION=ptx_version, ARCH=arch)
 
+
 @pytest.fixture
 def CUBIN(arch):
     def CHECK_NVRTC(err):
@@ -78,10 +80,12 @@ def CHECK_NVRTC(err):
     CHECK_NVRTC(err)
     return cubin
 
+
 # create a valid LTOIR input for testing
 @pytest.fixture
 def LTOIR(arch):
     arch = arch.replace("sm", "compute")
+
     def CHECK_NVRTC(err):
         if err != nvrtc.nvrtcResult.NVRTC_SUCCESS:
             raise RuntimeError(repr(err))
@@ -100,17 +104,28 @@ def CHECK_NVRTC(err):
     CHECK_NVRTC(err)
     return empty_kernel_ltoir
 
-@pytest.fixture
-def OBJECT(arch, tmpdir):
-    empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
-    with open(tmpdir / "object.cu", "w") as f:
-        f.write(empty_cplusplus_kernel)
 
-    subprocess.check_output(["nvcc", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")])
-    with open(tmpdir / "object.o", "rb") as f:
-        object = f.read()
+# @pytest.fixture
+# def OBJECT(arch, tmpdir):
+#     empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
+#     with open(tmpdir / "object.cu", "w") as f:
+#         f.write(empty_cplusplus_kernel)
+
+#     subprocess.check_output(["nvcc", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")])
+#     with open(tmpdir / "object.o", "rb") as f:
+#         object = f.read()
+
+#     return object
+
 
-    return object
+@pytest.mark.parametrize("error_enum", nvfatbin.Result)
+def test_get_error_string(error_enum):
+    es = nvfatbin.get_error_string(error_enum)
+
+    if error_enum is nvfatbin.Result.SUCCESS:
+        assert es == ""
+    else:
+        assert "error" in es
 
 
 def test_nvfatbin_get_version():
@@ -118,13 +133,15 @@ def test_nvfatbin_get_version():
     assert major is not None
     assert minor is not None
 
+
 def test_nvfatbin_empty_create_and_destroy():
     handle = nvfatbin.create([], 0)
     assert handle is not None
     nvfatbin.destroy(handle)
 
+
 def test_nvfatbin_invalid_input_create():
-    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_UNRECOGNIZED_OPTION"):
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_UNRECOGNIZED_OPTION"):
         nvfatbin.create(["--unsupported_option"], 1)
 
 
@@ -153,7 +170,7 @@ def test_nvfatbin_add_ptx(PTX, arch):
 @pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
 def test_nvfatbin_add_cubin_ELF_SIZE_MISMATCH(CUBIN, arch):
     handle = nvfatbin.create([], 0)
-    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
         nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
 
     nvfatbin.destroy(handle)
@@ -174,7 +191,7 @@ def test_nvfatbin_add_cubin(CUBIN, arch):
 @pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
 def test_nvfatbin_add_cubin_ELF_ARCH_MISMATCH(CUBIN, arch):
     handle = nvfatbin.create([], 0)
-    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
         nvfatbin.add_cubin(handle, CUBIN, len(CUBIN), "75", "inc")
 
     nvfatbin.destroy(handle)
@@ -194,8 +211,9 @@ def test_nvdfatbin_add_ltoir(LTOIR, arch):
 
 @pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
 def test_nvdfatbin_add_ltoir_ELF_ARCH_MISMATCH(LTOIR, arch):
+    pytest.skip()
     handle = nvfatbin.create([], 0)
-    with pytest.raises(nvfatbin.nvfatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
+    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
         nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), "75", "inc", "")
 
     nvfatbin.destroy(handle)
@@ -208,4 +226,4 @@ def test_nvfatbin_add_reloc(OBJECT):
     buffer = bytearray(nvfatbin.size(handle))
 
     nvfatbin.get(handle, buffer)
-    nvfatbin.destroy(handle)
\ No newline at end of file
+    nvfatbin.destroy(handle)

From bd868bdf7f523dbf6e52ff234966831844aa6032 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 14 Jan 2026 12:20:20 -0800
Subject: [PATCH 07/20] enable object creation and testing

---
 cuda_bindings/tests/test_nvfatbin.py | 33 +++++++++++++++++++---------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 440305d51b..1779c8ab43 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 
+import shutil
+import subprocess
+
 import pytest
 from cuda.bindings import nvfatbin, nvrtc
 
@@ -105,17 +108,27 @@ def CHECK_NVRTC(err):
     return empty_kernel_ltoir
 
 
-# @pytest.fixture
-# def OBJECT(arch, tmpdir):
-#     empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
-#     with open(tmpdir / "object.cu", "w") as f:
-#         f.write(empty_cplusplus_kernel)
+@pytest.fixture
+def OBJECT(arch, tmpdir):
+    empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
+    with open(tmpdir / "object.cu", "w") as f:
+        f.write(empty_cplusplus_kernel)
+
+    nvcc = shutil.which("nvcc")
+    if nvcc is None:
+        pytest.skip("nvcc not found on PATH")
 
-#     subprocess.check_output(["nvcc", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")])
-#     with open(tmpdir / "object.o", "rb") as f:
-#         object = f.read()
+    # This is a test fixture that intentionally invokes a trusted tool (`nvcc`) to
+    # compile a temporary CUDA translation unit.
+    subprocess.run(  # noqa: S603
+        [nvcc, "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")],
+        check=True,
+        capture_output=True,
+    )
+    with open(tmpdir / "object.o", "rb") as f:
+        object = f.read()
 
-#     return object
+    return object
 
 
 @pytest.mark.parametrize("error_enum", nvfatbin.Result)
@@ -125,7 +138,7 @@ def test_get_error_string(error_enum):
     if error_enum is nvfatbin.Result.SUCCESS:
         assert es == ""
     else:
-        assert "error" in es
+        assert es != ""
 
 
 def test_nvfatbin_get_version():

From d11cf026d583e0e4255b1a94a45e3108fc1c27f1 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 14 Jan 2026 16:38:02 -0800
Subject: [PATCH 08/20] remove the LTOIR mismatching arch failure test

---
 cuda_bindings/tests/test_nvfatbin.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 1779c8ab43..07118032f4 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -48,6 +48,14 @@
 }
 """
 
+TILEIR = """
+cuda_tile.module @hello_world_module {
+    entry @hello_world_kernel() {
+        print "Hello World!\n"
+    }
+}
+"""
+
 
 @pytest.fixture(params=ARCHITECTURES)
 def arch(request):
@@ -222,19 +230,20 @@ def test_nvdfatbin_add_ltoir(LTOIR, arch):
     nvfatbin.destroy(handle)
 
 
-@pytest.mark.parametrize("arch", ["sm_80"], indirect=True)
-def test_nvdfatbin_add_ltoir_ELF_ARCH_MISMATCH(LTOIR, arch):
-    pytest.skip()
+def test_nvfatbin_add_reloc(OBJECT):
     handle = nvfatbin.create([], 0)
-    with pytest.raises(nvfatbin.nvFatbinError, match="ERROR_ELF_ARCH_MISMATCH"):
-        nvfatbin.add_ltoir(handle, LTOIR, len(LTOIR), "75", "inc", "")
+    nvfatbin.add_reloc(handle, OBJECT, len(OBJECT))
+
+    buffer = bytearray(nvfatbin.size(handle))
 
+    nvfatbin.get(handle, buffer)
     nvfatbin.destroy(handle)
 
 
-def test_nvfatbin_add_reloc(OBJECT):
+def test_nvfatbin_add_tile_ir():
+    pytest.skip()
     handle = nvfatbin.create([], 0)
-    nvfatbin.add_reloc(handle, OBJECT, len(OBJECT))
+    nvfatbin.add_tile_ir(handle, TILEIR.encode(), len(TILEIR), "hello_world_module", "")
 
     buffer = bytearray(nvfatbin.size(handle))
 

From d995809e4111f412e1e6d0068af846f8227be259 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 14 Jan 2026 20:49:47 -0800
Subject: [PATCH 09/20] encode a legal tileIR into test

---
 cuda_bindings/tests/test_nvfatbin.py | 57 +++++++++++++++++++++++-----
 1 file changed, 47 insertions(+), 10 deletions(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 07118032f4..826279d832 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 
+import base64
 import shutil
 import subprocess
 
@@ -48,13 +49,44 @@
 }
 """
 
-TILEIR = """
-cuda_tile.module @hello_world_module {
-    entry @hello_world_kernel() {
-        print "Hello World!\n"
-    }
-}
-"""
+# This is the base64 encoded TileIR from sample VectorAddition program
+TILEIR_b64 = (
+    "f1RpbGVJUgANAQAAgssDCAECBgYBCwEECgC/A0QHEAUAEAUBBgQIEAAABgUMAQABBgUIEAAVBgUMAQAC"
+    "BgUMAQADBgUIBAAYBgQIEAAFBgUMAQAGBgUIEAAbBgUMAQAHBgUMAQAIBgUIBAAeBgQIEAAKBgUMAQAL"
+    "BgUIEAAhBgUMAQAMBgUMAQANBgUIBAAkMAUFBTAFBQVOBQAmEjoIWwgsAwgALi1OBQAqEzoJWwgwCwky"
+    "AwkAMzFbCi9bCzQlDQE1Cw43JQ8BFlsNOQsOOg8QAgA4OyUPARlbDT0LDj5ODgA4PyUOATYlDwEXWw1C"
+    "Cw5DDxACAEFEBBA8RQMOAEBBWxEUCxJIURJJRxATAlsUSwsVTD0VBxwASkZNESUNATULDlAlDwEcWw1S"
+    "Cw5TDxACAFFUJQ8BH1sNVgsOV04OAFFYJQ4BNiUPAR1bDVsLDlwPEAIAWl0EEFVeAw4AWVpbERoLEmFR"
+    "EmJgEBMCWxRkCxVlPRUHHABjX2YRAhUAAE5nJQ0BNQsOaiUPASJbDWwLDm0PEAIAa24lDwElWw1wCw5x"
+    "Tg4Aa3IlDgE2JQ8BI1sNdQsOdg8QAgB0dwQQb3gDDgBzdFsRIAsSe1ESfHplBwwAfWl5EVwAAIQvCMvL"
+    "A8vLy8vLy8sAAAAAAAAAAAUAAAAAAAAACgAAAAAAAAAEAQAAAAQABAAABAAAAACD/QcIy8vLy8sBy8vL"
+    "AAAAAGrLy8vLy8vLBAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAA"
+    "BAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAA"
+    "AAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAA"
+    "BAAAAAAAAAAFAAAAAAAAAAYAAAAAAAAABwAAAAAAAAAIAAAAAAAAAAcAAAAAAAAABwAAAAAAAAAJAAAA"
+    "AAAAAAoAAAAAAAAACQAAAAAAAAAJAAAAAAAAAAkAAAAAAAAACwAAAAAAAAAMAAAAAAAAAA0AAAAAAAAA"
+    "DQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAA"
+    "AAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAA"
+    "DQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAA"
+    "AAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAA"
+    "DgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAA"
+    "AAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAA"
+    "DgAAAAAAAAAOAAAAAAAAAA8AAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAA"
+    "AAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAA"
+    "EAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAA"
+    "AAAAABAAAAAAAAAABAAAAAAAAAAQy8vLAAAAAAMAAAAFAAAADAAAABEAAAAXAAAAHQAAACMAAAApAAAA"
+    "LwAAADUAAAA7AAAAQQAAAEcAAABNAAAAUwAAAAIAAQEBBQF9AgICfQQDA34ABAMDkQEMBAMDkgEMBAMD"
+    "lQEIBAMDlQEZBAMDlgEIBAMDlgEZBAMDmgEIBAMDmwEIBAMDnwENBAMDoAENBAMDowEPBAMDpwEEhcQC"
+    "BMvLyxbLy8sAAAAAAQAAAAIAAAADAAAABQAAAAgAAAALAAAAHwAAACAAAAArAAAANgAAAEkAAABcAAAA"
+    "XQAAAHAAAACDAAAAhgAAAJkAAACsAAAAvwAAAMIAAADVAAAAAAMHDAINAwANAQAQEQQFBQUFBAUFBQUE"
+    "BQUFBQUFABENAQEBAAAAAAAAAA0BAQAEAAAAAAAADQECAQAAAAAAAAABAAAAAAAAAA0BAgEAAAAAAAAA"
+    "AAQAAAAAAAAEDQwCAQAAAAAAAAABAAAAAAAAAA0MAgEAAAAAAAAAAAQAAAAAAAANDAANAAIBAAAAAAAA"
+    "AAAEAAAAAAAADQMCAQAAAAAAAAABAAAAAAAAAA0DAgEAAAAAAAAAAAQAAAAAAAANAgANAgIBAAAAAAAA"
+    "AAEAAAAAAAAADQICAQAAAAAAAAAABAAAAAAAAIGxAQQFy8vLAAAAABEAAAA9AAAAVQAAAJMAAABWZWN0"
+    "b3JBZGRpdGlvbi5weS9sb2NhbGhvbWUvbG9jYWwtd2FuZ20vY3V0aWxlLXB5dGhvbi9zYW1wbGVzdmVj"
+    "X2FkZF9rZXJuZWxfMmRfZ2F0aGVyL2xvY2FsaG9tZS9sb2NhbC13YW5nbS9jdXRpbGUtcHl0aG9uL3Nh"
+    "bXBsZXMvVmVjdG9yQWRkaXRpb24ucHlzbV8xMjAA"
+)
 
 
 @pytest.fixture(params=ARCHITECTURES)
@@ -139,6 +171,12 @@ def OBJECT(arch, tmpdir):
     return object
 
 
+@pytest.fixture
+def TILEIR(tmpdir):
+    binary_data = base64.b64decode(TILEIR_b64)
+    return binary_data
+
+
 @pytest.mark.parametrize("error_enum", nvfatbin.Result)
 def test_get_error_string(error_enum):
     es = nvfatbin.get_error_string(error_enum)
@@ -240,10 +278,9 @@ def test_nvfatbin_add_reloc(OBJECT):
     nvfatbin.destroy(handle)
 
 
-def test_nvfatbin_add_tile_ir():
-    pytest.skip()
+def test_nvfatbin_add_tile_ir(TILEIR):
     handle = nvfatbin.create([], 0)
-    nvfatbin.add_tile_ir(handle, TILEIR.encode(), len(TILEIR), "hello_world_module", "")
+    nvfatbin.add_tile_ir(handle, TILEIR, len(TILEIR), "VectorAdd", "")
 
     buffer = bytearray(nvfatbin.size(handle))
 

From 8e75d8acc73b66bb2c3fa0c80f9b35128d2a9f00 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Wed, 14 Jan 2026 21:29:01 -0800
Subject: [PATCH 10/20] update the license year

---
 cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd         | 2 +-
 cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx   | 2 +-
 cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx | 2 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pxd                 | 2 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pyx                 | 2 +-
 cuda_bindings/cuda/bindings/nvfatbin.pxd                   | 2 +-
 cuda_bindings/cuda/bindings/nvfatbin.pyx                   | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index 843759b32c..9ed97e016b 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index b50efe02b7..6305e49f8b 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index e40777a7e4..1592b752a9 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index a01a3ea1d9..b75f866ae4 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index 7cd77c2d75..028440a444 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index 545d30dc0e..d21cca3508 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 701d2dd502..92b571a26f 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #

From 22bf2ce3565d67d111687c3c6f6f043188012225 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 12:13:02 -0800
Subject: [PATCH 11/20] add an embedded tile program for re-generation

---
 cuda_bindings/tests/test_nvfatbin.py | 59 ++++++++-------------
 toolshed/dump_cutile_b64.py          | 79 ++++++++++++++++++++++++++++
 2 files changed, 101 insertions(+), 37 deletions(-)
 create mode 100644 toolshed/dump_cutile_b64.py

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 826279d832..89fc3a1b32 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -3,6 +3,7 @@
 
 
 import base64
+import binascii
 import shutil
 import subprocess
 
@@ -49,43 +50,21 @@
 }
 """
 
-# This is the base64 encoded TileIR from sample VectorAddition program
+# Base64 encoded TileIR generated by the toolshed/dump_cutile_b64.py script.
 TILEIR_b64 = (
-    "f1RpbGVJUgANAQAAgssDCAECBgYBCwEECgC/A0QHEAUAEAUBBgQIEAAABgUMAQABBgUIEAAVBgUMAQAC"
-    "BgUMAQADBgUIBAAYBgQIEAAFBgUMAQAGBgUIEAAbBgUMAQAHBgUMAQAIBgUIBAAeBgQIEAAKBgUMAQAL"
-    "BgUIEAAhBgUMAQAMBgUMAQANBgUIBAAkMAUFBTAFBQVOBQAmEjoIWwgsAwgALi1OBQAqEzoJWwgwCwky"
-    "AwkAMzFbCi9bCzQlDQE1Cw43JQ8BFlsNOQsOOg8QAgA4OyUPARlbDT0LDj5ODgA4PyUOATYlDwEXWw1C"
-    "Cw5DDxACAEFEBBA8RQMOAEBBWxEUCxJIURJJRxATAlsUSwsVTD0VBxwASkZNESUNATULDlAlDwEcWw1S"
-    "Cw5TDxACAFFUJQ8BH1sNVgsOV04OAFFYJQ4BNiUPAR1bDVsLDlwPEAIAWl0EEFVeAw4AWVpbERoLEmFR"
-    "EmJgEBMCWxRkCxVlPRUHHABjX2YRAhUAAE5nJQ0BNQsOaiUPASJbDWwLDm0PEAIAa24lDwElWw1wCw5x"
-    "Tg4Aa3IlDgE2JQ8BI1sNdQsOdg8QAgB0dwQQb3gDDgBzdFsRIAsSe1ESfHplBwwAfWl5EVwAAIQvCMvL"
-    "A8vLy8vLy8sAAAAAAAAAAAUAAAAAAAAACgAAAAAAAAAEAQAAAAQABAAABAAAAACD/QcIy8vLy8sBy8vL"
-    "AAAAAGrLy8vLy8vLBAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAA"
-    "BAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAA"
-    "AAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAA"
-    "BAAAAAAAAAAFAAAAAAAAAAYAAAAAAAAABwAAAAAAAAAIAAAAAAAAAAcAAAAAAAAABwAAAAAAAAAJAAAA"
-    "AAAAAAoAAAAAAAAACQAAAAAAAAAJAAAAAAAAAAkAAAAAAAAACwAAAAAAAAAMAAAAAAAAAA0AAAAAAAAA"
-    "DQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAA"
-    "AAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAA"
-    "DQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAAAAAAAA0AAAAAAAAADQAAAAAAAAANAAAA"
-    "AAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAA"
-    "DgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAA"
-    "AAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAADgAAAAAAAAAOAAAAAAAAAA4AAAAAAAAA"
-    "DgAAAAAAAAAOAAAAAAAAAA8AAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAA"
-    "AAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAA"
-    "EAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAA"
-    "AAAAABAAAAAAAAAABAAAAAAAAAAQy8vLAAAAAAMAAAAFAAAADAAAABEAAAAXAAAAHQAAACMAAAApAAAA"
-    "LwAAADUAAAA7AAAAQQAAAEcAAABNAAAAUwAAAAIAAQEBBQF9AgICfQQDA34ABAMDkQEMBAMDkgEMBAMD"
-    "lQEIBAMDlQEZBAMDlgEIBAMDlgEZBAMDmgEIBAMDmwEIBAMDnwENBAMDoAENBAMDowEPBAMDpwEEhcQC"
-    "BMvLyxbLy8sAAAAAAQAAAAIAAAADAAAABQAAAAgAAAALAAAAHwAAACAAAAArAAAANgAAAEkAAABcAAAA"
-    "XQAAAHAAAACDAAAAhgAAAJkAAACsAAAAvwAAAMIAAADVAAAAAAMHDAINAwANAQAQEQQFBQUFBAUFBQUE"
-    "BQUFBQUFABENAQEBAAAAAAAAAA0BAQAEAAAAAAAADQECAQAAAAAAAAABAAAAAAAAAA0BAgEAAAAAAAAA"
-    "AAQAAAAAAAAEDQwCAQAAAAAAAAABAAAAAAAAAA0MAgEAAAAAAAAAAAQAAAAAAAANDAANAAIBAAAAAAAA"
-    "AAAEAAAAAAAADQMCAQAAAAAAAAABAAAAAAAAAA0DAgEAAAAAAAAAAAQAAAAAAAANAgANAgIBAAAAAAAA"
-    "AAEAAAAAAAAADQICAQAAAAAAAAAABAAAAAAAAIGxAQQFy8vLAAAAABEAAAA9AAAAVQAAAJMAAABWZWN0"
-    "b3JBZGRpdGlvbi5weS9sb2NhbGhvbWUvbG9jYWwtd2FuZ20vY3V0aWxlLXB5dGhvbi9zYW1wbGVzdmVj"
-    "X2FkZF9rZXJuZWxfMmRfZ2F0aGVyL2xvY2FsaG9tZS9sb2NhbC13YW5nbS9jdXRpbGUtcHl0aG9uL3Nh"
-    "bXBsZXMvVmVjdG9yQWRkaXRpb24ucHlzbV8xMjAA"
+    "f1RpbGVJUgANAQAAgo0BCAECBgYBCwEECgCBAUQHBgQIEAAABgUMAQABBgUIEAALQwEICgEMAAYE"
+    "CBAAAwYFDAEABAYFCBAAD0MBCA4BEAAGBAgQAAYGBQwBAAcGBQgQABNDAQgSARQAMAUFBUIJDT4C"
+    "CgcEABkBFglCCRE+AgoHBAAcARYJAgoAABodQgkVZgEHBAAfIAEWCVwAAIQICADLy8vLy8vLg5oC"
+    "CMvLy8sBy8vLAAAAABfLy8vLy8vLBAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAA"
+    "AAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAEAAAAAAAAAAQAAAAA"
+    "AAAABAAAAAAAAAAEAAAAAAAAAAUAAAAAAAAABgAAAAAAAAAGAAAAAAAAAAcAAAAAAAAABwAAAAAA"
+    "AAAIAAAAAAAAAAkAAAAAAAAACQAAAAAAAAAEAAAAAAAAAAnLy8sAAAAAAwAAAAUAAAAMAAAAEQAA"
+    "ABYAAAAbAAAAIAAAACUAAAACAAEBAQUBFwICAhcEAwMYBAQDAxkTBAMDGhEEAwMbEQQDAx0WBAMD"
+    "HgiFdATLy8sLy8vLAAAAAAEAAAACAAAAAwAAAAUAAAAIAAAACwAAABcAAAAYAAAALAAAADkAAAAA"
+    "AwcMAg0DAA0BABAJBAUFBAUFBAUFABEOAgEAAAAAAAAAgAEBAAAAAAAAAA8BEAAAAAgBAAAAAAAN"
+    "AgEQAAAAAAAAAIGIAQQFy8vLAAAAABIAAAAsAAAAPQAAAGoAAABkdW1wX2N1dGlsZV9iNjQucHkv"
+    "bG9jYWxob21lL2xvY2FsLXdhbmdtL3RveXZlY3Rvcl9hZGRfa2VybmVsL2xvY2FsaG9tZS9sb2Nh"
+    "bC13YW5nbS90b3kvZHVtcF9jdXRpbGVfYjY0LnB5c21fMTIwAA=="
 )
 
 
@@ -173,7 +152,13 @@ def OBJECT(arch, tmpdir):
 
 @pytest.fixture
 def TILEIR(tmpdir):
-    binary_data = base64.b64decode(TILEIR_b64)
+    try:
+        binary_data = base64.b64decode(TILEIR_b64)
+    except binascii.Error as e:
+        raise ValueError(
+            "Base64 encoded TileIR is corrupted. Please regenerate the TileIR"
+            "by executing the toolshed/dump_cutile_b64.py script."
+        ) from e
     return binary_data
 
 
diff --git a/toolshed/dump_cutile_b64.py b/toolshed/dump_cutile_b64.py
new file mode 100644
index 0000000000..4ce5a82a9f
--- /dev/null
+++ b/toolshed/dump_cutile_b64.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+"""
+Embeds a sample cuTile kernel, executes it with CUDA_TILE_DUMP_BYTECODE=.,
+loads the resulting .cutile file, and prints its base64-encoded content.
+"""
+
+import base64
+import glob
+import os
+import sys
+
+import cuda.tile as ct
+import cupy
+
+
+def _run_sample_cutile_kernel() -> None:
+    # Import after env var setup so CUDA_TILE_DUMP_BYTECODE is honored.
+    TILE_SIZE = 16
+
+    @ct.kernel
+    def vector_add_kernel(a, b, result):
+        block_id = ct.bid(0)
+        a_tile = ct.load(a, index=(block_id,), shape=(TILE_SIZE,))
+        b_tile = ct.load(b, index=(block_id,), shape=(TILE_SIZE,))
+
+        result_tile = a_tile + b_tile
+        ct.store(result, index=(block_id,), tile=result_tile)
+
+    a = cupy.arange(128, dtype="float32")
+    b = cupy.arange(128, dtype="float32")
+    result = cupy.zeros_like(a)
+
+    grid = (ct.cdiv(a.shape[0], TILE_SIZE), 1, 1)
+    ct.launch(cupy.cuda.get_current_stream(), grid, vector_add_kernel, (a, b, result))
+
+    cupy.cuda.get_current_stream().synchronize()
+
+    assert result[-1] == 254
+
+
+def main():
+    # CUDA_TILE_DUMP_BYTECODE=. means dump to current directory
+    os.environ["CUDA_TILE_DUMP_BYTECODE"] = "."
+
+    try:
+        _run_sample_cutile_kernel()
+    except Exception as e:
+        print(f"Sample kernel execution failed: {e}", file=sys.stderr)
+        raise
+
+    # Find the .cutile file in current directory
+    cutile_files = glob.glob("./*.cutile")
+    if not cutile_files:
+        print("No .cutile file found in current directory", file=sys.stderr)
+        sys.exit(1)
+
+    # Use the most recently modified one if multiple exist
+    cutile_path = max(cutile_files, key=os.path.getmtime)
+
+    # Read the binary content
+    with open(cutile_path, "rb") as f:
+        binary_content = f.read()
+
+    # Encode with base64 in ASCII mode
+    b64_encoded = base64.b64encode(binary_content).decode("ascii")
+
+    # Print with lines less than 79 characters, wrapped with quotes
+    line_width = 76  # 78 - 2 for the quotes on both sides
+    for i in range(0, len(b64_encoded), line_width):
+        chunk = b64_encoded[i : i + line_width]
+        print(f'"{chunk}"')
+
+
+if __name__ == "__main__":
+    main()

From cda0e2981d00ddf26e2636c91d42f9a145353da6 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 12:23:06 -0800
Subject: [PATCH 12/20] license date: 2026

---
 cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd         | 2 +-
 cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx   | 2 +-
 cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx | 2 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pxd                 | 2 +-
 cuda_bindings/cuda/bindings/cynvfatbin.pyx                 | 2 +-
 cuda_bindings/cuda/bindings/nvfatbin.pxd                   | 2 +-
 cuda_bindings/cuda/bindings/nvfatbin.pyx                   | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
index 9ed97e016b..9f3b187560 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
index 6305e49f8b..1e1e8d9a12 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_linux.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
index 1592b752a9..b0b3f94e5a 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvfatbin_windows.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pxd b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
index b75f866ae4..5969fafee1 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/cynvfatbin.pyx b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
index 028440a444..f0f8300cb0 100644
--- a/cuda_bindings/cuda/bindings/cynvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/cynvfatbin.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pxd b/cuda_bindings/cuda/bindings/nvfatbin.pxd
index d21cca3508..e0744efbd7 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pxd
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pxd
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 92b571a26f..0e6c95c893 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #

From 2e53f4689289a19dd86101bafd2f0ae72052c314 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 13:20:28 -0800
Subject: [PATCH 13/20] cdef const char *

---
 cuda_bindings/cuda/bindings/nvfatbin.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/cuda/bindings/nvfatbin.pyx b/cuda_bindings/cuda/bindings/nvfatbin.pyx
index 0e6c95c893..32cdeb300b 100644
--- a/cuda_bindings/cuda/bindings/nvfatbin.pyx
+++ b/cuda_bindings/cuda/bindings/nvfatbin.pyx
@@ -89,7 +89,7 @@ cpdef str get_error_string(int result):
 
     .. seealso:: `nvFatbinGetErrorString`
     """
-    cdef char* _output_
+    cdef const char* _output_
     cdef bytes _output_bytes_
     _output_ = nvFatbinGetErrorString(<_Result>result)
 

From d7dfdb81f53bdbb7116577a3eba90174fee2b09a Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 13:32:11 -0800
Subject: [PATCH 14/20] add dependency for CI

---
 .github/actions/fetch_ctk/action.yml | 2 +-
 cuda_bindings/pyproject.toml         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/actions/fetch_ctk/action.yml b/.github/actions/fetch_ctk/action.yml
index 559c6d5a8b..001e3a84d8 100644
--- a/.github/actions/fetch_ctk/action.yml
+++ b/.github/actions/fetch_ctk/action.yml
@@ -14,7 +14,7 @@ inputs:
   cuda-components:
     description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
     required: false
-    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile"
+    default: "cuda_nvcc,cuda_cudart,cuda_crt,libnvvm,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink,libcufile,libnvfatbin"
   cuda-path:
     description: "where the CTK components will be installed to, relative to $PWD"
     required: false
diff --git a/cuda_bindings/pyproject.toml b/cuda_bindings/pyproject.toml
index 7c4bddb434..614f7bb63a 100644
--- a/cuda_bindings/pyproject.toml
+++ b/cuda_bindings/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = ["cuda-pathfinder ~=1.1"]
 
 [project.optional-dependencies]
 all = [
-    "cuda-toolkit[nvrtc,nvjitlink,nvvm]==13.*",
+    "cuda-toolkit[nvrtc,nvjitlink,nvvm,nvfatbin]==13.*",
     "cuda-toolkit[cufile]==13.*; sys_platform == 'linux'",
 ]
 

From ff8f6a2f804505ea9c1353d0134d321e71ea9c92 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 13:48:52 -0800
Subject: [PATCH 15/20] add TODO to locate nvcc file

---
 cuda_bindings/tests/test_nvfatbin.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 89fc3a1b32..f1a6cbda47 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -133,6 +133,7 @@ def OBJECT(arch, tmpdir):
     with open(tmpdir / "object.cu", "w") as f:
         f.write(empty_cplusplus_kernel)
 
+    # TODO: Use cuda-pathfinder to locate nvcc on system.
     nvcc = shutil.which("nvcc")
     if nvcc is None:
         pytest.skip("nvcc not found on PATH")

From 965c53b984d6f7575d1da01f7621966bf54f6938 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 13:52:40 -0800
Subject: [PATCH 16/20] add documentation and release notes

---
 cuda_bindings/docs/source/api.rst                  | 1 +
 cuda_bindings/docs/source/release/13.1.X-notes.rst | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/cuda_bindings/docs/source/api.rst b/cuda_bindings/docs/source/api.rst
index 4277bc745b..e6ee4b99dd 100644
--- a/cuda_bindings/docs/source/api.rst
+++ b/cuda_bindings/docs/source/api.rst
@@ -14,5 +14,6 @@ CUDA Python API Reference
    module/nvrtc
    module/nvjitlink
    module/nvvm
+   module/nvfatbin
    module/cufile
    module/utils
diff --git a/cuda_bindings/docs/source/release/13.1.X-notes.rst b/cuda_bindings/docs/source/release/13.1.X-notes.rst
index 21323682b1..92a3f8dbcb 100644
--- a/cuda_bindings/docs/source/release/13.1.X-notes.rst
+++ b/cuda_bindings/docs/source/release/13.1.X-notes.rst
@@ -9,6 +9,8 @@
 Highlights
 ----------
 
+* Add ``nvFatbin`` bindings. (PR #1467 <https://github.com/NVIDIA/cuda-python/pull/1467>_)
+
 Experimental
 ------------
 

From 2b54279649b02e676492236264b714596fbc8285 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 15:00:49 -0800
Subject: [PATCH 17/20] skip tileIR test for <13.1 installation

---
 cuda_bindings/tests/test_nvfatbin.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index f1a6cbda47..5a3b00df6b 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -68,6 +68,10 @@
 )
 
 
+def get_version() -> tuple[int, int]:
+    return nvfatbin.version()
+
+
 @pytest.fixture(params=ARCHITECTURES)
 def arch(request):
     return request.param
@@ -264,6 +268,7 @@ def test_nvfatbin_add_reloc(OBJECT):
     nvfatbin.destroy(handle)
 
 
+@pytest.skipIf(get_version() < (13, 1), reason="TileIR API is not supported in CUDA < 13.1")
 def test_nvfatbin_add_tile_ir(TILEIR):
     handle = nvfatbin.create([], 0)
     nvfatbin.add_tile_ir(handle, TILEIR, len(TILEIR), "VectorAdd", "")

From 4baeecda2e2808a486bff06f4cfe63413f445d74 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 15:36:48 -0800
Subject: [PATCH 18/20] add an nvcc smoke test to better determine compiler
 usability

---
 cuda_bindings/tests/test_nvfatbin.py | 64 +++++++++++++++++++++++-----
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index 5a3b00df6b..a4535b8045 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -87,6 +87,39 @@ def PTX(arch, ptx_version):
     return PTX_TEMPLATE.format(PTX_VERSION=ptx_version, ARCH=arch)
 
 
+@pytest.fixture
+def nvcc_smoke(tmpdir) -> str:
+    # TODO: Use cuda-pathfinder to locate nvcc on system.
+    nvcc = shutil.which("nvcc")
+    if nvcc is None:
+        pytest.skip("nvcc not found on PATH")
+
+    # Smoke test: make sure nvcc is actually usable (toolkit + host compiler are set up),
+    # not merely present on PATH.
+    src = tmpdir / "nvcc_smoke.cu"
+    out = tmpdir / "nvcc_smoke.o"
+    with open(src, "w") as f:
+        f.write("")
+    try:
+        subprocess.run(  # noqa: S603
+            [nvcc, "-c", str(src), "-o", str(out)],
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        stdout = (e.stdout or b"").decode(errors="replace")
+        stderr = (e.stderr or b"").decode(errors="replace")
+        pytest.skip(
+            "nvcc found on PATH but failed to compile a trivial input.\n"
+            f"command: {[nvcc, '-c', str(src), '-o', str(out)]!r}\n"
+            f"exit_code: {e.returncode}\n"
+            f"stdout:\n{stdout}\n"
+            f"stderr:\n{stderr}\n"
+        )
+
+    return nvcc
+
+
 @pytest.fixture
 def CUBIN(arch):
     def CHECK_NVRTC(err):
@@ -132,23 +165,32 @@ def CHECK_NVRTC(err):
 
 
 @pytest.fixture
-def OBJECT(arch, tmpdir):
-    empty_cplusplus_kernel = "__global__ void A() {} int main() { return 0; }"
+def OBJECT(arch, tmpdir, nvcc_smoke):
+    empty_cplusplus_kernel = "__global__ void A() {}"
     with open(tmpdir / "object.cu", "w") as f:
         f.write(empty_cplusplus_kernel)
 
-    # TODO: Use cuda-pathfinder to locate nvcc on system.
-    nvcc = shutil.which("nvcc")
-    if nvcc is None:
-        pytest.skip("nvcc not found on PATH")
+    nvcc = nvcc_smoke
 
     # This is a test fixture that intentionally invokes a trusted tool (`nvcc`) to
     # compile a temporary CUDA translation unit.
-    subprocess.run(  # noqa: S603
-        [nvcc, "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")],
-        check=True,
-        capture_output=True,
-    )
+    cmd = [nvcc, "-c", "-arch", arch, "-o", str(tmpdir / "object.o"), str(tmpdir / "object.cu")]
+    try:
+        subprocess.run(  # noqa: S603
+            cmd,
+            check=True,
+            capture_output=True,
+        )
+    except subprocess.CalledProcessError as e:
+        stdout = (e.stdout or b"").decode(errors="replace")
+        stderr = (e.stderr or b"").decode(errors="replace")
+        raise RuntimeError(
+            "nvcc smoke test passed, but nvcc failed while compiling the test object.\n"
+            f"command: {cmd!r}\n"
+            f"exit_code: {e.returncode}\n"
+            f"stdout:\n{stdout}\n"
+            f"stderr:\n{stderr}\n"
+        ) from e
     with open(tmpdir / "object.o", "rb") as f:
         object = f.read()
 

From 4c2f3f372f5f85760b3d6a72050dc4421eaab8da Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 16:49:24 -0800
Subject: [PATCH 19/20] fix typo

---
 cuda_bindings/tests/test_nvfatbin.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cuda_bindings/tests/test_nvfatbin.py b/cuda_bindings/tests/test_nvfatbin.py
index a4535b8045..c3b25db2f2 100644
--- a/cuda_bindings/tests/test_nvfatbin.py
+++ b/cuda_bindings/tests/test_nvfatbin.py
@@ -310,7 +310,7 @@ def test_nvfatbin_add_reloc(OBJECT):
     nvfatbin.destroy(handle)
 
 
-@pytest.skipIf(get_version() < (13, 1), reason="TileIR API is not supported in CUDA < 13.1")
+@pytest.mark.skipif(get_version() < (13, 1), reason="TileIR API is not supported in CUDA < 13.1")
 def test_nvfatbin_add_tile_ir(TILEIR):
     handle = nvfatbin.create([], 0)
     nvfatbin.add_tile_ir(handle, TILEIR, len(TILEIR), "VectorAdd", "")

From 3e071e27aed4676d397a17198238bc5d14c6e3e4 Mon Sep 17 00:00:00 2001
From: Michael Wang <isVoid@users.noreply.github.com>
Date: Thu, 15 Jan 2026 22:42:16 -0800
Subject: [PATCH 20/20] add nvfatbin.rst

---
 cuda_bindings/docs/source/module/nvfatbin.rst | 89 +++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 cuda_bindings/docs/source/module/nvfatbin.rst

diff --git a/cuda_bindings/docs/source/module/nvfatbin.rst b/cuda_bindings/docs/source/module/nvfatbin.rst
new file mode 100644
index 0000000000..297d4baa85
--- /dev/null
+++ b/cuda_bindings/docs/source/module/nvfatbin.rst
@@ -0,0 +1,89 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+.. default-role:: cpp:any
+
+nvfatbin
+========
+
+Note
+----
+
+The nvfatbin bindings are not supported on nvFatbin installations <12.4. Ensure the installed CUDA toolkit's nvFatbin version is >=12.4.
+
+The Tile IR API (:func:`cuda.bindings.nvfatbin.add_tile_ir`) is only available in CUDA 13.1+.
+
+Functions
+---------
+
+NvFatbin defines the following functions for creating and populating fatbinaries.
+
+.. autofunction:: cuda.bindings.nvfatbin.create
+.. autofunction:: cuda.bindings.nvfatbin.destroy
+.. autofunction:: cuda.bindings.nvfatbin.add_ptx
+.. autofunction:: cuda.bindings.nvfatbin.add_cubin
+.. autofunction:: cuda.bindings.nvfatbin.add_ltoir
+.. autofunction:: cuda.bindings.nvfatbin.add_reloc
+.. autofunction:: cuda.bindings.nvfatbin.add_tile_ir
+.. autofunction:: cuda.bindings.nvfatbin.size
+.. autofunction:: cuda.bindings.nvfatbin.get
+.. autofunction:: cuda.bindings.nvfatbin.get_error_string
+.. autofunction:: cuda.bindings.nvfatbin.version
+
+Types
+---------
+.. autoclass:: cuda.bindings.nvfatbin.Result
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.SUCCESS
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INTERNAL
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_ELF_ARCH_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_ELF_SIZE_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_PTX_VERSION
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_NULL_POINTER
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_COMPRESSION_FAILED
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_COMPRESSED_SIZE_EXCEEDED
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_UNRECOGNIZED_OPTION
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_ARCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_NVVM
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_EMPTY_INPUT
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_PTX_ARCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_PTX_ARCH_MISMATCH
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_MISSING_FATBIN
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INVALID_INDEX
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_IDENTIFIER_REUSE
+
+
+    .. autoattribute:: cuda.bindings.nvfatbin.Result.ERROR_INTERNAL_PTX_OPTION
+