diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
index d7188f6c80..5281ab47ec 100644
--- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
+++ b/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx
@@ -22,6 +22,33 @@ ctypedef cpp_complex.complex[float] cpp_single_complex
 ctypedef cpp_complex.complex[double] cpp_double_complex
 
 
+# We need an identifier for fp16 for copying scalars on the host. This is a minimal
+# implementation borrowed from cuda_fp16.h.
+cdef extern from *:
+    """
+    #if __cplusplus >= 201103L
+    #define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+    #else
+    #if defined(__GNUC__)
+    #define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+    #elif defined(_MSC_VER)
+    #define __CUDA_ALIGN__(n) __declspec(align(n))
+    #else
+    #define __CUDA_ALIGN__(n)
+    #endif /* defined(__GNUC__) */
+    #endif /* __cplusplus >= 201103L */
+
+    typedef struct __CUDA_ALIGN__(2) {
+        /**
+         * Storage field contains bits representation of the \p half floating-point number.
+         */
+        unsigned short x;
+    } __half_raw;
+    """
+    ctypedef struct __half_raw:
+        unsigned short x
+
+
 ctypedef fused supported_type:
     cpp_bool
     int8_t
@@ -32,6 +59,7 @@ ctypedef fused supported_type:
     uint16_t
     uint32_t
     uint64_t
+    __half_raw
     float
     double
     intptr_t
@@ -85,6 +113,8 @@ cdef inline int prepare_arg(
         (<supported_type*>ptr)[0] = cpp_complex.complex[float](arg.real, arg.imag)
     elif supported_type is cpp_double_complex:
         (<supported_type*>ptr)[0] = cpp_complex.complex[double](arg.real, arg.imag)
+    elif supported_type is __half_raw:
+        (<__half_raw*>ptr).x = <int16_t>(arg.view(numpy_int16))
     else:
         (<supported_type*>ptr)[0] = <supported_type>(arg)
     data_addresses[idx] = ptr  # take the address to the scalar
@@ -147,8 +177,7 @@ cdef inline int prepare_numpy_arg(
     elif isinstance(arg, numpy_uint64):
         return prepare_arg[uint64_t](data, data_addresses, arg, idx)
     elif isinstance(arg, numpy_float16):
-        # use int16 as a proxy
-        return prepare_arg[int16_t](data, data_addresses, arg, idx)
+        return prepare_arg[__half_raw](data, data_addresses, arg, idx)
     elif isinstance(arg, numpy_float32):
         return prepare_arg[float](data, data_addresses, arg, idx)
     elif isinstance(arg, numpy_float64):
@@ -207,7 +236,7 @@ cdef class ParamHolder:
                 not_prepared = prepare_ctypes_arg(self.data, self.data_addresses, arg, i)
             if not_prepared:
                 # TODO: support ctypes/numpy struct
-                raise TypeError
+                raise TypeError("the argument is of unsupported type: " + str(type(arg)))
 
         self.kernel_args = kernel_args
         self.ptr = <intptr_t>self.data_addresses.data()
diff --git a/cuda_core/docs/source/release/0.3.0-notes.rst b/cuda_core/docs/source/release/0.3.0-notes.rst
index eb365b4cbf..eb52ca8400 100644
--- a/cuda_core/docs/source/release/0.3.0-notes.rst
+++ b/cuda_core/docs/source/release/0.3.0-notes.rst
@@ -30,3 +30,4 @@ Fixes and enhancements
 ----------------------
 
 - An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
+- The :func:`launch` function's handling of fp16 scalars was incorrect and is now fixed.
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 0ad6f94e0c..9c72693a1b 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -1,9 +1,15 @@
-# Copyright 2024 NVIDIA Corporation. All rights reserved.
+# Copyright 2024-2025 NVIDIA Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import ctypes
+import os
+import pathlib
+
+import numpy as np
 import pytest
 
-from cuda.core.experimental import Device, LaunchConfig, Program, launch
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental._memory import _DefaultPinnedMemorySource
 
 
 def test_launch_config_init(init_cuda):
@@ -59,3 +65,90 @@ def test_launch_invalid_values(init_cuda):
         launch(stream, ker, None)
 
     launch(stream, config, ker)
+
+
+# Parametrize: (python_type, cpp_type, init_value)
+PARAMS = (
+    (bool, "bool", True),
+    (float, "double", 2.718),
+    (np.bool, "bool", True),
+    (np.int8, "signed char", -42),
+    (np.int16, "signed short", -1234),
+    (np.int32, "signed int", -123456),
+    (np.int64, "signed long long", -123456789),
+    (np.uint8, "unsigned char", 42),
+    (np.uint16, "unsigned short", 1234),
+    (np.uint32, "unsigned int", 123456),
+    (np.uint64, "unsigned long long", 123456789),
+    (np.float32, "float", 3.14),
+    (np.float64, "double", 2.718),
+    (ctypes.c_bool, "bool", True),
+    (ctypes.c_int8, "signed char", -42),
+    (ctypes.c_int16, "signed short", -1234),
+    (ctypes.c_int32, "signed int", -123456),
+    (ctypes.c_int64, "signed long long", -123456789),
+    (ctypes.c_uint8, "unsigned char", 42),
+    (ctypes.c_uint16, "unsigned short", 1234),
+    (ctypes.c_uint32, "unsigned int", 123456),
+    (ctypes.c_uint64, "unsigned long long", 123456789),
+    (ctypes.c_float, "float", 3.14),
+    (ctypes.c_double, "double", 2.718),
+)
+if os.environ.get("CUDA_PATH"):
+    PARAMS += (
+        (np.float16, "half", 0.78),
+        (np.complex64, "cuda::std::complex<float>", 1 + 2j),
+        (np.complex128, "cuda::std::complex<double>", -3 - 4j),
+        (complex, "cuda::std::complex<double>", 5 - 7j),
+    )
+
+
+@pytest.mark.parametrize("python_type, cpp_type, init_value", PARAMS)
+@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
+def test_launch_scalar_argument(python_type, cpp_type, init_value):
+    dev = Device()
+    dev.set_current()
+
+    # Prepare pinned host array
+    mr = _DefaultPinnedMemorySource()
+    b = mr.allocate(np.dtype(python_type).itemsize)
+    arr = np.from_dlpack(b).view(python_type)
+    arr[:] = 0
+
+    # Prepare scalar argument in Python
+    scalar = python_type(init_value)
+
+    # CUDA kernel templated on type T
+    code = r"""
+    template <typename T>
+    __global__ void write_scalar(T* arr, T val) {
+        arr[0] = val;
+    }
+    """
+
+    # Compile and force instantiation for this type
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    if os.environ.get("CUDA_PATH"):
+        include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
+        code = (
+            r"""
+            #include <cuda_fp16.h>
+            #include <cuda/std/complex>
+            """
+            + code
+        )
+    else:
+        include_path = None
+    pro_opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
+    prog = Program(code, code_type="c++", options=pro_opts)
+    ker_name = f"write_scalar<{cpp_type}>"
+    mod = prog.compile("cubin", name_expressions=(ker_name,))
+    ker = mod.get_kernel(ker_name)
+
+    # Launch with 1 thread
+    config = LaunchConfig(grid=1, block=1)
+    launch(dev.default_stream, config, ker, arr.ctypes.data, scalar)
+    dev.default_stream.sync()
+
+    # Check result
+    assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"
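
A minimal standalone sketch (not part of the diff) of the fp16 path this change fixes: passing a np.float16 scalar directly to launch(). Before the fix, prepare_numpy_arg routed np.float16 through a value cast to int16 ("use int16 as a proxy"), which truncates the number instead of preserving its half-precision bit pattern; the argument is now staged bit-exactly as a __half_raw. The kernel name write_half and the use of extern "C" (to sidestep C++ name mangling so get_kernel can look the symbol up directly) are illustrative choices, and the snippet assumes CUDA_PATH points at a toolkit whose include/ directory provides cuda_fp16.h and numpy 2.1+ (mirroring the test's skipif).

    import os
    import pathlib

    import numpy as np

    from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
    from cuda.core.experimental._memory import _DefaultPinnedMemorySource

    dev = Device()
    dev.set_current()

    # One-element pinned host buffer (2 bytes) that the kernel can write to directly.
    arr = np.from_dlpack(_DefaultPinnedMemorySource().allocate(2)).view(np.float16)
    arr[:] = 0

    code = r"""
    #include <cuda_fp16.h>
    extern "C" __global__ void write_half(__half* out, __half val) { out[0] = val; }
    """

    arch = "".join(f"{i}" for i in dev.compute_capability)
    include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / "include")  # assumption: CUDA_PATH is set
    opts = ProgramOptions(std="c++11", arch=f"sm_{arch}", include_path=include_path)
    ker = Program(code, code_type="c++", options=opts).compile("cubin").get_kernel("write_half")

    # The np.float16 scalar is now bit-cast to __half_raw during argument preparation,
    # so the kernel receives the exact half-precision value.
    launch(dev.default_stream, LaunchConfig(grid=1, block=1), ker, arr.ctypes.data, np.float16(0.78))
    dev.default_stream.sync()
    assert arr[0] == np.float16(0.78)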