diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h index 3ffaa6cc660..da7312cd626 100644 --- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h +++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h @@ -391,10 +391,15 @@ template template -struct __radix_sort_onesweep_kernel +struct __radix_sort_onesweep_kernel; + +template +struct __radix_sort_onesweep_kernel<__esimd_tag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size, _InRngPack, _OutRngPack> { using _LocOffsetT = ::std::uint16_t; using _GlobOffsetT = ::std::uint32_t; + using _AtomicIdT = ::std::uint32_t; using _KeyT = typename _InRngPack::_KeyT; using _ValT = typename _InRngPack::_ValT; @@ -457,7 +462,7 @@ struct __radix_sort_onesweep_kernel _OutRngPack __out_pack; __radix_sort_onesweep_kernel(::std::uint32_t __n, ::std::uint32_t __stage, _GlobOffsetT* __p_global_hist, - _GlobOffsetT* __p_group_hists, const _InRngPack& __in_pack, + _GlobOffsetT* __p_group_hists, _AtomicIdT* /*__p_atomic_id*/, const _InRngPack& __in_pack, const _OutRngPack& __out_pack) : __n(__n), __stage(__stage), __p_global_hist(__p_global_hist), __p_group_hists(__p_group_hists), __in_pack(__in_pack), __out_pack(__out_pack) diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h index 6387e095476..0e19afa65e3 100644 --- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h +++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h @@ -86,6 +86,7 @@ class __onesweep_memory_holder // Memory to store intermediate results of sorting _KeyT* __m_keys_ptr = nullptr; _ValT* __m_vals_ptr = nullptr; + std::uint32_t* __m_atomic_id_pointer = nullptr; ::std::size_t __m_raw_mem_bytes = 0; ::std::size_t __m_keys_bytes = 0; @@ -93,13 +94,15 @@ class __onesweep_memory_holder ::std::size_t __m_global_hist_bytes = 0; ::std::size_t __m_group_hist_bytes = 0; + ::std::size_t __m_atomic_id_bytes = 4; + sycl::queue __m_q; void __calculate_raw_memory_amount() noexcept { // Extra bytes are added for potentiall padding - __m_raw_mem_bytes = __m_keys_bytes + __m_global_hist_bytes + __m_group_hist_bytes + sizeof(_KeyT); + __m_raw_mem_bytes = __m_keys_bytes + __m_global_hist_bytes + __m_group_hist_bytes + __m_atomic_id_bytes + sizeof(std::uint32_t) + sizeof(_KeyT); if constexpr (__has_values) { __m_raw_mem_bytes += (__m_vals_bytes + sizeof(_ValT)); @@ -135,6 +138,9 @@ class __onesweep_memory_holder __aligned_ptr = ::std::align(::std::alignment_of_v<_ValT>, __m_vals_bytes, __base_ptr, __remainder); __m_vals_ptr = reinterpret_cast<_ValT*>(__aligned_ptr); } + std::size_t __atomic_id_offset = __m_raw_mem_bytes - __m_atomic_id_bytes; + __atomic_id_offset -= (__atomic_id_offset % alignof(std::uint32_t)); + __m_atomic_id_pointer = reinterpret_cast(__m_raw_mem_ptr + __atomic_id_offset); } public: @@ -181,6 +187,11 @@ class __onesweep_memory_holder { return __m_group_hist_ptr; } + std::uint32_t* + __atomic_id_pointer() const noexcept + { + return __m_atomic_id_pointer; + } void __allocate() @@ -249,7 +260,7 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng // TODO: consider adding a more versatile API, e.g. passing special kernel_config parameters for histogram computation // ESIMD work-group size: 64 XVEs ~ 2048 SIMD lanes // SYCL work-group size: Programming model enables 1024, so 128 required for PVC-1550 full concurrency. 10x HW oversubscription - constexpr ::std::uint32_t __hist_work_group_count = std::is_same_v<_KtTag, __sycl_tag> ? 128 * 10 : 32; + constexpr ::std::uint32_t __hist_work_group_count = std::is_same_v<_KtTag, __sycl_tag> ? 128 * 10 : 64; constexpr ::std::uint32_t __hist_work_group_size = std::is_same_v<_KtTag, __sycl_tag> ? 1024 : 64; __event_chain = __radix_sort_histogram_submitter<__is_ascending, __radix_bits, __hist_work_group_count, __hist_work_group_size, _RadixSortHistogram>()( @@ -260,7 +271,7 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item, __work_group_size, _RadixSortSweepInitial>()( - __kt_tag, __q, __input_pack, __virt_pack1, __mem_holder.__global_hist_ptr(), __mem_holder.__group_hist_ptr(), + __kt_tag, __q, __input_pack, __virt_pack1, __mem_holder.__global_hist_ptr(), __mem_holder.__group_hist_ptr(), __mem_holder.__atomic_id_pointer(), __sweep_work_group_count, __n, 0, __event_chain); for (::std::uint32_t __stage = 1; __stage < __stage_count; __stage++) @@ -273,14 +284,14 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng { __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item, __work_group_size, _RadixSortSweepOdd>()( - __kt_tag, __q, __virt_pack1, __virt_pack2, __p_global_hist, __p_group_hists, __sweep_work_group_count, __n, + __kt_tag, __q, __virt_pack1, __virt_pack2, __p_global_hist, __p_group_hists, __mem_holder.__atomic_id_pointer(), __sweep_work_group_count, __n, __stage, __event_chain); } else { __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item, __work_group_size, _RadixSortSweepEven>()( - __kt_tag, __q, __virt_pack2, __virt_pack1, __p_global_hist, __p_group_hists, __sweep_work_group_count, __n, + __kt_tag, __q, __virt_pack2, __virt_pack1, __p_global_hist, __p_group_hists, __mem_holder.__atomic_id_pointer(), __sweep_work_group_count, __n, __stage, __event_chain); } } @@ -389,23 +400,23 @@ __radix_sort(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __pack_in, _RngPack2& else { constexpr ::std::uint32_t __one_wg_cap = __data_per_workitem * __workgroup_size; - if (__n <= __one_wg_cap) + // TODO: this is temporary in the prototype until we have a SYCL one wg version to plugin. + if constexpr (std::is_same_v<_KtTag, __esimd_tag>) { - // TODO: support different RadixBits values (only 7, 8, 9 are currently supported) - // TODO: support more granular DataPerWorkItem and WorkGroupSize - - return __one_wg<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size>( - __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n); - } - else - { - // TODO: avoid kernel duplication (generate the output storage with the same type as input storage and use swap) - // TODO: support different RadixBits - // TODO: support more granular DataPerWorkItem and WorkGroupSize - return __onesweep<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size, - __in_place>(__kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), - ::std::forward<_RngPack2>(__pack_out), __n); + if (__n <= __one_wg_cap) + { + // TODO: support different RadixBits values (only 7, 8, 9 are currently supported) + // TODO: support more granular DataPerWorkItem and WorkGroupSize + + return __one_wg<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size>( + __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n); + } } + // TODO: avoid kernel duplication (generate the output storage with the same type as input storage and use swap) + // TODO: support different RadixBits + // TODO: support more granular DataPerWorkItem and WorkGroupSize + return __onesweep<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size, __in_place>( + __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n); } } diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h index eb6d0e05783..fa51a35140f 100644 --- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h +++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h @@ -170,27 +170,80 @@ template > { - template + private: + // ESIMD kernel dispatch + template + sycl::event + __submit_esimd(sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist, + _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id, ::std::uint32_t __sweep_work_group_count, + ::std::size_t __n, ::std::uint32_t __stage, const sycl::event& __e) const + { + sycl::nd_range<1> __nd_range(__sweep_work_group_count * __work_group_size, __work_group_size); + return __q.submit([&](sycl::handler& __cgh) { + oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__keys_rng(), __out_pack.__keys_rng()); + if constexpr (::std::decay_t<_InRngPack>::__has_values) + { + oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__vals_rng(), __out_pack.__vals_rng()); + } + __cgh.depends_on(__e); + __radix_sort_onesweep_kernel<__esimd_tag, __is_ascending, __radix_bits, __data_per_work_item, + __work_group_size, ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>> + __kernel(__n, __stage, __p_global_hist, __p_group_hists, __p_atomic_id, + ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack)); + __cgh.parallel_for<_Name...>(__nd_range, __kernel); + }); + } + + // SYCL kernel dispatch + template sycl::event - operator()(_KtTag, sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist, - _GlobalHistT* __p_group_hists, ::std::uint32_t __sweep_work_group_count, ::std::size_t __n, - ::std::uint32_t __stage, const sycl::event& __e) const + __submit_sycl(sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist, + _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id, ::std::uint32_t __sweep_work_group_count, + ::std::size_t __n, ::std::uint32_t __stage, const sycl::event& __e) const { + using _KernelType = + __radix_sort_onesweep_kernel<__sycl_tag, __is_ascending, __radix_bits, __data_per_work_item, + __work_group_size, ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>>; + constexpr ::std::uint32_t __slm_size_bytes = _KernelType::__calc_slm_alloc(); + constexpr ::std::uint32_t __slm_size_elements = __slm_size_bytes / sizeof(::std::uint32_t); + sycl::nd_range<1> __nd_range(__sweep_work_group_count * __work_group_size, __work_group_size); return __q.submit([&](sycl::handler& __cgh) { + sycl::local_accessor __slm_accessor(__slm_size_bytes, __cgh); oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__keys_rng(), __out_pack.__keys_rng()); if constexpr (::std::decay_t<_InRngPack>::__has_values) { oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__vals_rng(), __out_pack.__vals_rng()); } __cgh.depends_on(__e); - __radix_sort_onesweep_kernel<_KtTag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size, - ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>> - __kernel(__n, __stage, __p_global_hist, __p_group_hists, ::std::forward<_InRngPack>(__in_pack), - ::std::forward<_OutRngPack>(__out_pack)); + _KernelType __kernel(__n, __stage, __p_global_hist, __p_group_hists, __p_atomic_id, + ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack), + __slm_accessor); __cgh.parallel_for<_Name...>(__nd_range, __kernel); }); } + + public: + template + sycl::event + operator()(_KtTag, sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, + _GlobalHistT* __p_global_hist, _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id, + ::std::uint32_t __sweep_work_group_count, ::std::size_t __n, ::std::uint32_t __stage, + const sycl::event& __e) const + { + if constexpr (std::is_same_v<_KtTag, __sycl_tag>) + { + return __submit_sycl(__q, ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack), + __p_global_hist, __p_group_hists, __p_atomic_id, __sweep_work_group_count, __n, + __stage, __e); + } + else + { + return __submit_esimd(__q, ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack), + __p_global_hist, __p_group_hists, __p_atomic_id, __sweep_work_group_count, __n, + __stage, __e); + } + } }; template diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h index efac51b2815..fa827a196c7 100644 --- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h +++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h @@ -34,8 +34,8 @@ constexpr void __check_onesweep_params() { static_assert(__radix_bits == 8); - static_assert(__data_per_workitem % 32 == 0); - static_assert(__workgroup_size == 32 || __workgroup_size == 64); + //static_assert(__data_per_workitem % 32 == 0); + //static_assert(__workgroup_size == 32 || __workgroup_size == 64); } //----------------------------------------------------------------------------- @@ -125,6 +125,57 @@ __order_preserving_cast_scalar(_Float __src) return __uint64_src ^ __mask; } +template +struct __keys_pack +{ + _KeyT __keys[_N]; +}; + +template +struct __pairs_pack +{ + _KeyT __keys[_N]; + _ValT __vals[_N]; +}; + +template +auto +__make_key_value_pack() +{ + if constexpr (::std::is_void_v<_T2>) + { + return __keys_pack<_N, _T1>{}; + } + else + { + return __pairs_pack<_N, _T1, _T2>{}; + } +} + +template +void +__sub_group_cross_segment_exclusive_scan(sycl::sub_group& __sub_group, _ScanBuffer* __scan_elements) +{ + // TODO: make it work if this static assert is not true + static_assert(__segment_width == __sub_group_size); + using _ElemT = std::remove_reference_t; + _ElemT __carry = 0; + auto __sub_group_local_id = __sub_group.get_local_linear_id(); + + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __num_segments; ++__i) + { + auto __element = __scan_elements[__i * __segment_width + __sub_group_local_id]; + auto __element_right_shift = sycl::shift_group_right(__sub_group, __element, 1); + if (__sub_group_local_id == 0) + __element_right_shift = 0; + __scan_elements[__i * __segment_width + __sub_group_local_id] = __element_right_shift + __carry; + + __carry += sycl::group_broadcast(__sub_group, __element, __sub_group_size - 1); + } +} + } // namespace oneapi::dpl::experimental::kt::gpu::__impl #endif // _ONEDPL_KT_SYCL_RADIX_SORT_UTILS_H diff --git a/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h b/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h index 0df7194f8fc..8e453ae5b37 100644 --- a/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h +++ b/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h @@ -17,11 +17,13 @@ #define _ONEDPL_KT_SUB_GROUP_SCAN_H #include +#include #include "../../../../pstl/utils.h" #include "../../../../pstl/hetero/dpcpp/sycl_defs.h" #include "../../../../pstl/hetero/dpcpp/unseq_backend_sycl.h" -#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h" +#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl.h" +//#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h" namespace oneapi::dpl::experimental::kt { @@ -32,6 +34,38 @@ namespace gpu namespace __impl { +template +struct __is_lazy_ctor_storage : std::false_type +{ +}; + +template +struct __is_lazy_ctor_storage> : std::true_type +{ +}; + +template +struct __scan_input_type +{ + using type = _T; +}; + +template +struct __scan_input_type<_T, true> +{ + using type = typename _T::__value_type; +}; + +template +decltype(auto) +__extract_scan_input(_T& __value) +{ + if constexpr (__is_lazy_ctor_storage>::value) + return (__value.__v); + else + return (__value); +} + // // An optimized scan in a sycl::sub_group performed in local registers. // Input is accepted in the form of an array in sub-group strided order. Formally, for some index i in __input, @@ -41,26 +75,28 @@ namespace __impl // the sub-group. This layout is to align with optimal loads from global memory without extra data movement. // The scan results are updated in __input. // -template -_InputType +auto __sub_group_scan(const _SubGroup& __sub_group, - oneapi::dpl::__internal::__lazy_ctor_storage<_InputType> __input[__iters_per_item], + _InputTypeWrapped __input[__iters_per_item], _BinaryOperation __binary_op, std::uint32_t __items_in_scan) { + using _InputType = typename __scan_input_type, + __is_lazy_ctor_storage>::value>::type; const bool __is_full = __items_in_scan == __sub_group_size * __iters_per_item; oneapi::dpl::__internal::__lazy_ctor_storage<_InputType> __carry; oneapi::dpl::__internal::__scoped_destroyer<_InputType> __destroy_when_leaving_scope{__carry}; if (__is_full) { oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true, - /*__init_present*/ false>(__sub_group, __input[0].__v, + /*__init_present*/ false>(__sub_group, __extract_scan_input(__input[0]), __binary_op, __carry); _ONEDPL_PRAGMA_UNROLL for (std::uint16_t __i = 1; __i < __iters_per_item; ++__i) { oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true, - /*__init_present*/ true>(__sub_group, __input[__i].__v, + /*__init_present*/ true>(__sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry); } } @@ -73,22 +109,24 @@ __sub_group_scan(const _SubGroup& __sub_group, { oneapi::dpl::__par_backend_hetero::__sub_group_scan_partial<__sub_group_size, /*__is_inclusive*/ true, /*__init_present*/ false>( - __sub_group, __input[__i].__v, __binary_op, __carry, __items_in_scan - __i * __sub_group_size); + __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry, + __items_in_scan - __i * __sub_group_size); } else if (__limited_iters_per_item > 1) { oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true, /*__init_present*/ false>( - __sub_group, __input[__i++].__v, __binary_op, __carry); + __sub_group, __extract_scan_input(__input[__i++]), __binary_op, __carry); for (; __i < __limited_iters_per_item - 1; ++__i) { oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true, /*__init_present*/ true>( - __sub_group, __input[__i].__v, __binary_op, __carry); + __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry); } oneapi::dpl::__par_backend_hetero::__sub_group_scan_partial<__sub_group_size, /*__is_inclusive*/ true, /*__init_present*/ true>( - __sub_group, __input[__i].__v, __binary_op, __carry, __items_in_scan - __i * __sub_group_size); + __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry, + __items_in_scan - __i * __sub_group_size); } } return __carry.__v; diff --git a/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h b/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h index c45463f9cdd..4c051ca8183 100644 --- a/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h +++ b/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h @@ -17,6 +17,8 @@ #include "../../../pstl/utils.h" #include "../../../pstl/hetero/dpcpp/utils_ranges_sycl.h" +#include "sub_group/sub_group_scan.h" + #include "radix_sort_utils.h" #include "esimd_radix_sort_utils.h" @@ -137,6 +139,502 @@ __sycl_global_histogram(sycl::nd_item<1> __idx, std::size_t __n, const _KeysRng& } } +template +struct __radix_sort_onesweep_kernel; + +template +struct __radix_sort_onesweep_kernel<__sycl_tag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size, + _InRngPack, _OutRngPack> +{ + using _LocOffsetT = std::uint16_t; + using _GlobOffsetT = std::uint32_t; + using _AtomicIdT = std::uint32_t; + + using _KeyT = typename _InRngPack::_KeyT; + using _ValT = typename _InRngPack::_ValT; + static constexpr bool __has_values = !std::is_void_v<_ValT>; + + static constexpr std::uint32_t __bin_count = 1 << __radix_bits; + + static constexpr std::uint32_t __sub_group_size = 32; + static constexpr std::uint32_t __num_sub_groups_per_work_group = __work_group_size / __sub_group_size; + static constexpr std::uint32_t __data_per_sub_group = __data_per_work_item * __sub_group_size; + + static constexpr std::uint32_t __bit_count = sizeof(_KeyT) * 8; + static constexpr _LocOffsetT __mask = __bin_count - 1; + static constexpr std::uint32_t __hist_stride = __bin_count * sizeof(_LocOffsetT); + static constexpr std::uint32_t __work_item_all_hists_size = __num_sub_groups_per_work_group * __hist_stride; + static constexpr std::uint32_t __group_hist_size = __hist_stride; // _LocOffsetT + static constexpr std::uint32_t __global_hist_size = __bin_count * sizeof(_GlobOffsetT); + + static constexpr std::uint32_t + __calc_reorder_slm_size() + { + if constexpr (__has_values) + return __work_group_size * __data_per_work_item * (sizeof(_KeyT) + sizeof(_ValT)); + else + return __work_group_size * __data_per_work_item * sizeof(_KeyT); + } + + // Helper functions for SLM layout calculation + static constexpr std::uint32_t + __get_slm_subgroup_hists_offset() + { + return 0; // Sub-group histograms start at the beginning + } + + static constexpr std::uint32_t + __get_slm_group_hist_offset() + { + constexpr std::uint32_t __reorder_size = __calc_reorder_slm_size(); + return std::max(__work_item_all_hists_size, __reorder_size); // After max(sub-group hists, reorder space) + } + + static constexpr std::uint32_t + __get_slm_global_incoming_offset() + { + return __get_slm_group_hist_offset() + __group_hist_size; // After group histogram + } + + static constexpr std::uint32_t + __get_slm_global_fix_offset() + { + return __get_slm_global_incoming_offset() + __global_hist_size; // After global incoming histogram + } + + static constexpr std::uint32_t + __calc_slm_alloc() + { + // SLM Layout Visualization: + // + // Phase 1 (Offset Calculation): + // ┌────────────────────────┬─────────────┬──────────────────┐ + // │ Sub-group Hists │ Group Hist │ Global Incoming │ + // │ max(__work_item_all_ │ __group_ │ __global_hist │ + // │ hists_size,__reorder | | | + // | _size │ hist_size │ _size │ + // └────────────────────────┴─────────────┴──────────────────┘ + // | | + // v v + // Phase 2 (Reorder): + // ┌────────────────────────┬─────────────┬──────────────────┬─────────────────┐ + // │ Reorder Space │ Group Hist │ Global Incoming │ Global Fix │ + // │ max(__work_item_all_ │ __group_ │ __global_hist │ __global_hist │ + // │ hists_size,__reorder │ hist_size │ _size │ _size │ + // │ _size) │ │ │ │ + // └────────────────────────┴─────────────┴──────────────────┴─────────────────┘ + // + constexpr std::uint32_t __reorder_size = __calc_reorder_slm_size(); + + // TODO: does starting alignment significantly matter for correctness and performance? If so we may need + // padding between regions + constexpr std::uint32_t __slm_size = std::max(__work_item_all_hists_size, __reorder_size) + __group_hist_size + + __global_hist_size + __global_hist_size; + + return oneapi::dpl::__internal::__dpl_ceiling_div(__slm_size, 2048) * 2048; + } + + const std::uint32_t __n; + const std::uint32_t __stage; + _GlobOffsetT* __p_global_hist; + _GlobOffsetT* __p_group_hists; + _AtomicIdT* __p_atomic_id; + _InRngPack __in_pack; + _OutRngPack __out_pack; + sycl::local_accessor __slm_accessor; + + __radix_sort_onesweep_kernel(std::uint32_t __n, std::uint32_t __stage, _GlobOffsetT* __p_global_hist, + _GlobOffsetT* __p_group_hists, _AtomicIdT* __p_atomic_id, const _InRngPack& __in_pack, + const _OutRngPack& __out_pack, sycl::local_accessor __slm_acc) + : __n(__n), __stage(__stage), __p_global_hist(__p_global_hist), __p_group_hists(__p_group_hists), + __p_atomic_id(__p_atomic_id), __in_pack(__in_pack), __out_pack(__out_pack), __slm_accessor(__slm_acc) + { + } + + template + inline auto + __load_pack(_KVPack& __pack, std::uint32_t __wg_id, std::uint32_t __sg_id, std::uint32_t __sg_local_id) const + { + const _GlobOffsetT __offset = __data_per_sub_group * (__wg_id * __num_sub_groups_per_work_group + __sg_id); + auto __keys_seq = __rng_data(__in_pack.__keys_rng()); + __load(__pack.__keys, __keys_seq, __offset, __sg_local_id); + if constexpr (__has_values) + { + __load(__pack.__vals, __rng_data(__in_pack.__vals_rng()), __offset, + __sg_local_id); + } + } + + template + inline void + __load(_T __elements[__data_per_work_item], const _InSeq& __in_seq, _GlobOffsetT __glob_offset, + std::uint32_t __local_offset) const + { + bool __is_full_block = (__glob_offset + __data_per_sub_group) <= __n; + auto __offset = __glob_offset + __local_offset; + if (__is_full_block) + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + __elements[__i] = __in_seq[__offset + __i * __sub_group_size]; + } + } + else + { + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + auto __idx = __offset + __i * __sub_group_size; + if constexpr (__sort_identity_residual) + { + __elements[__i] = (__idx < __n) ? __in_seq[__idx] : __sort_identity<_T, __is_ascending>(); + } + else + { + if (__idx < __n) + { + __elements[__i] = __in_seq[__idx]; + } + } + } + } + } + + static inline std::uint32_t + __match_bins(sycl::nd_item<1> __idx, std::uint32_t __bin) + { + // start with all bits 1 + auto __matched_bins = sycl::ext::oneapi::group_ballot(__idx.get_sub_group()); + _ONEDPL_PRAGMA_UNROLL + for (int __i = 0; __i < __radix_bits; __i++) + { + auto __bit = (__bin >> __i) & 1; + auto __sg_vote = sycl::ext::oneapi::group_ballot(__idx.get_sub_group(), static_cast(__bit)); + // If we vote yes, then we want to set all bits that also voted yes. If no, then we want to + // zero out the bits that said yes as they don't match and preserve others as we have no info on these. + __matched_bins &= __bit ? __sg_vote : ~__sg_vote; + } + std::uint32_t __result = 0; + __matched_bins.extract_bits(__result); + return __result; + } + + inline auto + __rank_local(sycl::nd_item<1> __idx, _LocOffsetT __ranks[__data_per_work_item], + _LocOffsetT __bins[__data_per_work_item], _LocOffsetT* __slm_subgroup_hists, + std::uint32_t __sub_group_slm_offset) const + { + std::uint32_t __sub_group_local_id = __idx.get_sub_group().get_local_id(); + _LocOffsetT* __slm_offset = __slm_subgroup_hists + __sub_group_slm_offset; + + for (std::uint32_t __i = __idx.get_sub_group().get_local_id(); __i < __bin_count; __i += __sub_group_size) + { + __slm_offset[__i] = 0; + } + // TODO: sub-group barrier ? maybe not for simd architectures + + constexpr std::uint32_t __sub_group_full_bitmask = 0x7fffffff; + static_assert(__sub_group_size == 32); + // lower bits than my current will be set meaning we only preserve left lanes + std::uint32_t __remove_right_lanes = __sub_group_full_bitmask >> (__sub_group_size - 1 - __sub_group_local_id); + + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + _LocOffsetT __bin = __bins[__i]; + std::uint32_t __matched_bins = __match_bins(__idx, __bin); + std::uint32_t __pre_rank = __slm_offset[__bin]; + auto __matched_left_lanes = __matched_bins & __remove_right_lanes; + std::uint32_t __this_round_rank = sycl::popcount(__matched_left_lanes); + std::uint32_t __this_round_count = sycl::popcount(__matched_bins); + auto __rank_after = __pre_rank + __this_round_rank; + auto __is_leader = __this_round_rank == __this_round_count - 1; + if (__is_leader) + { + __slm_offset[__bin] = __rank_after + 1; + } + __ranks[__i] = __rank_after; + } + sycl::group_barrier(__idx.get_group()); + } + + inline void + __rank_global(sycl::nd_item<1> __idx, std::uint32_t __wg_id, _LocOffsetT* __slm_subgroup_hists, + _LocOffsetT* __slm_group_hist, _GlobOffsetT* __slm_global_incoming) const + { + // TODO: This exists in the ESIMD KT and was ported but are we not limiting max input size to + // 2^30 ~ 1 billion elements? We use 32-bit indexing / histogram which may already be too small + // but are then reserving the two upper bits for lookback flags. + constexpr std::uint32_t __global_accumulated = 0x40000000; + constexpr std::uint32_t __hist_updated = 0x80000000; + constexpr std::uint32_t __global_offset_mask = 0x3fffffff; + + _GlobOffsetT* __p_this_group_hist = __p_group_hists + __bin_count * __wg_id; + _GlobOffsetT* __p_prev_group_hist = __p_this_group_hist - __bin_count; + + // This is important so that we can evenly partition the radix bits across a number of sub-groups + // without masking lanes. Radix bits is always a power of two, so this requirement essentially just + // requires radix_bits >= 5 for sub-group size of 32. + static_assert(__bin_count % __sub_group_size == 0); + + constexpr std::uint32_t __bin_summary_sub_group_size = __bin_count / __sub_group_size; + constexpr std::uint32_t __bin_width = __sub_group_size; + + auto __sub_group_id = __idx.get_sub_group().get_group_linear_id(); + auto __sub_group_local_id = __idx.get_sub_group().get_local_linear_id(); + + // 1. Vector scan of histograms previously accumulated by each work-item + // update slm instead of grf summary due to perf issues with grf histogram + + // TODO: this single element array is a temporary workaround for sub group scan requiring an array + _LocOffsetT __item_grf_hist_summary_arr[1] = {0}; + _LocOffsetT& __item_grf_hist_summary = __item_grf_hist_summary_arr[0]; + _LocOffsetT __item_bin_count; + if (__sub_group_id < __bin_summary_sub_group_size) + { + // 1.1. Vector scan of the same bins across different histograms. + std::uint32_t __slm_bin_hist_summary_offset = __sub_group_id * __bin_width; + + for (std::uint32_t __s = 0; __s < __num_sub_groups_per_work_group; + __s++, __slm_bin_hist_summary_offset += __bin_count) + { + auto __slm_idx = __slm_bin_hist_summary_offset + __sub_group_local_id; + __item_grf_hist_summary += __slm_subgroup_hists[__slm_idx]; + __slm_subgroup_hists[__slm_idx] = __item_grf_hist_summary; + } + __item_bin_count = __item_grf_hist_summary; + + // 1.2. Vector scan of different bins inside one histogram, the final one for the whole work-group. + // Only "__bin_width" pieces of the histogram are scanned at this stage. + // This histogram will be further used for calculation of offsets of keys already reordered in SLM, + // it does not participate in sycnhronization between work-groups. + __sub_group_scan<__sub_group_size, 1>(__idx.get_sub_group(), __item_grf_hist_summary_arr, std::plus<>{}, + __bin_width); + + auto __write_idx = __sub_group_id * __bin_width + __sub_group_local_id; + __slm_group_hist[__write_idx] = __item_grf_hist_summary; + + // 1.3. Copy the histogram at the region designated for synchronization between work-groups. + // Write the histogram to global memory, bypassing caches, to ensure cross-work-group visibility. + if (__wg_id != 0) + { + // Copy the histogram, local to this WG + using _GlobalAtomicT = sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + _GlobalAtomicT __ref(__p_this_group_hist[__sub_group_id * __bin_width + __sub_group_local_id]); + __ref.store(__item_bin_count | __hist_updated); + } + else + { + // WG0 is a special case: it also retrieves the total global histogram and adds it to its local histogram + // This global histogram will be propagated to other work-groups through a chained scan at stage 2 + using _GlobalAtomicT = + sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + + auto __hist_idx = __sub_group_id * __bin_width + __sub_group_local_id; + _GlobOffsetT __global_hist = __p_global_hist[__hist_idx] & __global_offset_mask; + _GlobOffsetT __after_group_hist_sum = __global_hist + __item_bin_count; + _GlobalAtomicT __ref(__p_this_group_hist[__hist_idx]); + __ref.store(__after_group_hist_sum | __hist_updated | __global_accumulated); + // Copy the global histogram to local memory to share with other work-items + __slm_global_incoming[__hist_idx] = __global_hist; + } + } + sycl::group_barrier(__idx.get_group()); + + auto __sub_group = __idx.get_sub_group(); + auto __sub_group_group_id = __sub_group.get_group_linear_id(); + + // 1.4 One work-item finalizes scan performed at stage 1.2 + // by propagating prefixes accumulated after scanning individual "__bin_width" pieces. + if (__sub_group_group_id == __bin_summary_sub_group_size + 1) + { + __sub_group_cross_segment_exclusive_scan<__bin_width, __bin_summary_sub_group_size, __sub_group_size>( + __sub_group, __slm_group_hist); + } + + sycl::group_barrier(__idx.get_group()); + + // 2. Chained scan. Synchronization between work-groups. + if (__sub_group_group_id < __bin_summary_sub_group_size && __wg_id != 0) + { + using _GlobalAtomicT = + sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + // 2.1. Read the histograms scanned across work-groups + _GlobOffsetT __prev_group_hist_sum = 0; + _GlobOffsetT __prev_group_hist; + bool __is_not_accumulated = true; + do + { + auto __idx = __sub_group_group_id * __bin_width + __sub_group_local_id; + _GlobalAtomicT __ref(__p_prev_group_hist[__idx]); + do + { + __prev_group_hist = __ref.load(); + } while ((__prev_group_hist & __hist_updated) == 0); + __prev_group_hist_sum += __is_not_accumulated ? __prev_group_hist : 0; + __is_not_accumulated = (__prev_group_hist_sum & __global_accumulated) == 0; + __p_prev_group_hist -= __bin_count; + } while (sycl::any_of_group(__sub_group, __is_not_accumulated)); + + _GlobOffsetT __after_group_hist_sum = 0; + __prev_group_hist_sum &= __global_offset_mask; + __after_group_hist_sum = __prev_group_hist_sum + __item_bin_count; + auto __idx = __sub_group_group_id * __bin_width + __sub_group_local_id; + // 2.2. Write the histogram scanned across work-group, updated with the current work-group data + _GlobalAtomicT __ref(__p_this_group_hist[__idx]); + __ref.store(__after_group_hist_sum | __hist_updated | __global_accumulated); + // 2.3. Save the scanned histogram from previous work-groups locally + __slm_global_incoming[__idx] = __prev_group_hist_sum; + } + sycl::group_barrier(__idx.get_group()); + } + + template + void inline __reorder_reg_to_slm(sycl::nd_item<1> __idx, const _KVPack& __pack, + _LocOffsetT (&__ranks)[__data_per_work_item], + const _LocOffsetT (&__bins)[__data_per_work_item], + _LocOffsetT* __slm_subgroup_hists, _LocOffsetT* __slm_group_hist, + _GlobOffsetT* __slm_global_incoming, _GlobOffsetT* __slm_global_fix, + _KeyT* __slm_keys, _ValT* __slm_vals) const + { + auto __sub_group_id = __idx.get_sub_group().get_group_linear_id(); + const auto __wg_size = __idx.get_local_range(0); + + // 1. update ranks to reflect sub-group offsets in and across bins + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + auto __bin = __bins[__i]; + auto __group_incoming = __slm_group_hist[__bin]; + auto __offset_in_bin = + (__sub_group_id == 0) ? 0 : __slm_subgroup_hists[(__sub_group_id - 1) * __bin_count + __bin]; + auto __offset_across_bins = __group_incoming; + __ranks[__i] += __offset_in_bin + __offset_across_bins; + } + + // 2. compute __global_fix + for (std::uint32_t __i = __idx.get_local_id(); __i < __bin_count; __i += __wg_size) + { + __slm_global_fix[__i] = __slm_global_incoming[__i] - static_cast<_GlobOffsetT>(__slm_group_hist[__i]); + } + sycl::group_barrier(__idx.get_group()); + + // 3. Write keys (and values) to SLM at computed ranks + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + __slm_keys[__ranks[__i]] = __pack.__keys[__i]; + if constexpr (__has_values) + { + __slm_vals[__ranks[__i]] = __pack.__vals[__i]; + } + } + sycl::group_barrier(__idx.get_group()); + } + + template + void inline __reorder_slm_to_glob(sycl::nd_item<1> __idx, _KVPack& __pack, _GlobOffsetT* __slm_global_fix, + _KeyT* __slm_keys, _ValT* __slm_vals) const + { + auto __sub_group_id = __idx.get_sub_group().get_group_linear_id(); + auto __sub_group_local_id = __idx.get_sub_group().get_local_linear_id(); + + const _GlobOffsetT __keys_slm_offset = __data_per_sub_group * __sub_group_id; + + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + auto __slm_idx = __keys_slm_offset + __i * __sub_group_size + __sub_group_local_id; + auto __key = __slm_keys[__slm_idx]; + auto __bin = __get_bucket_scalar<__mask>(__order_preserving_cast_scalar<__is_ascending>(__key), + __stage * __radix_bits); + auto __global_fix = __slm_global_fix[__bin]; + auto __out_idx = __global_fix + __slm_idx; + + // TODO: we need to figure out how to relax this bounds checking for full unrolling + bool __output_mask = __out_idx < __n; + if (__output_mask) + __out_pack.__keys_rng()[__out_idx] = __key; + if constexpr (__has_values) + { + auto __val = __slm_vals[__slm_idx]; + if (__output_mask) + __out_pack.__vals_rng()[__out_idx] = __val; + } + } + } + + [[sycl::reqd_sub_group_size(__sub_group_size)]] void + operator()(sycl::nd_item<1> __idx) const + { + const std::uint32_t __local_tid = __idx.get_local_linear_id(); + const std::uint32_t __wg_size = __idx.get_local_range(0); + const std::uint32_t __sg_id = __idx.get_sub_group().get_group_linear_id(); + const std::uint32_t __sg_local_id = __idx.get_sub_group().get_local_id(); + + const std::uint32_t __num_wgs = __idx.get_group_range(0); + using _AtomicRefT = sycl::atomic_ref<_AtomicIdT, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + _AtomicRefT __atomic_id_ref(*__p_atomic_id); + std::uint32_t __wg_id = 0; + if (__idx.get_local_linear_id() == 0) + { + // Modulo num work-groups because onesweep gets invoked multiple times and we do not want an extra memset between + // invocations. + __wg_id = __atomic_id_ref.fetch_add(1) % __num_wgs; + } + __wg_id = sycl::group_broadcast(__idx.get_group(), __wg_id); + + const std::uint32_t __sub_group_slm_offset = __sg_id * __bin_count; + + auto __values_pack = __make_key_value_pack<__data_per_work_item, _KeyT, _ValT>(); + _LocOffsetT __bins[__data_per_work_item]; + _LocOffsetT __ranks[__data_per_work_item]; + + __load_pack(__values_pack, __wg_id, __sg_id, __sg_local_id); + + _ONEDPL_PRAGMA_UNROLL + for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) + { + const auto __ordered = __order_preserving_cast_scalar<__is_ascending>(__values_pack.__keys[__i]); + __bins[__i] = __get_bucket_scalar<__mask>(__ordered, __stage * __radix_bits); + } + + // Get raw SLM pointer and create typed pointers for different regions using helper functions + unsigned char* __slm_raw = __slm_accessor.get_multi_ptr().get(); + _LocOffsetT* __slm_subgroup_hists = + reinterpret_cast<_LocOffsetT*>(__slm_raw + __get_slm_subgroup_hists_offset()); + _LocOffsetT* __slm_group_hist = reinterpret_cast<_LocOffsetT*>(__slm_raw + __get_slm_group_hist_offset()); + _GlobOffsetT* __slm_global_incoming = + reinterpret_cast<_GlobOffsetT*>(__slm_raw + __get_slm_global_incoming_offset()); + + __rank_local(__idx, __ranks, __bins, __slm_subgroup_hists, __sub_group_slm_offset); + __rank_global(__idx, __wg_id, __slm_subgroup_hists, __slm_group_hist, __slm_global_incoming); + + // For reorder phase, reinterpret the sub-group histogram space as key/value storage + // The reorder space overlaps with the sub-group histogram region (reinterpret_cast) + _KeyT* __slm_keys = reinterpret_cast<_KeyT*>(__slm_raw + __get_slm_subgroup_hists_offset()); + _ValT* __slm_vals = nullptr; + if constexpr (__has_values) + { + __slm_vals = reinterpret_cast<_ValT*>(__slm_raw + __get_slm_subgroup_hists_offset() + + __wg_size * __data_per_work_item * sizeof(_KeyT)); + } + _GlobOffsetT* __slm_global_fix = reinterpret_cast<_GlobOffsetT*>(__slm_raw + __get_slm_global_fix_offset()); + + __reorder_reg_to_slm(__idx, __values_pack, __ranks, __bins, __slm_subgroup_hists, __slm_group_hist, + __slm_global_incoming, __slm_global_fix, __slm_keys, __slm_vals); + + __reorder_slm_to_glob(__idx, __values_pack, __slm_global_fix, __slm_keys, __slm_vals); + } +}; + } // namespace oneapi::dpl::experimental::kt::gpu::__impl #endif // _ONEDPL_KT_SYCL_RADIX_SORT_KERNELS_H diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt index 392be9ff5fb..a8d601e4995 100644 --- a/test/kt/CMakeLists.txt +++ b/test/kt/CMakeLists.txt @@ -101,8 +101,16 @@ endfunction() function(_generate_sort_tests _variant _key_value_pairs) set(_base_file_all "radix_sort" "radix_sort_out_of_place") set(_base_file_by_key_all "radix_sort_by_key" "radix_sort_by_key_out_of_place") - set(_data_per_work_item_all "32" "64" "96" "128" "160" "192" "224" "256" "288" "320" "352" "384" "416" "448" "480" "512") - set(_work_group_size_all "64") + + # Variant-specific configurations + if (${_variant} STREQUAL "sycl") + set(_data_per_work_item_all "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16") + set(_work_group_size_all "1024") + else() # esimd + set(_data_per_work_item_all "32" "64" "96" "128" "160" "192" "224" "256" "288" "320" "352" "384" "416" "448" "480" "512") + set(_work_group_size_all "64") + endif() + set(_type_all "char" "uint16_t" "int" "uint64_t" "float" "double") foreach (_data_per_work_item ${_data_per_work_item_all}) @@ -140,8 +148,8 @@ if (ONEDPL_TEST_ENABLE_KT_SYCL) _generate_sort_tests("sycl" TRUE) # radix_sort_by_key, random # Pin some cases to track them - _generate_sort_test("sycl" "radix_sort_by_key_out_of_place" "96" "64" "uint32_t" "uint32_t" 1000) - _generate_sort_test("sycl" "radix_sort" "384" "64" "int32_t" "" 1000) + _generate_sort_test("sycl" "radix_sort_by_key_out_of_place" "3" "1024" "uint32_t" "uint32_t" 1000) + _generate_sort_test("sycl" "radix_sort" "12" "1024" "int32_t" "" 1000) endif() function (_generate_gpu_scan_test _data_per_work_item _work_group_size _type _probability_permille) diff --git a/test/kt/sycl_radix_sort.cpp b/test/kt/sycl_radix_sort.cpp index 3dc31a307d2..056879b0349 100755 --- a/test/kt/sycl_radix_sort.cpp +++ b/test/kt/sycl_radix_sort.cpp @@ -123,18 +123,9 @@ test_sycl_iterators(sycl::queue q, std::size_t size, KernelParam param) std::stable_sort(std::begin(ref), std::end(ref), Compare{}); { sycl::buffer buf(input.data(), input.size()); - -#ifdef __clang__ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wdeprecated-declarations" -#endif - // Deprecated namespace is used deliberatelly to make sure the functionality is still available - oneapi::dpl::experimental::kt::esimd::radix_sort(q, oneapi::dpl::begin(buf), oneapi::dpl::end(buf), - param) + oneapi::dpl::experimental::kt::gpu::radix_sort(q, oneapi::dpl::begin(buf), + oneapi::dpl::end(buf), param) .wait(); -#ifdef __clang__ -#pragma clang diagnostic pop -#endif } std::string msg = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size); diff --git a/test/kt/sycl_radix_sort_by_key.cpp b/test/kt/sycl_radix_sort_by_key.cpp index dff02e90a73..fa8873523a5 100755 --- a/test/kt/sycl_radix_sort_by_key.cpp +++ b/test/kt/sycl_radix_sort_by_key.cpp @@ -40,7 +40,7 @@ void test_sycl_buffer(sycl::queue q, std::size_t size, KernelParam param) { sycl::buffer keys(actual_keys.data(), actual_keys.size()); sycl::buffer values(actual_values.data(), actual_values.size()); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key(q, keys, values, param).wait(); + oneapi::dpl::experimental::kt::gpu::radix_sort_by_key(q, keys, values, param).wait(); } auto expected_first = oneapi::dpl::make_zip_iterator(std::begin(expected_keys), std::begin(expected_values)); @@ -71,8 +71,9 @@ void test_usm(sycl::queue q, std::size_t size, KernelParam param) auto expected_first = oneapi::dpl::make_zip_iterator(std::begin(expected_keys), std::begin(expected_values)); std::stable_sort(expected_first, expected_first + size, CompareKey{}); - oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key( - q, keys.get_data(), keys.get_data() + size, values.get_data(), param).wait(); + oneapi::dpl::experimental::kt::gpu::radix_sort_by_key( + q, keys.get_data(), keys.get_data() + size, values.get_data(), param) + .wait(); std::vector actual_keys(size); std::vector actual_values(size);