diff --git a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h
index 3ffaa6cc660..da7312cd626 100644
--- a/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h
+++ b/include/oneapi/dpl/experimental/kt/internal/esimd_radix_sort_kernels.h
@@ -391,10 +391,15 @@ template <typename _KtTag, bool __is_ascending, ::std::uint8_t __radix_bits, ::s
 //                                     __work_group_size, _InRngPack, _OutRngPack>
 template <typename _KtTag, bool __is_ascending, ::std::uint8_t __radix_bits, ::std::uint16_t __data_per_work_item,
           ::std::uint16_t __work_group_size, typename _InRngPack, typename _OutRngPack>
-struct __radix_sort_onesweep_kernel
+struct __radix_sort_onesweep_kernel;
+
+template <bool __is_ascending, ::std::uint8_t __radix_bits, ::std::uint16_t __data_per_work_item,
+          ::std::uint16_t __work_group_size, typename _InRngPack, typename _OutRngPack>
+struct __radix_sort_onesweep_kernel<__esimd_tag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size, _InRngPack, _OutRngPack>
 {
     using _LocOffsetT = ::std::uint16_t;
     using _GlobOffsetT = ::std::uint32_t;
+    using _AtomicIdT = ::std::uint32_t;
 
     using _KeyT = typename _InRngPack::_KeyT;
     using _ValT = typename _InRngPack::_ValT;
@@ -457,7 +462,7 @@ struct __radix_sort_onesweep_kernel
     _OutRngPack __out_pack;
 
     __radix_sort_onesweep_kernel(::std::uint32_t __n, ::std::uint32_t __stage, _GlobOffsetT* __p_global_hist,
-                                 _GlobOffsetT* __p_group_hists, const _InRngPack& __in_pack,
+                                 _GlobOffsetT* __p_group_hists, _AtomicIdT* /*__p_atomic_id*/, const _InRngPack& __in_pack,
                                  const _OutRngPack& __out_pack)
         : __n(__n), __stage(__stage), __p_global_hist(__p_global_hist), __p_group_hists(__p_group_hists),
           __in_pack(__in_pack), __out_pack(__out_pack)
diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h
index 6387e095476..0e19afa65e3 100644
--- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h
+++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_dispatchers.h
@@ -86,6 +86,7 @@ class __onesweep_memory_holder
     // Memory to store intermediate results of sorting
     _KeyT* __m_keys_ptr = nullptr;
     _ValT* __m_vals_ptr = nullptr;
+    std::uint32_t* __m_atomic_id_pointer = nullptr;
 
     ::std::size_t __m_raw_mem_bytes = 0;
     ::std::size_t __m_keys_bytes = 0;
@@ -93,13 +94,15 @@ class __onesweep_memory_holder
     ::std::size_t __m_global_hist_bytes = 0;
     ::std::size_t __m_group_hist_bytes = 0;
 
+    ::std::size_t __m_atomic_id_bytes = 4;
+
     sycl::queue __m_q;
 
     void
     __calculate_raw_memory_amount() noexcept
     {
         // Extra bytes are added for potentiall padding
-        __m_raw_mem_bytes = __m_keys_bytes + __m_global_hist_bytes + __m_group_hist_bytes + sizeof(_KeyT);
+        __m_raw_mem_bytes = __m_keys_bytes + __m_global_hist_bytes + __m_group_hist_bytes + __m_atomic_id_bytes + sizeof(std::uint32_t) + sizeof(_KeyT);
         if constexpr (__has_values)
         {
             __m_raw_mem_bytes += (__m_vals_bytes + sizeof(_ValT));
@@ -135,6 +138,9 @@ class __onesweep_memory_holder
             __aligned_ptr = ::std::align(::std::alignment_of_v<_ValT>, __m_vals_bytes, __base_ptr, __remainder);
             __m_vals_ptr = reinterpret_cast<_ValT*>(__aligned_ptr);
         }
+        std::size_t __atomic_id_offset = __m_raw_mem_bytes - __m_atomic_id_bytes;
+        __atomic_id_offset -= (__atomic_id_offset % alignof(std::uint32_t));
+        __m_atomic_id_pointer = reinterpret_cast<std::uint32_t*>(__m_raw_mem_ptr + __atomic_id_offset);
     }
 
   public:
@@ -181,6 +187,11 @@ class __onesweep_memory_holder
     {
         return __m_group_hist_ptr;
     }
+    std::uint32_t*
+    __atomic_id_pointer() const noexcept
+    {
+        return __m_atomic_id_pointer;
+    }
 
     void
     __allocate()
@@ -249,7 +260,7 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng
     // TODO: consider adding a more versatile API, e.g. passing special kernel_config parameters for histogram computation
     // ESIMD work-group size: 64 XVEs ~ 2048 SIMD lanes
     // SYCL work-group size: Programming model enables 1024, so 128 required for PVC-1550 full concurrency. 10x HW oversubscription
-    constexpr ::std::uint32_t __hist_work_group_count = std::is_same_v<_KtTag, __sycl_tag> ? 128 * 10 : 32;
+    constexpr ::std::uint32_t __hist_work_group_count = std::is_same_v<_KtTag, __sycl_tag> ? 128 * 10 : 64;
     constexpr ::std::uint32_t __hist_work_group_size = std::is_same_v<_KtTag, __sycl_tag> ? 1024 : 64;
     __event_chain = __radix_sort_histogram_submitter<__is_ascending, __radix_bits, __hist_work_group_count,
                                                      __hist_work_group_size, _RadixSortHistogram>()(
@@ -260,7 +271,7 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng
 
     __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item,
                                                     __work_group_size, _RadixSortSweepInitial>()(
-        __kt_tag, __q, __input_pack, __virt_pack1, __mem_holder.__global_hist_ptr(), __mem_holder.__group_hist_ptr(),
+        __kt_tag, __q, __input_pack, __virt_pack1, __mem_holder.__global_hist_ptr(), __mem_holder.__group_hist_ptr(), __mem_holder.__atomic_id_pointer(),
         __sweep_work_group_count, __n, 0, __event_chain);
 
     for (::std::uint32_t __stage = 1; __stage < __stage_count; __stage++)
@@ -273,14 +284,14 @@ __onesweep_impl(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __input_pack, _Rng
         {
             __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item,
                                                             __work_group_size, _RadixSortSweepOdd>()(
-                __kt_tag, __q, __virt_pack1, __virt_pack2, __p_global_hist, __p_group_hists, __sweep_work_group_count, __n,
+                __kt_tag, __q, __virt_pack1, __virt_pack2, __p_global_hist, __p_group_hists, __mem_holder.__atomic_id_pointer(), __sweep_work_group_count, __n,
                 __stage, __event_chain);
         }
         else
         {
             __event_chain = __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item,
                                                             __work_group_size, _RadixSortSweepEven>()(
-                __kt_tag, __q, __virt_pack2, __virt_pack1, __p_global_hist, __p_group_hists, __sweep_work_group_count, __n,
+                __kt_tag, __q, __virt_pack2, __virt_pack1, __p_global_hist, __p_group_hists, __mem_holder.__atomic_id_pointer(), __sweep_work_group_count, __n,
                 __stage, __event_chain);
         }
     }
@@ -389,23 +400,23 @@ __radix_sort(_KtTag __kt_tag, sycl::queue __q, _RngPack1&& __pack_in, _RngPack2&
     else
     {
         constexpr ::std::uint32_t __one_wg_cap = __data_per_workitem * __workgroup_size;
-        if (__n <= __one_wg_cap)
+        // TODO: this is temporary in the prototype until we have a SYCL one wg version to plugin.
+        if constexpr (std::is_same_v<_KtTag, __esimd_tag>)
         {
-            // TODO: support different RadixBits values (only 7, 8, 9 are currently supported)
-            // TODO: support more granular DataPerWorkItem and WorkGroupSize
-
-            return __one_wg<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size>(
-                __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n);
-        }
-        else
-        {
-            // TODO: avoid kernel duplication (generate the output storage with the same type as input storage and use swap)
-            // TODO: support different RadixBits
-            // TODO: support more granular DataPerWorkItem and WorkGroupSize
-            return __onesweep<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size,
-                              __in_place>(__kt_tag, __q, ::std::forward<_RngPack1>(__pack_in),
-                                          ::std::forward<_RngPack2>(__pack_out), __n);
+            if (__n <= __one_wg_cap)
+            {
+                // TODO: support different RadixBits values (only 7, 8, 9 are currently supported)
+                // TODO: support more granular DataPerWorkItem and WorkGroupSize
+
+                return __one_wg<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size>(
+                    __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n);
+            }
         }
+        // TODO: avoid kernel duplication (generate the output storage with the same type as input storage and use swap)
+        // TODO: support different RadixBits
+        // TODO: support more granular DataPerWorkItem and WorkGroupSize
+        return __onesweep<_KernelName, __is_ascending, __radix_bits, __data_per_workitem, __workgroup_size, __in_place>(
+            __kt_tag, __q, ::std::forward<_RngPack1>(__pack_in), ::std::forward<_RngPack2>(__pack_out), __n);
     }
 }
 
diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h
index eb6d0e05783..fa51a35140f 100644
--- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h
+++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_submitters.h
@@ -170,27 +170,80 @@ template <bool __is_ascending, ::std::uint8_t __radix_bits, ::std::uint16_t __da
 struct __radix_sort_onesweep_submitter<__is_ascending, __radix_bits, __data_per_work_item, __work_group_size,
                                        oneapi::dpl::__par_backend_hetero::__internal::__optional_kernel_name<_Name...>>
 {
-    template <typename _KtTag, typename _InRngPack, typename _OutRngPack, typename _GlobalHistT>
+  private:
+    // ESIMD kernel dispatch
+    template <typename _InRngPack, typename _OutRngPack, typename _GlobalHistT, typename _AtomicIdT>
+    sycl::event
+    __submit_esimd(sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist,
+                   _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id, ::std::uint32_t __sweep_work_group_count,
+                   ::std::size_t __n, ::std::uint32_t __stage, const sycl::event& __e) const
+    {
+        sycl::nd_range<1> __nd_range(__sweep_work_group_count * __work_group_size, __work_group_size);
+        return __q.submit([&](sycl::handler& __cgh) {
+            oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__keys_rng(), __out_pack.__keys_rng());
+            if constexpr (::std::decay_t<_InRngPack>::__has_values)
+            {
+                oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__vals_rng(), __out_pack.__vals_rng());
+            }
+            __cgh.depends_on(__e);
+            __radix_sort_onesweep_kernel<__esimd_tag, __is_ascending, __radix_bits, __data_per_work_item,
+                                         __work_group_size, ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>>
+                __kernel(__n, __stage, __p_global_hist, __p_group_hists, __p_atomic_id,
+                         ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack));
+            __cgh.parallel_for<_Name...>(__nd_range, __kernel);
+        });
+    }
+
+    // SYCL kernel dispatch
+    template <typename _InRngPack, typename _OutRngPack, typename _GlobalHistT, typename _AtomicIdT>
     sycl::event
-    operator()(_KtTag, sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist,
-               _GlobalHistT* __p_group_hists, ::std::uint32_t __sweep_work_group_count, ::std::size_t __n,
-               ::std::uint32_t __stage, const sycl::event& __e) const
+    __submit_sycl(sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack, _GlobalHistT* __p_global_hist,
+                  _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id, ::std::uint32_t __sweep_work_group_count,
+                  ::std::size_t __n, ::std::uint32_t __stage, const sycl::event& __e) const
     {
+        using _KernelType =
+            __radix_sort_onesweep_kernel<__sycl_tag, __is_ascending, __radix_bits, __data_per_work_item,
+                                         __work_group_size, ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>>;
+        constexpr ::std::uint32_t __slm_size_bytes = _KernelType::__calc_slm_alloc();
+        constexpr ::std::uint32_t __slm_size_elements = __slm_size_bytes / sizeof(::std::uint32_t);
+
         sycl::nd_range<1> __nd_range(__sweep_work_group_count * __work_group_size, __work_group_size);
         return __q.submit([&](sycl::handler& __cgh) {
+            sycl::local_accessor<unsigned char, 1> __slm_accessor(__slm_size_bytes, __cgh);
             oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__keys_rng(), __out_pack.__keys_rng());
             if constexpr (::std::decay_t<_InRngPack>::__has_values)
             {
                 oneapi::dpl::__ranges::__require_access(__cgh, __in_pack.__vals_rng(), __out_pack.__vals_rng());
             }
             __cgh.depends_on(__e);
-            __radix_sort_onesweep_kernel<_KtTag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size,
-                                         ::std::decay_t<_InRngPack>, ::std::decay_t<_OutRngPack>>
-                __kernel(__n, __stage, __p_global_hist, __p_group_hists, ::std::forward<_InRngPack>(__in_pack),
-                         ::std::forward<_OutRngPack>(__out_pack));
+            _KernelType __kernel(__n, __stage, __p_global_hist, __p_group_hists, __p_atomic_id,
+                                 ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack),
+                                 __slm_accessor);
             __cgh.parallel_for<_Name...>(__nd_range, __kernel);
         });
     }
+
+  public:
+    template <typename _KtTag, typename _InRngPack, typename _OutRngPack, typename _GlobalHistT, typename _AtomicIdT>
+    sycl::event
+    operator()(_KtTag, sycl::queue& __q, _InRngPack&& __in_pack, _OutRngPack&& __out_pack,
+               _GlobalHistT* __p_global_hist, _GlobalHistT* __p_group_hists, _AtomicIdT* __p_atomic_id,
+               ::std::uint32_t __sweep_work_group_count, ::std::size_t __n, ::std::uint32_t __stage,
+               const sycl::event& __e) const
+    {
+        if constexpr (std::is_same_v<_KtTag, __sycl_tag>)
+        {
+            return __submit_sycl(__q, ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack),
+                                 __p_global_hist, __p_group_hists, __p_atomic_id, __sweep_work_group_count, __n,
+                                 __stage, __e);
+        }
+        else
+        {
+            return __submit_esimd(__q, ::std::forward<_InRngPack>(__in_pack), ::std::forward<_OutRngPack>(__out_pack),
+                                  __p_global_hist, __p_group_hists, __p_atomic_id, __sweep_work_group_count, __n,
+                                  __stage, __e);
+        }
+    }
 };
 
 template <typename _KernelName>
diff --git a/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h b/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h
index efac51b2815..fa827a196c7 100644
--- a/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h
+++ b/include/oneapi/dpl/experimental/kt/internal/radix_sort_utils.h
@@ -34,8 +34,8 @@ constexpr void
 __check_onesweep_params()
 {
     static_assert(__radix_bits == 8);
-    static_assert(__data_per_workitem % 32 == 0);
-    static_assert(__workgroup_size == 32 || __workgroup_size == 64);
+    //static_assert(__data_per_workitem % 32 == 0);
+    //static_assert(__workgroup_size == 32 || __workgroup_size == 64);
 }
 
 //-----------------------------------------------------------------------------
@@ -125,6 +125,57 @@ __order_preserving_cast_scalar(_Float __src)
     return __uint64_src ^ __mask;
 }
 
+template <std::uint16_t _N, typename _KeyT>
+struct __keys_pack
+{
+    _KeyT __keys[_N];
+};
+
+template <std::uint16_t _N, typename _KeyT, typename _ValT>
+struct __pairs_pack
+{
+    _KeyT __keys[_N];
+    _ValT __vals[_N];
+};
+
+template <std::uint16_t _N, typename _T1, typename _T2 = void>
+auto
+__make_key_value_pack()
+{
+    if constexpr (::std::is_void_v<_T2>)
+    {
+        return __keys_pack<_N, _T1>{};
+    }
+    else
+    {
+        return __pairs_pack<_N, _T1, _T2>{};
+    }
+}
+
+template <std::uint32_t __segment_width, std::uint32_t __num_segments, std::uint32_t __sub_group_size,
+          typename _ScanBuffer>
+void
+__sub_group_cross_segment_exclusive_scan(sycl::sub_group& __sub_group, _ScanBuffer* __scan_elements)
+{
+    // TODO: make it work if this static assert is not true
+    static_assert(__segment_width == __sub_group_size);
+    using _ElemT = std::remove_reference_t<decltype(__scan_elements[0])>;
+    _ElemT __carry = 0;
+    auto __sub_group_local_id = __sub_group.get_local_linear_id();
+
+    _ONEDPL_PRAGMA_UNROLL
+    for (std::uint32_t __i = 0; __i < __num_segments; ++__i)
+    {
+        auto __element = __scan_elements[__i * __segment_width + __sub_group_local_id];
+        auto __element_right_shift = sycl::shift_group_right(__sub_group, __element, 1);
+        if (__sub_group_local_id == 0)
+            __element_right_shift = 0;
+        __scan_elements[__i * __segment_width + __sub_group_local_id] = __element_right_shift + __carry;
+
+        __carry += sycl::group_broadcast(__sub_group, __element, __sub_group_size - 1);
+    }
+}
+
 } // namespace oneapi::dpl::experimental::kt::gpu::__impl
 
 #endif // _ONEDPL_KT_SYCL_RADIX_SORT_UTILS_H
diff --git a/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h b/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h
index 0df7194f8fc..8e453ae5b37 100644
--- a/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h
+++ b/include/oneapi/dpl/experimental/kt/internal/sub_group/sub_group_scan.h
@@ -17,11 +17,13 @@
 #define _ONEDPL_KT_SUB_GROUP_SCAN_H
 
 #include <cstdint>
+#include <type_traits>
 
 #include "../../../../pstl/utils.h"
 #include "../../../../pstl/hetero/dpcpp/sycl_defs.h"
 #include "../../../../pstl/hetero/dpcpp/unseq_backend_sycl.h"
-#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h"
+#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl.h"
+//#include "../../../../pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h"
 
 namespace oneapi::dpl::experimental::kt
 {
@@ -32,6 +34,38 @@ namespace gpu
 namespace __impl
 {
 
+template <typename _T>
+struct __is_lazy_ctor_storage : std::false_type
+{
+};
+
+template <typename _T>
+struct __is_lazy_ctor_storage<oneapi::dpl::__internal::__lazy_ctor_storage<_T>> : std::true_type
+{
+};
+
+template <typename _T, bool __is_lazy>
+struct __scan_input_type
+{
+    using type = _T;
+};
+
+template <typename _T>
+struct __scan_input_type<_T, true>
+{
+    using type = typename _T::__value_type;
+};
+
+template <typename _T>
+decltype(auto)
+__extract_scan_input(_T& __value)
+{
+    if constexpr (__is_lazy_ctor_storage<std::decay_t<_T>>::value)
+        return (__value.__v);
+    else
+        return (__value);
+}
+
 //
 // An optimized scan in a sycl::sub_group performed in local registers.
 // Input is accepted in the form of an array in sub-group strided order. Formally, for some index i in __input,
@@ -41,26 +75,28 @@ namespace __impl
 // the sub-group. This layout is to align with optimal loads from global memory without extra data movement.
 // The scan results are updated in __input.
 //
-template <std::uint8_t __sub_group_size, std::uint16_t __iters_per_item, typename _InputType, typename _SubGroup,
+template <std::uint8_t __sub_group_size, std::uint16_t __iters_per_item, typename _InputTypeWrapped, typename _SubGroup,
           typename _BinaryOperation>
-_InputType
+auto
 __sub_group_scan(const _SubGroup& __sub_group,
-                 oneapi::dpl::__internal::__lazy_ctor_storage<_InputType> __input[__iters_per_item],
+                 _InputTypeWrapped __input[__iters_per_item],
                  _BinaryOperation __binary_op, std::uint32_t __items_in_scan)
 {
+    using _InputType = typename __scan_input_type<std::decay_t<_InputTypeWrapped>,
+                                                  __is_lazy_ctor_storage<std::decay_t<_InputTypeWrapped>>::value>::type;
     const bool __is_full = __items_in_scan == __sub_group_size * __iters_per_item;
     oneapi::dpl::__internal::__lazy_ctor_storage<_InputType> __carry;
     oneapi::dpl::__internal::__scoped_destroyer<_InputType> __destroy_when_leaving_scope{__carry};
     if (__is_full)
     {
         oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true,
-                                                            /*__init_present*/ false>(__sub_group, __input[0].__v,
+                                                            /*__init_present*/ false>(__sub_group, __extract_scan_input(__input[0]),
                                                                                       __binary_op, __carry);
         _ONEDPL_PRAGMA_UNROLL
         for (std::uint16_t __i = 1; __i < __iters_per_item; ++__i)
         {
             oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true,
-                                                                /*__init_present*/ true>(__sub_group, __input[__i].__v,
+                                                                /*__init_present*/ true>(__sub_group, __extract_scan_input(__input[__i]),
                                                                                          __binary_op, __carry);
         }
     }
@@ -73,22 +109,24 @@ __sub_group_scan(const _SubGroup& __sub_group,
         {
             oneapi::dpl::__par_backend_hetero::__sub_group_scan_partial<__sub_group_size, /*__is_inclusive*/ true,
                                                                         /*__init_present*/ false>(
-                __sub_group, __input[__i].__v, __binary_op, __carry, __items_in_scan - __i * __sub_group_size);
+                __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry,
+                __items_in_scan - __i * __sub_group_size);
         }
         else if (__limited_iters_per_item > 1)
         {
             oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true,
                                                                 /*__init_present*/ false>(
-                __sub_group, __input[__i++].__v, __binary_op, __carry);
+                __sub_group, __extract_scan_input(__input[__i++]), __binary_op, __carry);
             for (; __i < __limited_iters_per_item - 1; ++__i)
             {
                 oneapi::dpl::__par_backend_hetero::__sub_group_scan<__sub_group_size, /*__is_inclusive*/ true,
                                                                     /*__init_present*/ true>(
-                    __sub_group, __input[__i].__v, __binary_op, __carry);
+                    __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry);
             }
             oneapi::dpl::__par_backend_hetero::__sub_group_scan_partial<__sub_group_size, /*__is_inclusive*/ true,
                                                                         /*__init_present*/ true>(
-                __sub_group, __input[__i].__v, __binary_op, __carry, __items_in_scan - __i * __sub_group_size);
+                __sub_group, __extract_scan_input(__input[__i]), __binary_op, __carry,
+                __items_in_scan - __i * __sub_group_size);
         }
     }
     return __carry.__v;
diff --git a/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h b/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h
index c45463f9cdd..4c051ca8183 100644
--- a/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h
+++ b/include/oneapi/dpl/experimental/kt/internal/sycl_radix_sort_kernels.h
@@ -17,6 +17,8 @@
 #include "../../../pstl/utils.h"
 #include "../../../pstl/hetero/dpcpp/utils_ranges_sycl.h"
 
+#include "sub_group/sub_group_scan.h"
+
 #include "radix_sort_utils.h"
 #include "esimd_radix_sort_utils.h"
 
@@ -137,6 +139,502 @@ __sycl_global_histogram(sycl::nd_item<1> __idx, std::size_t __n, const _KeysRng&
     }
 }
 
+template <typename _KtTag, bool __is_ascending, std::uint8_t __radix_bits, std::uint16_t __data_per_work_item,
+          std::uint16_t __work_group_size, typename _InRngPack, typename _OutRngPack>
+struct __radix_sort_onesweep_kernel;
+
+template <bool __is_ascending, std::uint8_t __radix_bits, std::uint16_t __data_per_work_item,
+          std::uint16_t __work_group_size, typename _InRngPack, typename _OutRngPack>
+struct __radix_sort_onesweep_kernel<__sycl_tag, __is_ascending, __radix_bits, __data_per_work_item, __work_group_size,
+                                    _InRngPack, _OutRngPack>
+{
+    using _LocOffsetT = std::uint16_t;
+    using _GlobOffsetT = std::uint32_t;
+    using _AtomicIdT = std::uint32_t;
+
+    using _KeyT = typename _InRngPack::_KeyT;
+    using _ValT = typename _InRngPack::_ValT;
+    static constexpr bool __has_values = !std::is_void_v<_ValT>;
+
+    static constexpr std::uint32_t __bin_count = 1 << __radix_bits;
+
+    static constexpr std::uint32_t __sub_group_size = 32;
+    static constexpr std::uint32_t __num_sub_groups_per_work_group = __work_group_size / __sub_group_size;
+    static constexpr std::uint32_t __data_per_sub_group = __data_per_work_item * __sub_group_size;
+
+    static constexpr std::uint32_t __bit_count = sizeof(_KeyT) * 8;
+    static constexpr _LocOffsetT __mask = __bin_count - 1;
+    static constexpr std::uint32_t __hist_stride = __bin_count * sizeof(_LocOffsetT);
+    static constexpr std::uint32_t __work_item_all_hists_size = __num_sub_groups_per_work_group * __hist_stride;
+    static constexpr std::uint32_t __group_hist_size = __hist_stride; // _LocOffsetT
+    static constexpr std::uint32_t __global_hist_size = __bin_count * sizeof(_GlobOffsetT);
+
+    static constexpr std::uint32_t
+    __calc_reorder_slm_size()
+    {
+        if constexpr (__has_values)
+            return __work_group_size * __data_per_work_item * (sizeof(_KeyT) + sizeof(_ValT));
+        else
+            return __work_group_size * __data_per_work_item * sizeof(_KeyT);
+    }
+
+    // Helper functions for SLM layout calculation
+    static constexpr std::uint32_t
+    __get_slm_subgroup_hists_offset()
+    {
+        return 0; // Sub-group histograms start at the beginning
+    }
+
+    static constexpr std::uint32_t
+    __get_slm_group_hist_offset()
+    {
+        constexpr std::uint32_t __reorder_size = __calc_reorder_slm_size();
+        return std::max(__work_item_all_hists_size, __reorder_size); // After max(sub-group hists, reorder space)
+    }
+
+    static constexpr std::uint32_t
+    __get_slm_global_incoming_offset()
+    {
+        return __get_slm_group_hist_offset() + __group_hist_size; // After group histogram
+    }
+
+    static constexpr std::uint32_t
+    __get_slm_global_fix_offset()
+    {
+        return __get_slm_global_incoming_offset() + __global_hist_size; // After global incoming histogram
+    }
+
+    static constexpr std::uint32_t
+    __calc_slm_alloc()
+    {
+        // SLM Layout Visualization:
+        //
+        // Phase 1 (Offset Calculation):
+        // ┌────────────────────────┬─────────────┬──────────────────┐
+        // │   Sub-group Hists      │ Group Hist  │ Global Incoming  │
+        // │ max(__work_item_all_   │ __group_    │ __global_hist    │
+        // │ hists_size,__reorder   |             |                  |
+        // |     _size              │ hist_size   │     _size        │
+        // └────────────────────────┴─────────────┴──────────────────┘
+        //                                 |               |
+        //                                 v               v
+        // Phase 2 (Reorder):
+        // ┌────────────────────────┬─────────────┬──────────────────┬─────────────────┐
+        // │   Reorder Space        │ Group Hist  │ Global Incoming  │  Global Fix     │
+        // │ max(__work_item_all_   │ __group_    │ __global_hist    │ __global_hist   │
+        // │ hists_size,__reorder   │ hist_size   │     _size        │     _size       │
+        // │     _size)             │             │                  │                 │
+        // └────────────────────────┴─────────────┴──────────────────┴─────────────────┘
+        //
+        constexpr std::uint32_t __reorder_size = __calc_reorder_slm_size();
+
+        // TODO: does starting alignment significantly matter for correctness and performance? If so we may need
+        // padding between regions
+        constexpr std::uint32_t __slm_size = std::max(__work_item_all_hists_size, __reorder_size) + __group_hist_size +
+                                             __global_hist_size + __global_hist_size;
+
+        return oneapi::dpl::__internal::__dpl_ceiling_div(__slm_size, 2048) * 2048;
+    }
+
+    const std::uint32_t __n;
+    const std::uint32_t __stage;
+    _GlobOffsetT* __p_global_hist;
+    _GlobOffsetT* __p_group_hists;
+    _AtomicIdT* __p_atomic_id;
+    _InRngPack __in_pack;
+    _OutRngPack __out_pack;
+    sycl::local_accessor<unsigned char, 1> __slm_accessor;
+
+    __radix_sort_onesweep_kernel(std::uint32_t __n, std::uint32_t __stage, _GlobOffsetT* __p_global_hist,
+                                 _GlobOffsetT* __p_group_hists, _AtomicIdT* __p_atomic_id, const _InRngPack& __in_pack,
+                                 const _OutRngPack& __out_pack, sycl::local_accessor<unsigned char, 1> __slm_acc)
+        : __n(__n), __stage(__stage), __p_global_hist(__p_global_hist), __p_group_hists(__p_group_hists),
+          __p_atomic_id(__p_atomic_id), __in_pack(__in_pack), __out_pack(__out_pack), __slm_accessor(__slm_acc)
+    {
+    }
+
+    template <typename _KVPack>
+    inline auto
+    __load_pack(_KVPack& __pack, std::uint32_t __wg_id, std::uint32_t __sg_id, std::uint32_t __sg_local_id) const
+    {
+        const _GlobOffsetT __offset = __data_per_sub_group * (__wg_id * __num_sub_groups_per_work_group + __sg_id);
+        auto __keys_seq = __rng_data(__in_pack.__keys_rng());
+        __load</*__sort_identity_residual=*/true>(__pack.__keys, __keys_seq, __offset, __sg_local_id);
+        if constexpr (__has_values)
+        {
+            __load</*__sort_identity_residual=*/false>(__pack.__vals, __rng_data(__in_pack.__vals_rng()), __offset,
+                                                       __sg_local_id);
+        }
+    }
+
+    template <bool __sort_identity_residual, typename _T, typename _InSeq>
+    inline void
+    __load(_T __elements[__data_per_work_item], const _InSeq& __in_seq, _GlobOffsetT __glob_offset,
+           std::uint32_t __local_offset) const
+    {
+        bool __is_full_block = (__glob_offset + __data_per_sub_group) <= __n;
+        auto __offset = __glob_offset + __local_offset;
+        if (__is_full_block)
+        {
+            _ONEDPL_PRAGMA_UNROLL
+            for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+            {
+                __elements[__i] = __in_seq[__offset + __i * __sub_group_size];
+            }
+        }
+        else
+        {
+            _ONEDPL_PRAGMA_UNROLL
+            for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+            {
+                auto __idx = __offset + __i * __sub_group_size;
+                if constexpr (__sort_identity_residual)
+                {
+                    __elements[__i] = (__idx < __n) ? __in_seq[__idx] : __sort_identity<_T, __is_ascending>();
+                }
+                else
+                {
+                    if (__idx < __n)
+                    {
+                        __elements[__i] = __in_seq[__idx];
+                    }
+                }
+            }
+        }
+    }
+
+    static inline std::uint32_t
+    __match_bins(sycl::nd_item<1> __idx, std::uint32_t __bin)
+    {
+        // start with all bits 1
+        auto __matched_bins = sycl::ext::oneapi::group_ballot(__idx.get_sub_group());
+        _ONEDPL_PRAGMA_UNROLL
+        for (int __i = 0; __i < __radix_bits; __i++)
+        {
+            auto __bit = (__bin >> __i) & 1;
+            auto __sg_vote = sycl::ext::oneapi::group_ballot(__idx.get_sub_group(), static_cast<bool>(__bit));
+            // If we vote yes, then we want to set all bits that also voted yes. If no, then we want to
+            // zero out the bits that said yes as they don't match and preserve others as we have no info on these.
+            __matched_bins &= __bit ? __sg_vote : ~__sg_vote;
+        }
+        std::uint32_t __result = 0;
+        __matched_bins.extract_bits(__result);
+        return __result;
+    }
+
+    inline auto
+    __rank_local(sycl::nd_item<1> __idx, _LocOffsetT __ranks[__data_per_work_item],
+                 _LocOffsetT __bins[__data_per_work_item], _LocOffsetT* __slm_subgroup_hists,
+                 std::uint32_t __sub_group_slm_offset) const
+    {
+        std::uint32_t __sub_group_local_id = __idx.get_sub_group().get_local_id();
+        _LocOffsetT* __slm_offset = __slm_subgroup_hists + __sub_group_slm_offset;
+
+        for (std::uint32_t __i = __idx.get_sub_group().get_local_id(); __i < __bin_count; __i += __sub_group_size)
+        {
+            __slm_offset[__i] = 0;
+        }
+        // TODO: sub-group barrier ? maybe not for simd architectures
+
+        constexpr std::uint32_t __sub_group_full_bitmask = 0x7fffffff;
+        static_assert(__sub_group_size == 32);
+        // lower bits than my current will be set meaning we only preserve left lanes
+        std::uint32_t __remove_right_lanes = __sub_group_full_bitmask >> (__sub_group_size - 1 - __sub_group_local_id);
+
+        _ONEDPL_PRAGMA_UNROLL
+        for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+        {
+            _LocOffsetT __bin = __bins[__i];
+            std::uint32_t __matched_bins = __match_bins(__idx, __bin);
+            std::uint32_t __pre_rank = __slm_offset[__bin];
+            auto __matched_left_lanes = __matched_bins & __remove_right_lanes;
+            std::uint32_t __this_round_rank = sycl::popcount(__matched_left_lanes);
+            std::uint32_t __this_round_count = sycl::popcount(__matched_bins);
+            auto __rank_after = __pre_rank + __this_round_rank;
+            auto __is_leader = __this_round_rank == __this_round_count - 1;
+            if (__is_leader)
+            {
+                __slm_offset[__bin] = __rank_after + 1;
+            }
+            __ranks[__i] = __rank_after;
+        }
+        sycl::group_barrier(__idx.get_group());
+    }
+
+    inline void
+    __rank_global(sycl::nd_item<1> __idx, std::uint32_t __wg_id, _LocOffsetT* __slm_subgroup_hists,
+                  _LocOffsetT* __slm_group_hist, _GlobOffsetT* __slm_global_incoming) const
+    {
+        // TODO: This exists in the ESIMD KT and was ported but are we not limiting max input size to
+        // 2^30 ~ 1 billion elements? We use 32-bit indexing / histogram which may already be too small
+        // but are then reserving the two upper bits for lookback flags.
+        constexpr std::uint32_t __global_accumulated = 0x40000000;
+        constexpr std::uint32_t __hist_updated = 0x80000000;
+        constexpr std::uint32_t __global_offset_mask = 0x3fffffff;
+
+        _GlobOffsetT* __p_this_group_hist = __p_group_hists + __bin_count * __wg_id;
+        _GlobOffsetT* __p_prev_group_hist = __p_this_group_hist - __bin_count;
+
+        // This is important so that we can evenly partition the radix bits across a number of sub-groups
+        // without masking lanes. Radix bits is always a power of two, so this requirement essentially just
+        // requires radix_bits >= 5 for sub-group size of 32.
+        static_assert(__bin_count % __sub_group_size == 0);
+
+        constexpr std::uint32_t __bin_summary_sub_group_size = __bin_count / __sub_group_size;
+        constexpr std::uint32_t __bin_width = __sub_group_size;
+
+        auto __sub_group_id = __idx.get_sub_group().get_group_linear_id();
+        auto __sub_group_local_id = __idx.get_sub_group().get_local_linear_id();
+
+        // 1. Vector scan of histograms previously accumulated by each work-item
+        // update slm instead of grf summary due to perf issues with grf histogram
+
+        // TODO: this single element array is a temporary workaround for sub group scan requiring an array
+        _LocOffsetT __item_grf_hist_summary_arr[1] = {0};
+        _LocOffsetT& __item_grf_hist_summary = __item_grf_hist_summary_arr[0];
+        _LocOffsetT __item_bin_count;
+        if (__sub_group_id < __bin_summary_sub_group_size)
+        {
+            // 1.1. Vector scan of the same bins across different histograms.
+            std::uint32_t __slm_bin_hist_summary_offset = __sub_group_id * __bin_width;
+
+            for (std::uint32_t __s = 0; __s < __num_sub_groups_per_work_group;
+                 __s++, __slm_bin_hist_summary_offset += __bin_count)
+            {
+                auto __slm_idx = __slm_bin_hist_summary_offset + __sub_group_local_id;
+                __item_grf_hist_summary += __slm_subgroup_hists[__slm_idx];
+                __slm_subgroup_hists[__slm_idx] = __item_grf_hist_summary;
+            }
+            __item_bin_count = __item_grf_hist_summary;
+
+            // 1.2. Vector scan of different bins inside one histogram, the final one for the whole work-group.
+            // Only "__bin_width" pieces of the histogram are scanned at this stage.
+            // This histogram will be further used for calculation of offsets of keys already reordered in SLM,
+            // it does not participate in sycnhronization between work-groups.
+            __sub_group_scan<__sub_group_size, 1>(__idx.get_sub_group(), __item_grf_hist_summary_arr, std::plus<>{},
+                                                  __bin_width);
+
+            auto __write_idx = __sub_group_id * __bin_width + __sub_group_local_id;
+            __slm_group_hist[__write_idx] = __item_grf_hist_summary;
+
+            // 1.3. Copy the histogram at the region designated for synchronization between work-groups.
+            // Write the histogram to global memory, bypassing caches, to ensure cross-work-group visibility.
+            if (__wg_id != 0)
+            {
+                // Copy the histogram, local to this WG
+                using _GlobalAtomicT = sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device,
+                                                         sycl::access::address_space::global_space>;
+                _GlobalAtomicT __ref(__p_this_group_hist[__sub_group_id * __bin_width + __sub_group_local_id]);
+                __ref.store(__item_bin_count | __hist_updated);
+            }
+            else
+            {
+                // WG0 is a special case: it also retrieves the total global histogram and adds it to its local histogram
+                // This global histogram will be propagated to other work-groups through a chained scan at stage 2
+                using _GlobalAtomicT =
+                    sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device,
+                                     sycl::access::address_space::global_space>;
+
+                auto __hist_idx = __sub_group_id * __bin_width + __sub_group_local_id;
+                _GlobOffsetT __global_hist = __p_global_hist[__hist_idx] & __global_offset_mask;
+                _GlobOffsetT __after_group_hist_sum = __global_hist + __item_bin_count;
+                _GlobalAtomicT __ref(__p_this_group_hist[__hist_idx]);
+                __ref.store(__after_group_hist_sum | __hist_updated | __global_accumulated);
+                // Copy the global histogram to local memory to share with other work-items
+                __slm_global_incoming[__hist_idx] = __global_hist;
+            }
+        }
+        sycl::group_barrier(__idx.get_group());
+
+        auto __sub_group = __idx.get_sub_group();
+        auto __sub_group_group_id = __sub_group.get_group_linear_id();
+
+        // 1.4 One work-item finalizes scan performed at stage 1.2
+        // by propagating prefixes accumulated after scanning individual "__bin_width" pieces.
+        if (__sub_group_group_id == __bin_summary_sub_group_size + 1)
+        {
+            __sub_group_cross_segment_exclusive_scan<__bin_width, __bin_summary_sub_group_size, __sub_group_size>(
+                __sub_group, __slm_group_hist);
+        }
+
+        sycl::group_barrier(__idx.get_group());
+
+        // 2. Chained scan. Synchronization between work-groups.
+        if (__sub_group_group_id < __bin_summary_sub_group_size && __wg_id != 0)
+        {
+            using _GlobalAtomicT =
+                sycl::atomic_ref<_GlobOffsetT, sycl::memory_order::relaxed, sycl::memory_scope::device,
+                                 sycl::access::address_space::global_space>;
+            // 2.1. Read the histograms scanned across work-groups
+            _GlobOffsetT __prev_group_hist_sum = 0;
+            _GlobOffsetT __prev_group_hist;
+            bool __is_not_accumulated = true;
+            do
+            {
+                auto __idx = __sub_group_group_id * __bin_width + __sub_group_local_id;
+                _GlobalAtomicT __ref(__p_prev_group_hist[__idx]);
+                do
+                {
+                    __prev_group_hist = __ref.load();
+                } while ((__prev_group_hist & __hist_updated) == 0);
+                __prev_group_hist_sum += __is_not_accumulated ? __prev_group_hist : 0;
+                __is_not_accumulated = (__prev_group_hist_sum & __global_accumulated) == 0;
+                __p_prev_group_hist -= __bin_count;
+            } while (sycl::any_of_group(__sub_group, __is_not_accumulated));
+
+            _GlobOffsetT __after_group_hist_sum = 0;
+            __prev_group_hist_sum &= __global_offset_mask;
+            __after_group_hist_sum = __prev_group_hist_sum + __item_bin_count;
+            auto __idx = __sub_group_group_id * __bin_width + __sub_group_local_id;
+            // 2.2. Write the histogram scanned across work-group, updated with the current work-group data
+            _GlobalAtomicT __ref(__p_this_group_hist[__idx]);
+            __ref.store(__after_group_hist_sum | __hist_updated | __global_accumulated);
+            // 2.3. Save the scanned histogram from previous work-groups locally
+            __slm_global_incoming[__idx] = __prev_group_hist_sum;
+        }
+        sycl::group_barrier(__idx.get_group());
+    }
+
+    template <typename _KVPack>
+    void inline __reorder_reg_to_slm(sycl::nd_item<1> __idx, const _KVPack& __pack,
+                                     _LocOffsetT (&__ranks)[__data_per_work_item],
+                                     const _LocOffsetT (&__bins)[__data_per_work_item],
+                                     _LocOffsetT* __slm_subgroup_hists, _LocOffsetT* __slm_group_hist,
+                                     _GlobOffsetT* __slm_global_incoming, _GlobOffsetT* __slm_global_fix,
+                                     _KeyT* __slm_keys, _ValT* __slm_vals) const
+    {
+        auto __sub_group_id = __idx.get_sub_group().get_group_linear_id();
+        const auto __wg_size = __idx.get_local_range(0);
+
+        // 1. update ranks to reflect sub-group offsets in and across bins
+        _ONEDPL_PRAGMA_UNROLL
+        for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+        {
+            auto __bin = __bins[__i];
+            auto __group_incoming = __slm_group_hist[__bin];
+            auto __offset_in_bin =
+                (__sub_group_id == 0) ? 0 : __slm_subgroup_hists[(__sub_group_id - 1) * __bin_count + __bin];
+            auto __offset_across_bins = __group_incoming;
+            __ranks[__i] += __offset_in_bin + __offset_across_bins;
+        }
+
+        // 2. compute __global_fix
+        for (std::uint32_t __i = __idx.get_local_id(); __i < __bin_count; __i += __wg_size)
+        {
+            __slm_global_fix[__i] = __slm_global_incoming[__i] - static_cast<_GlobOffsetT>(__slm_group_hist[__i]);
+        }
+        sycl::group_barrier(__idx.get_group());
+
+        // 3. Write keys (and values) to SLM at computed ranks
+        _ONEDPL_PRAGMA_UNROLL
+        for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i) 
+        {
+            __slm_keys[__ranks[__i]] = __pack.__keys[__i];
+            if constexpr (__has_values)
+            {
+                __slm_vals[__ranks[__i]] = __pack.__vals[__i];
+            }
+        }
+        sycl::group_barrier(__idx.get_group());
+    }
+
+    template <typename _KVPack>
+    void inline __reorder_slm_to_glob(sycl::nd_item<1> __idx, _KVPack& __pack, _GlobOffsetT* __slm_global_fix,
+                                      _KeyT* __slm_keys, _ValT* __slm_vals) const
+    {
+        auto __sub_group_id = __idx.get_sub_group().get_group_linear_id();
+        auto __sub_group_local_id = __idx.get_sub_group().get_local_linear_id();
+
+        const _GlobOffsetT __keys_slm_offset = __data_per_sub_group * __sub_group_id;
+
+        _ONEDPL_PRAGMA_UNROLL
+        for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+        {
+            auto __slm_idx = __keys_slm_offset + __i * __sub_group_size + __sub_group_local_id;
+            auto __key = __slm_keys[__slm_idx];
+            auto __bin = __get_bucket_scalar<__mask>(__order_preserving_cast_scalar<__is_ascending>(__key),
+                                                     __stage * __radix_bits);
+            auto __global_fix = __slm_global_fix[__bin];
+            auto __out_idx = __global_fix + __slm_idx;
+
+            // TODO: we need to figure out how to relax this bounds checking for full unrolling
+            bool __output_mask = __out_idx < __n;
+            if (__output_mask)
+                __out_pack.__keys_rng()[__out_idx] = __key;
+            if constexpr (__has_values)
+            {
+                auto __val = __slm_vals[__slm_idx];
+                if (__output_mask)
+                    __out_pack.__vals_rng()[__out_idx] = __val;
+            }
+        }
+    }
+
+    [[sycl::reqd_sub_group_size(__sub_group_size)]] void
+    operator()(sycl::nd_item<1> __idx) const
+    {
+        const std::uint32_t __local_tid = __idx.get_local_linear_id();
+        const std::uint32_t __wg_size = __idx.get_local_range(0);
+        const std::uint32_t __sg_id = __idx.get_sub_group().get_group_linear_id();
+        const std::uint32_t __sg_local_id = __idx.get_sub_group().get_local_id();
+
+        const std::uint32_t __num_wgs = __idx.get_group_range(0);
+        using _AtomicRefT = sycl::atomic_ref<_AtomicIdT, sycl::memory_order::relaxed, sycl::memory_scope::device,
+                                             sycl::access::address_space::global_space>;
+        _AtomicRefT __atomic_id_ref(*__p_atomic_id);
+        std::uint32_t __wg_id = 0;
+        if (__idx.get_local_linear_id() == 0)
+        {
+            // Modulo num work-groups because onesweep gets invoked multiple times and we do not want an extra memset between
+            // invocations.
+            __wg_id = __atomic_id_ref.fetch_add(1) % __num_wgs;
+        }
+        __wg_id = sycl::group_broadcast(__idx.get_group(), __wg_id);
+
+        const std::uint32_t __sub_group_slm_offset = __sg_id * __bin_count;
+
+        auto __values_pack = __make_key_value_pack<__data_per_work_item, _KeyT, _ValT>();
+        _LocOffsetT __bins[__data_per_work_item];
+        _LocOffsetT __ranks[__data_per_work_item];
+
+        __load_pack(__values_pack, __wg_id, __sg_id, __sg_local_id);
+
+        _ONEDPL_PRAGMA_UNROLL
+        for (std::uint32_t __i = 0; __i < __data_per_work_item; ++__i)
+        {
+            const auto __ordered = __order_preserving_cast_scalar<__is_ascending>(__values_pack.__keys[__i]);
+            __bins[__i] = __get_bucket_scalar<__mask>(__ordered, __stage * __radix_bits);
+        }
+
+        // Get raw SLM pointer and create typed pointers for different regions using helper functions
+        unsigned char* __slm_raw = __slm_accessor.get_multi_ptr<sycl::access::decorated::no>().get();
+        _LocOffsetT* __slm_subgroup_hists =
+            reinterpret_cast<_LocOffsetT*>(__slm_raw + __get_slm_subgroup_hists_offset());
+        _LocOffsetT* __slm_group_hist = reinterpret_cast<_LocOffsetT*>(__slm_raw + __get_slm_group_hist_offset());
+        _GlobOffsetT* __slm_global_incoming =
+            reinterpret_cast<_GlobOffsetT*>(__slm_raw + __get_slm_global_incoming_offset());
+
+        __rank_local(__idx, __ranks, __bins, __slm_subgroup_hists, __sub_group_slm_offset);
+        __rank_global(__idx, __wg_id, __slm_subgroup_hists, __slm_group_hist, __slm_global_incoming);
+
+        // For reorder phase, reinterpret the sub-group histogram space as key/value storage
+        // The reorder space overlaps with the sub-group histogram region (reinterpret_cast)
+        _KeyT* __slm_keys = reinterpret_cast<_KeyT*>(__slm_raw + __get_slm_subgroup_hists_offset());
+        _ValT* __slm_vals = nullptr;
+        if constexpr (__has_values)
+        {
+            __slm_vals = reinterpret_cast<_ValT*>(__slm_raw + __get_slm_subgroup_hists_offset() +
+                                                  __wg_size * __data_per_work_item * sizeof(_KeyT));
+        }
+        _GlobOffsetT* __slm_global_fix = reinterpret_cast<_GlobOffsetT*>(__slm_raw + __get_slm_global_fix_offset());
+
+        __reorder_reg_to_slm(__idx, __values_pack, __ranks, __bins, __slm_subgroup_hists, __slm_group_hist,
+                             __slm_global_incoming, __slm_global_fix, __slm_keys, __slm_vals);
+
+        __reorder_slm_to_glob(__idx, __values_pack, __slm_global_fix, __slm_keys, __slm_vals);
+    }
+};
+
 } // namespace oneapi::dpl::experimental::kt::gpu::__impl
 
 #endif // _ONEDPL_KT_SYCL_RADIX_SORT_KERNELS_H
diff --git a/test/kt/CMakeLists.txt b/test/kt/CMakeLists.txt
index 392be9ff5fb..a8d601e4995 100644
--- a/test/kt/CMakeLists.txt
+++ b/test/kt/CMakeLists.txt
@@ -101,8 +101,16 @@ endfunction()
 function(_generate_sort_tests _variant _key_value_pairs)
     set(_base_file_all "radix_sort" "radix_sort_out_of_place")
     set(_base_file_by_key_all "radix_sort_by_key" "radix_sort_by_key_out_of_place")
-    set(_data_per_work_item_all "32" "64" "96" "128" "160" "192" "224" "256" "288" "320" "352" "384" "416" "448" "480" "512")
-    set(_work_group_size_all "64")
+    
+    # Variant-specific configurations
+    if (${_variant} STREQUAL "sycl")
+        set(_data_per_work_item_all "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" "16")
+        set(_work_group_size_all "1024")
+    else() # esimd
+        set(_data_per_work_item_all "32" "64" "96" "128" "160" "192" "224" "256" "288" "320" "352" "384" "416" "448" "480" "512")
+        set(_work_group_size_all "64")
+    endif()
+    
     set(_type_all "char" "uint16_t" "int" "uint64_t" "float" "double")
 
     foreach (_data_per_work_item ${_data_per_work_item_all})
@@ -140,8 +148,8 @@ if (ONEDPL_TEST_ENABLE_KT_SYCL)
     _generate_sort_tests("sycl" TRUE) # radix_sort_by_key, random
 
     # Pin some cases to track them
-    _generate_sort_test("sycl" "radix_sort_by_key_out_of_place" "96" "64" "uint32_t" "uint32_t" 1000)
-    _generate_sort_test("sycl" "radix_sort" "384" "64" "int32_t" "" 1000)
+    _generate_sort_test("sycl" "radix_sort_by_key_out_of_place" "3" "1024" "uint32_t" "uint32_t" 1000)
+    _generate_sort_test("sycl" "radix_sort" "12" "1024" "int32_t" "" 1000)
 endif()
 
 function (_generate_gpu_scan_test _data_per_work_item _work_group_size _type _probability_permille)
diff --git a/test/kt/sycl_radix_sort.cpp b/test/kt/sycl_radix_sort.cpp
index 3dc31a307d2..056879b0349 100755
--- a/test/kt/sycl_radix_sort.cpp
+++ b/test/kt/sycl_radix_sort.cpp
@@ -123,18 +123,9 @@ test_sycl_iterators(sycl::queue q, std::size_t size, KernelParam param)
     std::stable_sort(std::begin(ref), std::end(ref), Compare<T, IsAscending>{});
     {
         sycl::buffer<T> buf(input.data(), input.size());
-
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
-        // Deprecated namespace is used deliberatelly to make sure the functionality is still available
-        oneapi::dpl::experimental::kt::esimd::radix_sort<IsAscending>(q, oneapi::dpl::begin(buf), oneapi::dpl::end(buf),
-                                                                      param)
+        oneapi::dpl::experimental::kt::gpu::radix_sort<IsAscending, RadixBits>(q, oneapi::dpl::begin(buf),
+                                                                               oneapi::dpl::end(buf), param)
             .wait();
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
     }
 
     std::string msg = "wrong results with oneapi::dpl::begin/end, n: " + std::to_string(size);
diff --git a/test/kt/sycl_radix_sort_by_key.cpp b/test/kt/sycl_radix_sort_by_key.cpp
index dff02e90a73..fa8873523a5 100755
--- a/test/kt/sycl_radix_sort_by_key.cpp
+++ b/test/kt/sycl_radix_sort_by_key.cpp
@@ -40,7 +40,7 @@ void test_sycl_buffer(sycl::queue q, std::size_t size, KernelParam param)
     {
         sycl::buffer<KeyT> keys(actual_keys.data(), actual_keys.size());
         sycl::buffer<ValueT> values(actual_values.data(), actual_values.size());
-        oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key<isAscending, RadixBits>(q, keys, values, param).wait();
+        oneapi::dpl::experimental::kt::gpu::radix_sort_by_key<isAscending, RadixBits>(q, keys, values, param).wait();
     }
 
     auto expected_first  = oneapi::dpl::make_zip_iterator(std::begin(expected_keys), std::begin(expected_values));
@@ -71,8 +71,9 @@ void test_usm(sycl::queue q, std::size_t size, KernelParam param)
     auto expected_first  = oneapi::dpl::make_zip_iterator(std::begin(expected_keys), std::begin(expected_values));
     std::stable_sort(expected_first, expected_first + size, CompareKey<isAscending>{});
 
-    oneapi::dpl::experimental::kt::gpu::esimd::radix_sort_by_key<isAscending, RadixBits>(
-        q, keys.get_data(), keys.get_data() + size, values.get_data(), param).wait();
+    oneapi::dpl::experimental::kt::gpu::radix_sort_by_key<isAscending, RadixBits>(
+        q, keys.get_data(), keys.get_data() + size, values.get_data(), param)
+        .wait();
 
     std::vector<KeyT> actual_keys(size);
     std::vector<ValueT> actual_values(size);