NVIDIA · cliffburdick · Dec 19, 2025 · Dec 18, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/docs_input/api/logic/truth/allclose.rst b/docs_input/api/logic/truth/allclose.rst
@@ -8,7 +8,7 @@ from allclose is an ``int`` value since boolean reductions are not available in
 
 
 .. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const HostExecutor<MODE> &exec)
-.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
+.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const cudaExecutor &exec)
 
 Examples
 ~~~~~~~~

diff --git a/include/matx/transforms/chol/chol_cuda.h b/include/matx/transforms/chol/chol_cuda.h
@@ -58,7 +58,7 @@ struct DnCholCUDAParams_t {
   size_t batch_size;
   cublasFillMode_t uplo;
   MatXDataType_t dtype;
-  cudaExecutor exec;
+  cudaStream_t stream;
 };
 
 template <typename OutputTensor, typename ATensor>
@@ -128,7 +128,7 @@ class matxDnCholCUDAPlan_t : matxDnCUDASolver_t {
     params.n = a.Size(RANK - 1);
     params.A = a.Data();
     params.uplo = uplo;
-    params.exec = exec;    
+    params.stream = exec.getStream();    
     params.dtype = TypeToInt<T1>();
 
     return params;
@@ -208,7 +208,7 @@ struct DnCholCUDAParamsKeyHash {
   {
     return  (std::hash<uint64_t>()(k.n)) + 
             (std::hash<uint64_t>()(k.batch_size)) + 
-            (std::hash<uint64_t>()((uint64_t)(k.exec.getStream())));
+            (std::hash<uint64_t>()((uint64_t)(k.stream)));
   }
 };
 
@@ -223,7 +223,7 @@ struct DnCholCUDAParamsKeyEq {
     return  l.n == t.n && 
             l.batch_size == t.batch_size && 
             l.dtype == t.dtype &&
-            l.exec.getStream() == t.exec.getStream();
+            l.stream == t.stream;
   }
 };
 

diff --git a/include/matx/transforms/cub.h b/include/matx/transforms/cub.h
@@ -1523,7 +1523,7 @@ using cub_cache_t = std::unordered_map<CubParams_t, std::any, CubParamsKeyHash,
 template <typename OutputTensor, typename InputOperator>
 void sort_impl_inner(OutputTensor &a_out, const InputOperator &a,
           const SortDirection_t dir,
-          cudaExecutor exec = 0)
+          const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -1569,7 +1569,7 @@ template <typename OutputIndexTensor, typename InputIndexTensor, typename Output
 void sort_pairs_impl_inner(OutputIndexTensor &idx_out, const InputIndexTensor &idx_in,
           OutputKeyTensor &a_out, const InputKeyTensor &a_in,
           const SortDirection_t dir,
-          [[maybe_unused]] cudaExecutor exec = 0)
+          const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2074,7 +2074,7 @@ void cub_dualargreduce(OutputTensor &a1_out,
 template <typename OutputTensor, typename InputOperator>
 void sort_impl(OutputTensor &a_out, const InputOperator &a,
           const SortDirection_t dir,
-          cudaExecutor exec = 0)
+          const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2139,7 +2139,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
 template <typename OutputTensor, typename InputOperator>
 void argsort_impl(OutputTensor &idx_out, const InputOperator &a,
           const SortDirection_t dir,
-          cudaExecutor exec = 0)
+          const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2294,7 +2294,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
  */
 template <typename OutputTensor, typename InputOperator>
 void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
-            cudaExecutor exec = 0)
+            const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
@@ -2531,7 +2531,7 @@ struct GTE
  *   CUDA executor or stream
  */
 template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
-void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, cudaExecutor exec = 0)
+void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
@@ -2652,7 +2652,7 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
  *   CUDA executor stream
  */
 template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
-void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, cudaExecutor exec = 0)
+void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
@@ -2767,7 +2767,7 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
  *   CUDA executor
  */
 template <typename CountTensor, typename OutputTensor, typename InputOperator>
-void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a,  cudaExecutor exec = 0)
+void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a,  const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");

diff --git a/include/matx/transforms/reduce.h b/include/matx/transforms/reduce.h
@@ -263,7 +263,7 @@ void __MATX_INLINE__ reduce(OutType dest, const InType &in, ReduceOp op,
  */
 template <typename OutType, typename InType>
 void __MATX_INLINE__ mean_impl(OutType dest, const InType &in,
-                 cudaExecutor exec = 0)
+                 const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("mean_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -468,7 +468,7 @@ void __MATX_INLINE__ softmax_impl(OutType dest, const InType &in, PermDims dims,
  */
 template <typename OutType, typename InType>
 void __MATX_INLINE__ median_impl(OutType dest,
-                   const InType &in, cudaExecutor exec = 0)
+                   const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   if constexpr ( OutType::Rank() <= 1 && InType::Rank() <=2 ) {
@@ -640,7 +640,7 @@ void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]
  *   CUDA executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("sum_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -706,7 +706,7 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] c
  *   CUDA executor
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("prod_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -778,7 +778,7 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]]
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ max_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ max_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("max_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -848,7 +848,7 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, [[maybe_unused]] c
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename TensorIndexType, typename InType>
-void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("argmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -926,7 +926,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ min_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ min_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("min_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -995,7 +995,7 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, [[maybe_unused]] c
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename TensorIndexType, typename InType>
-void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, const cudaExecutor &exec)
 {
   static_assert(OutType::Rank() == TensorIndexType::Rank());
 #ifdef __CUDACC__
@@ -1082,7 +1082,7 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename TensorIndexType, typename InType>
-void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, const cudaExecutor &exec)
 {
   static_assert(OutType::Rank() == TensorIndexType::Rank());
 #ifdef __CUDACC__
@@ -1162,7 +1162,7 @@ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin,
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ any_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ any_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("any_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -1235,7 +1235,7 @@ void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] c
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename InType>
-void __MATX_INLINE__ all_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
+void __MATX_INLINE__ all_impl(OutType dest, const InType &in, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("all_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
@@ -1313,7 +1313,7 @@ void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] c
  *   CUDA executor or stream ID
  */
 template <typename OutType, typename InType1, typename InType2>
-void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
+void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const cudaExecutor &exec)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("allclose(" + get_type_str(in1) + ", " + get_type_str(in2) + ")", matx::MATX_NVTX_LOG_API)

diff --git a/include/matx/transforms/transpose.h b/include/matx/transforms/transpose.h
@@ -60,7 +60,7 @@ namespace matx
    */
   template <typename OutputTensor, typename InputTensor>
     __MATX_INLINE__ void transpose_matrix_impl([[maybe_unused]] OutputTensor &out,
-        const InputTensor &in, cudaExecutor exec)
+        const InputTensor &in, const cudaExecutor &exec)
     {
       MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)