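Fixed const issues seen in user's code (#1044)

Summary of the changes in this patch:
- tensor_utils.h: get_matx_value is split into const and non-const overloads, selected on whether the operand type is const-qualified.
- binary_operators.h: operator() now returns `auto` instead of `decltype(auto)` and reads both operands through const references.
- concat.h: GetVal is renamed to get_impl; the const overload converts each result to value_type (or Vector<value_type, EPT>), while the non-const overload keeps `decltype(auto)`.
- set.h: the value fetched in _internal_mapply is held in a `const auto` local.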

cliffburdick · web-flow · commit 1df71375a6b6 · 2025-08-25T20:02:46.000-07:00
diff --git a/include/matx/core/tensor_utils.h b/include/matx/core/tensor_utils.h
@@ -247,21 +247,37 @@ namespace matx
      * @param indices indices
      * @return Value after broadcasting
      */
-    template <ElementsPerThread EPT, typename T, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>
+    // Const-qualified operand: index through a const reference so the operand's const operator() overload is selected
+    template <ElementsPerThread EPT, typename T, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...> && std::is_const_v<std::remove_reference_t<T>>, bool> = true>
+    __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_matx_value(T &&i, Is... indices)
+    {
+      using OpT = remove_cvref_t<T>;
+      constexpr int RANK = OpT::Rank();
+      const OpT &ci = i;
+      if constexpr (RANK == int(sizeof...(Is)) || RANK == matxNoRank) {
+        return ci.template operator()<EPT>(indices...);
+      }
+      else
+      {
+        using seq = offset_sequence_t<sizeof...(Is) - RANK, std::make_index_sequence<RANK>>;
+        auto tup = cuda::std::make_tuple(indices...);
+        auto sliced_tup = select_tuple(std::forward<decltype(tup)>(tup), seq{});
+        return cuda::std::apply([&](auto... args) {
+          return ci.template operator()<EPT>(args...);
+        }, sliced_tup);
+      }
+    }
+
+    // Non-const operand: forward it unchanged, preserving the original behavior (operator() may return a reference)
+    template <ElementsPerThread EPT, typename T, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...> && !std::is_const_v<std::remove_reference_t<T>>, bool> = true>
     __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_matx_value(T &&i, Is... indices)
     {
       constexpr int RANK = remove_cvref_t<T>::Rank();
       if constexpr (RANK == int(sizeof...(Is)) || RANK == matxNoRank) {
-        // If we're only indexing with the same number of arguments as the rank of the operator, just return operator()
         return cuda::std::forward<T>(i).template operator()<EPT>(indices...);
       }
       else
       {
-        // Otherwise we need to broadcast by constructing a large set of indices
-        // Construct an integer sequence of the length of the tuple, but only using the last indices. We construct an offset sequence
-        // to index into the broadcasted dimensions. For example, if T is a 3D tensor and we want to index as a 5D, we take the indices
-        // {0, 1, 2} we'd normally index with, and add the difference in rank (2), to get {2, 3, 4}. Another way to think of this is it
-        // simply chops off the first sizeof...(Is) - RANK indices since they're not used for operator().
         using seq = offset_sequence_t<sizeof...(Is) - RANK, std::make_index_sequence<RANK>>;
         auto tup = cuda::std::make_tuple(indices...);
         auto sliced_tup = select_tuple(std::forward<decltype(tup)>(tup), seq{});
@@ -271,25 +287,41 @@ namespace matx
       }
     }
 
-    template <ElementsPerThread EPT, typename T, typename IdxType, size_t N>
+    template <ElementsPerThread EPT, typename T, typename IdxType, size_t N, std::enable_if_t<std::is_const_v<std::remove_reference_t<T>>, bool> = true>
+    __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_matx_value(T &&i, const cuda::std::array<IdxType, N> idx)
+    {
+      using OpT = remove_cvref_t<T>;
+      constexpr int RANK = OpT::Rank();
+      const OpT &ci = i;
+      if constexpr (RANK == N || RANK == matxNoRank) {
+        return cuda::std::apply([&ci](auto... args) -> decltype(auto) {
+          return ci.template operator()<EPT>(args...);
+        }, idx);
+      } else {
+        cuda::std::array<index_t, RANK> nbc_idx;
+        cuda::std::copy(idx.begin() + (N - RANK), idx.end(), nbc_idx.begin());
+        return cuda::std::apply([&ci](auto... args) -> decltype(auto) {
+          return ci.template operator()<EPT>(args...);
+        }, nbc_idx);
+      }
+    }
+
+    template <ElementsPerThread EPT, typename T, typename IdxType, size_t N, std::enable_if_t<!std::is_const_v<std::remove_reference_t<T>>, bool> = true>
     __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_matx_value(T &&i, const cuda::std::array<IdxType, N> idx)
     {
       constexpr int RANK = remove_cvref_t<T>::Rank();
       if constexpr (RANK == N || RANK == matxNoRank) {
-        // If we're only indexing with the same number of arguments as the rank of the operator, just return operator()
         return cuda::std::apply([&i](auto... args) -> decltype(auto) {
           return cuda::std::forward<T>(i).template operator()<EPT>(args...);
-        }, idx);        
-      }
-      else
-      {
-        cuda::std::array<index_t, RANK> nbc_idx; // non-broadcast indices
+        }, idx);
+      } else {
+        cuda::std::array<index_t, RANK> nbc_idx;
         cuda::std::copy(idx.begin() + (N - RANK), idx.end(), nbc_idx.begin());
         return cuda::std::apply([&i](auto... args) -> decltype(auto) {
           return cuda::std::forward<T>(i).template operator()<EPT>(args...);
         }, nbc_idx);
       }
-    }    
+    }
 
 
     template <ElementsPerThread EPT, typename T, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>
@@ -317,7 +349,8 @@ namespace matx
       {
         return i;
       }
-    }    
+    }
+    
 
     template <typename T> __MATX_INLINE__ std::string to_short_str() {
       if constexpr (!is_complex_v<T>) {
diff --git a/include/matx/operators/binary_operators.h b/include/matx/operators/binary_operators.h
@@ -118,21 +118,24 @@ namespace matx
       }
 
       template <detail::ElementsPerThread EPT, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>
-      __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ decltype(auto) operator()(Is... indices) const
+      __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(Is... indices) const
       {
-        auto i1 = get_value<EPT>(in1_, indices...);
-        auto i2 = get_value<EPT>(in2_, indices...);
+        // Bind operands as const to ensure RHS value-return semantics for composite ops
+        const auto &lhs = in1_;
+        const auto &rhs = in2_;
+        const auto i1 = get_value<EPT>(lhs, indices...);
+        const auto i2 = get_value<EPT>(rhs, indices...);
         return op_.template operator()<EPT>(i1, i2);
       }
 
       template <typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>
-      __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ decltype(auto) operator()(Is... indices) const
+      __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(Is... indices) const
       {
         return this->template operator()<detail::ElementsPerThread::ONE>(indices...);
       }      
 
       template <ElementsPerThread EPT, typename ArrayType, std::enable_if_t<is_std_array_v<ArrayType>, bool> = true>
-      __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ decltype(auto) operator()(const ArrayType &idx) const noexcept
+      __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto operator()(const ArrayType &idx) const noexcept
       {
         return cuda::std::apply([&](auto &&...args)  {
             return this->operator()<EPT>(args...);
diff --git a/include/matx/operators/concat.h b/include/matx/operators/concat.h
@@ -90,14 +90,16 @@ namespace matx
         }
       }
 
+
+      // Non-const path returns references where available (used for LHS writes)
       template <ElementsPerThread EPT, int I = 0, int N>
-      __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) GetVal(cuda::std::array<index_t,RANK> &indices) const {
+      __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) get_impl(cuda::std::array<index_t,RANK> &indices) {
         if constexpr ( I == N ) {
           // This should never happen, but we return a fake value from the first tuple element anyways
-          const auto &op = cuda::std::get<0>(ops_);
+          auto &op = cuda::std::get<0>(ops_);
           return cuda::std::apply([&](auto &&...call_args) -> decltype(auto) { return op.template operator()<EPT>(call_args...); }, indices);
         } else {
-          const auto &op = cuda::std::get<I>(ops_);
+          auto &op = cuda::std::get<I>(ops_);
           auto idx = indices[axis_];
           auto size = op.Size(axis_);
           // If in range of this operator
@@ -107,30 +109,34 @@ namespace matx
           } else {
             // otherwise remove this operator and recurse
             indices[axis_] -= size;
-            return GetVal<EPT, I+1, N>(indices);
+            return get_impl<EPT, I+1, N>(indices);
           }
         }
       }
 
-
+      // Const path: unify scalar return type to value_type to avoid ref/value conflicts
       template <ElementsPerThread EPT, int I = 0, int N>
-      __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) GetVal(cuda::std::array<index_t,RANK> &indices) {
+      __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ auto get_impl(cuda::std::array<index_t,RANK> &indices) const {
+        using return_t = cuda::std::conditional_t<
+            (EPT == ElementsPerThread::ONE),
+            value_type,
+            Vector<value_type, static_cast<index_t>(EPT)>>;
         if constexpr ( I == N ) {
-          // This should never happen, but we return a fake value from the first tuple element anyways
-          auto &op = cuda::std::get<0>(ops_);
-          return cuda::std::apply([&](auto &&...call_args) -> decltype(auto) { return op.template operator()<EPT>(call_args...); }, indices);
+          const auto &op = cuda::std::get<0>(ops_);
+          return cuda::std::apply([&](auto &&...call_args) -> return_t {
+            return op.template operator()<EPT>(call_args...);
+          }, indices);
         } else {
-          auto &op = cuda::std::get<I>(ops_);
+          const auto &op = cuda::std::get<I>(ops_);
           auto idx = indices[axis_];
           auto size = op.Size(axis_);
-          // If in range of this operator
           if(idx < size) {
-            // evaluate operator
-            return cuda::std::apply([&](auto &&...call_args) -> decltype(auto) { return op.template operator()<EPT>(call_args...); }, indices);
+            return cuda::std::apply([&](auto &&...call_args) -> return_t {
+              return op.template operator()<EPT>(call_args...);
+            }, indices);
           } else {
-            // otherwise remove this operator and recurse
             indices[axis_] -= size;
-            return GetVal<EPT, I+1, N>(indices);
+            return get_impl<EPT, I+1, N>(indices);
           }
         }
       }
@@ -140,13 +146,15 @@ namespace matx
       {
         if constexpr (EPT == ElementsPerThread::ONE) {
           cuda::std::array<index_t, sizeof...(Is)> indices = {{is...}};
-          return GetVal<EPT, 0, sizeof...(Ts)>(indices);
+          return get_impl<EPT, 0, sizeof...(Ts)>(indices);
         }
         else {
           return Vector<value_type, static_cast<index_t>(EPT)>{};
         }
       }
 
+      
+
       template <typename... Is>
       __MATX_INLINE__ __MATX_DEVICE__ __MATX_HOST__ decltype(auto) operator()(Is... is) const
       {
@@ -158,7 +166,7 @@ namespace matx
       {
         if constexpr (EPT == ElementsPerThread::ONE) {
           cuda::std::array<index_t, sizeof...(Is)> indices = {{is...}};
-          return GetVal<EPT, 0, sizeof...(Ts)>(indices);
+          return get_impl<EPT, 0, sizeof...(Ts)>(indices);
         }
         else {
           return Vector<value_type, static_cast<index_t>(EPT)>{};
diff --git a/include/matx/operators/set.h b/include/matx/operators/set.h
@@ -109,7 +109,7 @@ class set : public BaseOp<set<T, Op>> {
   // functions, so we have to make a separate one.
   template <ElementsPerThread EPT, typename... Ts>
   __MATX_DEVICE__ __MATX_HOST__ inline auto _internal_mapply(Ts&&... args) const noexcept {
-    auto r = detail::get_value<EPT>(op_, args...);
+    const auto r = detail::get_value<EPT>(op_, args...);
     out_(args...) = r;
     return r;
   }

Rendered diff for include/matx/operators/binary_operators.h:
Original file line number	Diff line number	Diff line change
`@@ -118,21 +118,24 @@ namespace matx`
`118`	`118`	`}`
`119`	`119`
`120`	`120`	`template <detail::ElementsPerThread EPT, typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>`
`121`		`- __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ decltype(auto) operator()(Is... indices) const`
	`121`	`+ __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(Is... indices) const`
`122`	`122`	`{`
`123`		`- auto i1 = get_value<EPT>(in1_, indices...);`
`124`		`- auto i2 = get_value<EPT>(in2_, indices...);`
	`123`	`+ // Bind operands as const to ensure RHS value-return semantics for composite ops`
	`124`	`+ const auto &lhs = in1_;`
	`125`	`+ const auto &rhs = in2_;`
	`126`	`+ const auto i1 = get_value<EPT>(lhs, indices...);`
	`127`	`+ const auto i2 = get_value<EPT>(rhs, indices...);`
`125`	`128`	`return op_.template operator()<EPT>(i1, i2);`
`126`	`129`	`}`
`127`	`130`
`128`	`131`	`template <typename... Is, std::enable_if_t<std::conjunction_v<std::is_integral<Is>...>, bool> = true>`
`129`		`- __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ decltype(auto) operator()(Is... indices) const`
	`132`	`+ __MATX_DEVICE__ __MATX_HOST__ __MATX_INLINE__ auto operator()(Is... indices) const`
`130`	`133`	`{`
`131`	`134`	`return this->template operator()<detail::ElementsPerThread::ONE>(indices...);`
`132`	`135`	`}`
`133`	`136`
`134`	`137`	`template <ElementsPerThread EPT, typename ArrayType, std::enable_if_t<is_std_array_v<ArrayType>, bool> = true>`
`135`		`- __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ decltype(auto) operator()(const ArrayType &idx) const noexcept`
	`138`	`+ __MATX_INLINE__ __MATX_HOST__ __MATX_DEVICE__ auto operator()(const ArrayType &idx) const noexcept`
`136`	`139`	`{`
`137`	`140`	`return cuda::std::apply([&](auto &&...args) {`
`138`	`141`	`return this->operator()<EPT>(args...);`
Rendered diff for include/matx/operators/set.h:
Original file line number	Diff line number	Diff line change
`@@ -109,7 +109,7 @@ class set : public BaseOp<set<T, Op>> {`
`109`	`109`	`// functions, so we have to make a separate one.`
`110`	`110`	`template <ElementsPerThread EPT, typename... Ts>`
`111`	`111`	`__MATX_DEVICE__ __MATX_HOST__ inline auto _internal_mapply(Ts&&... args) const noexcept {`
`112`		`- auto r = detail::get_value<EPT>(op_, args...);`
	`112`	`+ const auto r = detail::get_value<EPT>(op_, args...);`
`113`	`113`	`out_(args...) = r;`
`114`	`114`	`return r;`
`115`	`115`	`}`