Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs_input/api/logic/truth/allclose.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ from allclose is an ``int`` value since boolean reductions are not available in


.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const HostExecutor<MODE> &exec)
.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
.. doxygenfunction:: allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const cudaExecutor &exec)

Examples
~~~~~~~~
Expand Down
8 changes: 4 additions & 4 deletions include/matx/transforms/chol/chol_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct DnCholCUDAParams_t {
size_t batch_size;
cublasFillMode_t uplo;
MatXDataType_t dtype;
cudaExecutor exec;
cudaStream_t stream;
};

template <typename OutputTensor, typename ATensor>
Expand Down Expand Up @@ -128,7 +128,7 @@ class matxDnCholCUDAPlan_t : matxDnCUDASolver_t {
params.n = a.Size(RANK - 1);
params.A = a.Data();
params.uplo = uplo;
params.exec = exec;
params.stream = exec.getStream();
params.dtype = TypeToInt<T1>();

return params;
Expand Down Expand Up @@ -208,7 +208,7 @@ struct DnCholCUDAParamsKeyHash {
{
return (std::hash<uint64_t>()(k.n)) +
(std::hash<uint64_t>()(k.batch_size)) +
(std::hash<uint64_t>()((uint64_t)(k.exec.getStream())));
(std::hash<uint64_t>()((uint64_t)(k.stream)));
}
};

Expand All @@ -223,7 +223,7 @@ struct DnCholCUDAParamsKeyEq {
return l.n == t.n &&
l.batch_size == t.batch_size &&
l.dtype == t.dtype &&
l.exec.getStream() == t.exec.getStream();
l.stream == t.stream;
}
};

Expand Down
16 changes: 8 additions & 8 deletions include/matx/transforms/cub.h
Original file line number Diff line number Diff line change
Expand Up @@ -1523,7 +1523,7 @@ using cub_cache_t = std::unordered_map<CubParams_t, std::any, CubParamsKeyHash,
template <typename OutputTensor, typename InputOperator>
void sort_impl_inner(OutputTensor &a_out, const InputOperator &a,
const SortDirection_t dir,
cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -1569,7 +1569,7 @@ template <typename OutputIndexTensor, typename InputIndexTensor, typename Output
void sort_pairs_impl_inner(OutputIndexTensor &idx_out, const InputIndexTensor &idx_in,
OutputKeyTensor &a_out, const InputKeyTensor &a_in,
const SortDirection_t dir,
[[maybe_unused]] cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -2074,7 +2074,7 @@ void cub_dualargreduce(OutputTensor &a1_out,
template <typename OutputTensor, typename InputOperator>
void sort_impl(OutputTensor &a_out, const InputOperator &a,
const SortDirection_t dir,
cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -2139,7 +2139,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
template <typename OutputTensor, typename InputOperator>
void argsort_impl(OutputTensor &idx_out, const InputOperator &a,
const SortDirection_t dir,
cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -2294,7 +2294,7 @@ void sort_impl(OutputTensor &a_out, const InputOperator &a,
*/
template <typename OutputTensor, typename InputOperator>
void cumsum_impl(OutputTensor &a_out, const InputOperator &a,
cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -2531,7 +2531,7 @@ struct GTE
* CUDA executor or stream
*/
template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, cudaExecutor exec = 0)
void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaExecutor &exec)
{
#ifdef __CUDACC__
static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
Expand Down Expand Up @@ -2652,7 +2652,7 @@ void find_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator
* CUDA executor stream
*/
template <typename SelectType, typename CountTensor, typename OutputTensor, typename InputOperator>
void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, cudaExecutor exec = 0)
void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, SelectType sel, const cudaExecutor &exec)
{
#ifdef __CUDACC__
static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
Expand Down Expand Up @@ -2767,7 +2767,7 @@ void find_idx_impl(OutputTensor &a_out, CountTensor &num_found, const InputOpera
* CUDA executor
*/
template <typename CountTensor, typename OutputTensor, typename InputOperator>
void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, cudaExecutor exec = 0)
void unique_impl(OutputTensor &a_out, CountTensor &num_found, const InputOperator &a, const cudaExecutor &exec)
{
#ifdef __CUDACC__
static_assert(CountTensor::Rank() == 0, "Num found output tensor rank must be 0");
Expand Down
24 changes: 12 additions & 12 deletions include/matx/transforms/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ void __MATX_INLINE__ reduce(OutType dest, const InType &in, ReduceOp op,
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ mean_impl(OutType dest, const InType &in,
cudaExecutor exec = 0)
const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("mean_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -468,7 +468,7 @@ void __MATX_INLINE__ softmax_impl(OutType dest, const InType &in, PermDims dims,
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ median_impl(OutType dest,
const InType &in, cudaExecutor exec = 0)
const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
if constexpr ( OutType::Rank() <= 1 && InType::Rank() <=2 ) {
Expand Down Expand Up @@ -640,7 +640,7 @@ void __MATX_INLINE__ median_impl(OutType dest, const InType &in, [[maybe_unused]
* CUDA executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("sum_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -706,7 +706,7 @@ void __MATX_INLINE__ sum_impl(OutType dest, const InType &in, [[maybe_unused]] c
* CUDA executor
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("prod_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -778,7 +778,7 @@ void __MATX_INLINE__ prod_impl(OutType dest, const InType &in, [[maybe_unused]]
* CUDA executor or stream ID
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ max_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ max_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("max_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -848,7 +848,7 @@ void __MATX_INLINE__ max_impl(OutType dest, const InType &in, [[maybe_unused]] c
* CUDA executor or stream ID
*/
template <typename OutType, typename TensorIndexType, typename InType>
void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("argmax_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -926,7 +926,7 @@ void __MATX_INLINE__ argmax_impl(OutType dest, TensorIndexType &idest, const InT
* CUDA executor or stream ID
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ min_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ min_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("min_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -995,7 +995,7 @@ void __MATX_INLINE__ min_impl(OutType dest, const InType &in, [[maybe_unused]] c
* CUDA executor or stream ID
*/
template <typename OutType, typename TensorIndexType, typename InType>
void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InType &in, const cudaExecutor &exec)
{
static_assert(OutType::Rank() == TensorIndexType::Rank());
#ifdef __CUDACC__
Expand Down Expand Up @@ -1082,7 +1082,7 @@ void __MATX_INLINE__ argmin_impl(OutType dest, TensorIndexType &idest, const InT
* CUDA executor or stream ID
*/
template <typename OutType, typename TensorIndexType, typename InType>
void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin, OutType destmax, TensorIndexType &idestmax, const InType &in, const cudaExecutor &exec)
{
static_assert(OutType::Rank() == TensorIndexType::Rank());
#ifdef __CUDACC__
Expand Down Expand Up @@ -1162,7 +1162,7 @@ void __MATX_INLINE__ argminmax_impl(OutType destmin, TensorIndexType &idestmin,
* CUDA executor or stream ID
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ any_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ any_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("any_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -1235,7 +1235,7 @@ void __MATX_INLINE__ any_impl(OutType dest, const InType &in, [[maybe_unused]] c
* CUDA executor or stream ID
*/
template <typename OutType, typename InType>
void __MATX_INLINE__ all_impl(OutType dest, const InType &in, cudaExecutor exec = 0)
void __MATX_INLINE__ all_impl(OutType dest, const InType &in, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("all_impl(" + get_type_str(in) + ")", matx::MATX_NVTX_LOG_API)
Expand Down Expand Up @@ -1313,7 +1313,7 @@ void __MATX_INLINE__ all_impl(OutType dest, const InType &in, [[maybe_unused]] c
* CUDA executor or stream ID
*/
template <typename OutType, typename InType1, typename InType2>
void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, cudaExecutor exec = 0)
void __MATX_INLINE__ allclose(OutType dest, const InType1 &in1, const InType2 &in2, double rtol, double atol, const cudaExecutor &exec)
{
#ifdef __CUDACC__
MATX_NVTX_START("allclose(" + get_type_str(in1) + ", " + get_type_str(in2) + ")", matx::MATX_NVTX_LOG_API)
Expand Down
2 changes: 1 addition & 1 deletion include/matx/transforms/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ namespace matx
*/
template <typename OutputTensor, typename InputTensor>
__MATX_INLINE__ void transpose_matrix_impl([[maybe_unused]] OutputTensor &out,
const InputTensor &in, cudaExecutor exec)
const InputTensor &in, const cudaExecutor &exec)
{
MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)

Expand Down