NVIDIA
diff --git a/‎CMakeLists.txt‎
Lines changed: 10 additions & 1 deletion b/‎CMakeLists.txt‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎docs_input/api/dft/fft/fft2d.rst‎
Lines changed: 2 additions & 2 deletions b/‎docs_input/api/dft/fft/fft2d.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs_input/api/dft/fft/ifft2.rst‎
Lines changed: 2 additions & 2 deletions b/‎docs_input/api/dft/fft/ifft2.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎docs_input/api/manipulation/basic/copy.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs_input/api/manipulation/basic/copy.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs_input/build.rst‎
Lines changed: 10 additions & 0 deletions b/‎docs_input/build.rst‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎include/matx/core/error.h‎
Lines changed: 2 additions & 1 deletion b/‎include/matx/core/error.h‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎include/matx/core/tensor.h‎
Lines changed: 1 addition & 0 deletions b/‎include/matx/core/tensor.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎include/matx/core/type_utils.h‎
Lines changed: 6 additions & 6 deletions b/‎include/matx/core/type_utils.h‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎include/matx/executors/device.h‎
Lines changed: 1 addition & 1 deletion b/‎include/matx/executors/device.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/matx/executors/executors.h‎
Lines changed: 1 addition & 0 deletions b/‎include/matx/executors/executors.h‎
Lines changed: 1 addition & 0 deletions
@@ -21,6 +21,7 @@ option(MATX_EN_VISUALIZATION "Enable visualization support" OFF)
 option(MATX_EN_CUTLASS OFF)
 option(MATX_EN_CUTENSOR OFF)
 option(MATX_EN_FILEIO OFF)
+option(MATX_EN_NVPL "Enable NVIDIA Performance Libraries for optimized ARM CPU support" OFF)
 option(MATX_DISABLE_CUB_CACHE "Disable caching for CUB allocations" ON)
 
 set(MATX_EN_PYBIND11 OFF CACHE BOOL "Enable pybind11 support")
@@ -152,6 +153,15 @@ else()
     target_compile_definitions(matx INTERFACE MATX_ENABLE_CUTLASS=0)
 endif()
 
+if (MATX_EN_NVPL)
+    message(STATUS "Enabling NVPL library support")
+    # find_package is currently broken in NVPL. Use proper targets once working
+    #find_package(nvpl REQUIRED COMPONENTS fft)
+    #target_link_libraries(matx INTERFACE nvpl::fftw)
+    target_link_libraries(matx INTERFACE nvpl_fftw)
+    target_compile_definitions(matx INTERFACE MATX_EN_NVPL=1)
+endif()
+
 if (MATX_DISABLE_CUB_CACHE)
     target_compile_definitions(matx INTERFACE MATX_DISABLE_CUB_CACHE=1)
 endif()
@@ -291,4 +301,3 @@ if (MATX_BUILD_TESTS)
     include(cmake/GetGTest.cmake)
     add_subdirectory(test)
 endif()
-
 
@@ -9,8 +9,8 @@ Perform a 2D FFT
    These functions are currently not supported with host-based executors (CPU)
 
 
-.. doxygenfunction:: fft2(OpA &&a)
-.. doxygenfunction:: fft2(OpA &&a, const int32_t (&axis)[2])  
+.. doxygenfunction:: fft2(OpA &&a, FFTNorm norm = FFTNorm::BACKWARD)
+.. doxygenfunction:: fft2(OpA &&a, const int32_t (&axis)[2], FFTNorm norm = FFTNorm::BACKWARD)  
 
 Examples
 ~~~~~~~~
 
@@ -9,8 +9,8 @@ Perform a 2D inverse FFT
    These functions are currently not supported with host-based executors (CPU)
 
 
-.. doxygenfunction:: ifft2(OpA &&a)
-.. doxygenfunction:: ifft2(OpA &&a, const int32_t (&axis)[2])  
+.. doxygenfunction:: ifft2(OpA &&a, FFTNorm norm = FFTNorm::BACKWARD)
+.. doxygenfunction:: ifft2(OpA &&a, const int32_t (&axis)[2], FFTNorm norm = FFTNorm::BACKWARD)  
 
 Examples
 ~~~~~~~~
 
@@ -14,7 +14,7 @@ since it cannot be chained with other expressions.
 Examples
 ~~~~~~~~
 
-.. literalinclude:: ../../../../include/matx/transforms/fft.h
+.. literalinclude:: ../../../../include/matx/transforms/fft/fft_common.h
    :language: cpp
    :start-after: example-begin copy-test-1
    :end-before: example-end copy-test-1
 
@@ -43,6 +43,16 @@ Optional Third-party Dependencies
 - `cutensor <https://developer.nvidia.com/cutensor>`_ 1.7.0.1+ (Required when using `einsum`)
 - `cutensornet <https://docs.nvidia.com/cuda/cuquantum/cutensornet>`_ 23.03.0.20+ (Required when using `einsum`)
 
+Host (CPU) Support
+------------------
+Host support is provided by both the C++ standard library and NVIDIA's NVPL_ library. Host support is
+considered experimental and is still a work in progress. Currently, all reduction functions are supported,
+but FFTs are the only transforms supported. All host support is limited to a single thread in this release.
+
+To enable NVPL support use the CMake option `-DMATX_EN_NVPL=ON`.
+
+.. _NVPL: https://developer.nvidia.com/nvpl
+
 Build Options
 =============
 MatX provides 5 primary options for builds, and each can be configured independently:
 
@@ -65,7 +65,8 @@ namespace matx
     matxLUError,
     matxInverseError,
     matxSolverError,
-    matxcuTensorError
+    matxcuTensorError,
+    matxInvalidExecutor
   };
 
   static constexpr const char *matxErrorString(matxError_t e)
 
@@ -1486,6 +1486,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {
 
       if constexpr (N > 0) {
         if (end != matxDropDim) {
+          MATX_ASSERT_STR(end != matxKeepDim, matxInvalidParameter, "matxKeepDim only valid for clone(), not slice()");
           if (end == matxEnd) {
             n[d] = this->Size(i) - first;
           }
 
@@ -272,8 +272,8 @@ constexpr bool is_executor_t()
 
 
 namespace detail {
-template<typename T> struct is_device_executor : std::false_type {};
-template<> struct is_device_executor<matx::cudaExecutor> : std::true_type {};
+template<typename T> struct is_cuda_executor : std::false_type {};
+template<> struct is_cuda_executor<matx::cudaExecutor> : std::true_type {};
 }
 
 /**
@@ -282,11 +282,11 @@ template<> struct is_device_executor<matx::cudaExecutor> : std::true_type {};
  * @tparam T Type to test
  */
 template <typename T> 
-inline constexpr bool is_device_executor_v = detail::is_device_executor<typename remove_cvref<T>::type>::value;
+inline constexpr bool is_cuda_executor_v = detail::is_cuda_executor<typename remove_cvref<T>::type>::value;
 
 namespace detail {
-template<typename T> struct is_single_thread_host_executor : std::false_type {};
-template<> struct is_single_thread_host_executor<matx::HostExecutor> : std::true_type {};
+template<typename T> struct is_host_executor : std::false_type {};
+template<> struct is_host_executor<matx::HostExecutor> : std::true_type {};
 }
 
 /**
@@ -295,7 +295,7 @@ template<> struct is_single_thread_host_executor<matx::HostExecutor> : std::true
  * @tparam T Type to test
  */
 template <typename T> 
-inline constexpr bool is_single_thread_host_executor_v = detail::is_single_thread_host_executor<remove_cvref_t<T>>::value;
+inline constexpr bool is_host_executor_v = detail::is_host_executor<remove_cvref_t<T>>::value;
 
 
 namespace detail {
 
@@ -66,7 +66,7 @@ namespace matx
       /*
        * @brief Returns stream associated with executor
        */
-      auto getStream() { return stream_; }
+      auto getStream() const { return stream_; }
 
       /**
        * Execute an operator on a device
 
@@ -32,5 +32,6 @@
 
 #pragma once
 
+#include "matx/executors/support.h"
 #include "matx/executors/device.h"
 #include "matx/executors/host.h"
Original file line number	Diff line number	Diff line change
`@@ -1486,6 +1486,7 @@ class tensor_t : public detail::tensor_impl_t<T,RANK,Desc> {`
`1486`	`1486`
`1487`	`1487`	`if constexpr (N > 0) {`
`1488`	`1488`	`if (end != matxDropDim) {`
	`1489`	`+ MATX_ASSERT_STR(end != matxKeepDim, matxInvalidParameter, "matxKeepDim only valid for clone(), not slice()");`
`1489`	`1490`	`if (end == matxEnd) {`
`1490`	`1491`	`n[d] = this->Size(i) - first;`
`1491`	`1492`	`}`