Skip to content

Commit 1567dae

Browse files
Author Codrut-Grigore IrimieCodrut-Grigore Irimie
authored and committed
[CMSIS-NN] Support for Softmax Int16 operator
1 parent cd45513 commit 1567dae

6 files changed

Lines changed: 343 additions & 28 deletions

File tree

python/tvm/relay/op/contrib/cmsisnn.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,11 +86,17 @@ def check_qnn_softmax(pattern):
8686
zero_point = pattern.args[2].data.numpy().item(0)
8787

8888
# check for dtypes of quantize and dequantize
89-
return (
90-
(scale == 1.0 / 256 and zero_point == -128)
89+
if ((scale == 1.0 / 256 and zero_point == -128)
9190
and pattern.attrs.out_dtype == "int8"
92-
and dequantize_call.args[0].checked_type.dtype == "int8"
93-
)
91+
and dequantize_call.args[0].checked_type.dtype == "int8"):
92+
return True
93+
94+
if ((scale == 1.0 / 32768 and zero_point == 0)
95+
and pattern.attrs.out_dtype == "int16"
96+
and dequantize_call.args[0].checked_type.dtype == "int16"):
97+
return True
98+
99+
return False
94100

95101
def qnn_conv2d_pattern(with_pad):
96102
"""Create pattern for qnn.conv2D with optional pad and/or optional fused relu."""
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
#include "compute_luts.h"

#include <algorithm>
#include <cmath>
#include <limits>
24+
25+
namespace tvm {
26+
namespace relay {
27+
namespace contrib {
28+
namespace cmsisnn {
29+
30+
void CalculateLUTInt16(int key_zero_point, float key_scale, int value_zero_point,
                       float value_scale, float (*func)(float), const int steps, int16_t* lut) {
  // Representable range of an int16 table entry, used for clamping below.
  const float value_min = static_cast<float>(std::numeric_limits<int16_t>::min());
  const float value_max = static_cast<float>(std::numeric_limits<int16_t>::max());

  // Dequantized (real-valued) ranges spanned by the table's keys and values.
  const float key_min_deq = key_scale * (std::numeric_limits<int16_t>::min() - key_zero_point);
  const float key_max_deq = key_scale * (std::numeric_limits<int16_t>::max() - key_zero_point);
  const float value_min_deq =
      value_scale * (std::numeric_limits<int16_t>::min() - value_zero_point);
  const float value_max_deq =
      value_scale * (std::numeric_limits<int16_t>::max() - value_zero_point);

  // Distance between two consecutive table keys in the dequantized domain.
  const float step_size_deq = (key_max_deq - key_min_deq) / (steps - 1);
  const float half_step_size_deq = step_size_deq / 2;

  // Factor mapping a dequantized value back onto the full int16 grid.
  const float value_inv_quantizing =
      (std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min() + 1) /
      (value_max_deq - value_min_deq);

  for (int i = 0; i < steps - 1; i++) {
    float value_deq = func(key_min_deq + i * step_size_deq);
    float mid_value_deq = func(key_min_deq + i * step_size_deq + half_step_size_deq);
    float next_value_deq = func(key_min_deq + (i + 1) * step_size_deq);

    float value = std::round(value_deq * value_inv_quantizing);
    float mid_value = std::round(mid_value_deq * value_inv_quantizing);
    float next_value = std::round(next_value_deq * value_inv_quantizing);
    // Value the consumer would obtain by linearly interpolating between the
    // two adjacent table entries at the segment midpoint.
    float mid_iterp_value = std::round((value + next_value) / 2);

    float mid_err = mid_iterp_value - mid_value;
    // Bias the stored entry by half the midpoint error so the interpolation
    // error is spread across the segment instead of peaking at its middle.
    float bias = std::round(mid_err / 2);

    lut[i] = static_cast<int16_t>(std::max(std::min(value - bias, value_max), value_min));
  }

  // Last entry: evaluate func at the last *key* (key_max_deq — the original
  // mistakenly used value_max_deq, which belongs to the output domain), and
  // round before clamping, consistent with the entries above.
  lut[steps - 1] = static_cast<int16_t>(
      std::max(std::min(std::round(func(key_max_deq) * value_inv_quantizing), value_max),
               value_min));
}
66+
67+
} // namespace cmsisnn
68+
} // namespace contrib
69+
} // namespace relay
70+
} // namespace tvm
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
/*!
 * \file src/relay/backend/contrib/cmsisnn/compute_luts.h
 * \brief CMSIS-NN LUTs calculation functions
 */

#ifndef TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_
#define TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_

#include <cstdint>

namespace tvm {
namespace relay {
namespace contrib {
namespace cmsisnn {

/*!
 * \brief Populates an int16 LUT based on the quantization parameters of its
 * keys and values and the respective transformation function.
 *
 * \param key_zero_point - zero point of the table's keys
 * \param key_scale - scale of the table's keys
 * \param value_zero_point - zero point of the table's values
 * \param value_scale - scale of the table's values
 * \param func - function pointer of the transformation performed by the LUT
 * \param steps - number of total values inside the table
 * \param lut - int16_t array storing the values of the LUT
 */
void CalculateLUTInt16(int key_zero_point, float key_scale, int value_zero_point,
                       float value_scale, float (*func)(float), const int steps, int16_t* lut);

}  // namespace cmsisnn
}  // namespace contrib
}  // namespace relay
}  // namespace tvm

#endif  // TVM_RELAY_BACKEND_CONTRIB_CMSISNN_COMPUTE_LUTS_H_
54+

src/relay/backend/contrib/cmsisnn/relay_to_tir.cc

Lines changed: 109 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "buffer_size.h"
3232
#include "compiler_attrs.h"
3333
#include "convolutions.h"
34+
#include "compute_luts.h"
3435

3536
namespace tvm {
3637
namespace relay {
@@ -89,26 +90,42 @@ class RelayToTIRVisitor : public MixedModeMutator {
8990
private:
9091
inline IntImm ToArg(int32_t value) { return IntImm(DataType::Int(32), value); }
9192

93+
//struct used to allocated const NDArray
94+
struct user_const {
95+
tir::Var buffer_var;
96+
int num_bits;
97+
Array<PrimExpr> extents;
98+
tvm::runtime::NDArray ndarray;
99+
};
100+
92101
void CreatePrimFuncForExtern(const GlobalVar& global_var, Array<tir::Var> func_signature,
93102
const Map<tir::Var, tir::Buffer>& buffer_map,
94103
tvm::Array<PrimExpr> call_extern_args,
95104
PrimExpr context_buffer_var = PrimExpr(),
96-
int context_buffer_size = 0, int num_bits = 8) {
105+
int context_buffer_size = 0,
106+
int num_bits = 8,
107+
std::vector<user_const> context_const_buffer_vars = {}) {
97108
Map<String, ObjectRef> dict_attrs;
98109
dict_attrs.Set(tvm::attr::kGlobalSymbol, global_var->name_hint);
99110
dict_attrs.Set(tvm::attr::kTarget, target_);
100111
dict_attrs.Set("tir.noalias", Bool(true));
101112

102113
tir::Stmt body = tir::Evaluate(
103114
tvm::tir::Call(DataType::Int(num_bits), tir::builtin::call_extern(), call_extern_args));
104-
115+
105116
if (context_buffer_size) {
106117
body = tir::Allocate(Downcast<tir::Var>(context_buffer_var), DataType::Int(num_bits),
107118
{context_buffer_size}, tir::const_true(), body);
108119
}
109-
120+
121+
for (int i = 0; i < int(context_const_buffer_vars.size()); i++){
122+
body = tir::AllocateConst(Downcast<tir::Var>(context_const_buffer_vars[i].buffer_var), DataType::Int(context_const_buffer_vars[i].num_bits),
123+
context_const_buffer_vars[i].extents, context_const_buffer_vars[i].ndarray, body);
124+
}
125+
110126
tir::PrimFunc replacement_func(func_signature, body, VoidType(), buffer_map,
111127
DictAttrs(dict_attrs));
128+
112129
ir_module_->Add(global_var, replacement_func);
113130
}
114131

@@ -505,6 +522,10 @@ class RelayToTIRVisitor : public MixedModeMutator {
505522
const CallNode* softmax_call = quantize_call->args[0].as<CallNode>();
506523
const CallNode* dequant_call = softmax_call->args[0].as<CallNode>();
507524
const float quant_scale = GetScalarFromConstant<float>(dequant_call->args[1]);
525+
const auto bit_width = quantize_call->type_as<TensorTypeNode>()->dtype.bits();
526+
LOG(INFO) << PrettyPrint(quantize_call->args[0]);
527+
LOG(INFO) << PrettyPrint(softmax_call->args[0]);
528+
LOG(INFO) << PrettyPrint(dequant_call->args[0]);
508529

509530
// assuming layout as NHWC
510531
auto shape = quantize_call->type_as<TensorTypeNode>()->shape;
@@ -517,36 +538,103 @@ class RelayToTIRVisitor : public MixedModeMutator {
517538

518539
// calculate multiplier and shift for CMSIS-NN softmax API
519540
// Note: TensorFlow Lite Micro assumptions
520-
// Output zero point and scale are fixed to -128 and 1 / 256
541+
// Output zero point and scale are fixed to -128 and 1 / 256 in the case of an int8 operator, or to 0 and 1 / 32768 in the case of an int16 operator
521542
// kScaledDiffIntegerBits, kInputBits, kBeta are described on the following github page
522-
// https://github.com/tensorflow/tflite-micro/blob/d97cd0908d8cf5021e9d86f05a49888bee28c2a4/tensorflow/lite/micro/kernels/softmax_common.cc#L47
523-
double beta_multiplier = (kBeta * quant_scale * (1 << (31 - kInputBits)));
524-
beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
525-
auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
526-
int32_t mult = std::get<0>(mult_shift_pair);
527-
int32_t shift = std::get<1>(mult_shift_pair);
528-
int32_t diff_min = (1 << kScaledDiffIntegerBits) - 1;
529-
diff_min <<= (31 - kScaledDiffIntegerBits);
530-
diff_min >>= shift;
531-
diff_min *= -1;
532-
543+
// https://github.com/tensorflow/tflite-micro/blob/d97cd0908d8cf5021e9d86f05a49888bee28c2a4/tensorflow/lite/exp_zero_pointmicro/kernels/softmax_common.cc#L47
544+
545+
int32_t mult;
546+
int32_t shift;
547+
int32_t diff_min = 0;
548+
549+
std::vector<user_const> softmax_params(2);
550+
Device dev{DLDeviceType::kDLCPU, 0};
551+
552+
if (bit_width == 8){
553+
double beta_multiplier = (kBeta * quant_scale * (1 << (31 - kInputBits)));
554+
beta_multiplier = std::min<double>(beta_multiplier, (1ll << 31) - 1.0);
555+
auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(beta_multiplier);
556+
mult = std::get<0>(mult_shift_pair);
557+
shift = std::get<1>(mult_shift_pair);
558+
diff_min = (1 << kScaledDiffIntegerBits) - 1;
559+
diff_min <<= (31 - kScaledDiffIntegerBits);
560+
diff_min >>= shift;
561+
diff_min *= -1;
562+
}
563+
else { //bit_width == 16
564+
double scale_beta_rescale = quant_scale * kBeta / (10.0 / 65535.0);
565+
auto mult_shift_pair = tvm::relay::qnn::GetFixedPointMultiplierShift(scale_beta_rescale);
566+
mult = std::get<0>(mult_shift_pair);
567+
shift = std::get<1>(mult_shift_pair);
568+
569+
int lut_entries = 513;
570+
int16_t softmax_s16_exp_lut[lut_entries];
571+
int16_t softmax_s16_one_by_one_lut[lut_entries];
572+
573+
const int range_int16 = std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min();
574+
int exp_zero_point = std::numeric_limits<int16_t>::max();
575+
float exp_scale = 10.0f / range_int16;
576+
577+
int one_by_one_zero_point = std::numeric_limits<int16_t>::min();
578+
float one_by_one_scale = 1.0f / range_int16;
579+
580+
int lut_value_zero_point = 0;
581+
float lut_value_scale = 2.0f / range_int16;
582+
583+
CalculateLUTInt16(exp_zero_point, exp_scale, lut_value_zero_point, lut_value_scale,
584+
[](float key){ return std::exp(key); }, lut_entries, softmax_s16_exp_lut);
585+
CalculateLUTInt16(one_by_one_zero_point, one_by_one_scale, lut_value_zero_point, lut_value_scale,
586+
[](float key){ return 1.0f / (1.0f + key); }, lut_entries, softmax_s16_one_by_one_lut);
587+
588+
//first LUT
589+
softmax_params[0].buffer_var = tir::Var("exp_lut", PointerType(PrimType(DataType::Int(bit_width)), "global.workspace"));
590+
softmax_params[0].ndarray = runtime::NDArray::Empty({lut_entries}, DataType::Int(bit_width), dev);
591+
softmax_params[0].ndarray.CopyFromBytes(softmax_s16_exp_lut, sizeof(int16_t)*lut_entries);
592+
softmax_params[0].extents = {lut_entries};
593+
softmax_params[0].num_bits = 16;
594+
595+
//second LUT
596+
softmax_params[1].buffer_var = tir::Var("one_by_one_lut", PointerType(PrimType(DataType::Int(bit_width)), "global.workspace"));
597+
softmax_params[1].ndarray = runtime::NDArray::Empty({lut_entries}, DataType::Int(bit_width), dev);
598+
softmax_params[1].ndarray.CopyFromBytes(softmax_s16_one_by_one_lut, sizeof(int16_t)*lut_entries);
599+
softmax_params[1].extents = {lut_entries};
600+
softmax_params[1].num_bits = 16;
601+
}
602+
533603
BufferCreator buffer_creator;
534-
tir::Var in_var = buffer_creator.CreateBufferVar("input", DataType::Handle(8));
535-
tir::Var out_var = buffer_creator.CreateBufferVar("output", DataType::Handle(8));
604+
tir::Var in_var = buffer_creator.CreateBufferVar("input", DataType::Handle(bit_width));
605+
tir::Var out_var = buffer_creator.CreateBufferVar("output", DataType::Handle(bit_width));
536606

537-
tvm::Array<PrimExpr> args = {
538-
tir::StringImm("arm_softmax_s8"),
607+
if (bit_width == 8) {
608+
tvm::Array<PrimExpr> args = {
609+
tir::StringImm("arm_softmax_s" + std::to_string(bit_width)),
539610
in_var,
540611
ToArg(num_rows),
541612
ToArg(row_size),
542613
ToArg(mult),
543614
ToArg(shift),
544615
ToArg(diff_min),
545616
out_var,
546-
};
617+
};
547618

548-
CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
619+
CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
549620
buffer_creator.GetBufferMap(), args);
621+
} else { //bit_width == 16
622+
tvm::Array<PrimExpr> args = {
623+
tir::StringImm("arm_softmax_s" + std::to_string(bit_width)),
624+
in_var,
625+
ToArg(num_rows),
626+
ToArg(row_size),
627+
ToArg(mult),
628+
ToArg(shift),
629+
softmax_params[0].buffer_var,
630+
softmax_params[1].buffer_var,
631+
out_var,
632+
};
633+
634+
CreatePrimFuncForExtern(global_var, buffer_creator.GetPrimFuncParams(),
635+
buffer_creator.GetBufferMap(), args, PrimExpr(),
636+
0, 8, softmax_params);
637+
}
550638
}
551639

552640
struct BinaryElementwiseClipPattern {

0 commit comments

Comments
 (0)