[TIR] Introduce Pass InjectPTXLDG32 (#13973)

andy-yang-1 · web-flow · commit 87bb8b1be8c4 · 2023-02-18T10:44:55.000-08:00
This PR introduces a new pass, InjectPTXLDG32, which rewrites an `if_then_else` call node into a `ptx_ldg32` call node. When the buffer being stored to is local and the value being loaded is global, the pass can replace the `if_then_else` pattern with the PTX pattern.

Test the pass with

```python
with tvm.transform.PassContext(config={"tir.ptx_ldg32": True}):
    mod = tvm.build(f, target="cuda")
```
diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h
@@ -610,6 +610,16 @@ TVM_DLL const Op& tvm_store_matrix_sync();
  */
 TVM_DLL const Op& ptx_mma();
 
+/*!
+ * \brief tvm intrinsic for ptx predicated load of a 32-bit value from global memory.
+ *
+ *  ptx_ldg32(Var local_ptr, Expr guard, BufferLoad global_load, Expr local_index)
+ *
+ *  When guard is non-zero, the value read by global_load is written to
+ *  local_ptr[local_index]; otherwise 0 is written.
+ */
+TVM_DLL const Op& ptx_ldg32();
+
 /*!
  * \brief tvm intrinsic for sparse tensor core ptx instructions.
  *
diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
@@ -677,6 +677,12 @@ TVM_DLL Pass Filter(runtime::TypedPackedFunc<bool(PrimFunc)> fcond);
  */
 TVM_DLL Pass InjectPTXAsyncCopy();
 
+/*!
+ * \brief Pass to rewrite if_then_else-guarded global to local copies on CUDA with ptx_ldg32.
+ * \return The pass; it leaves functions unchanged when enable_ptx_ldg32 is false.
+ */
+TVM_DLL Pass InjectPTXLDG32(bool enable_ptx_ldg32 = true);
+
 /*!
  * \brief Remove the weight layout rewrite block
  * \param skip_ndarray_rewrite If True, exact rewrite of NDArray, according to the given index map,
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
@@ -55,6 +55,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_async_copy", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.merge_async_commit_queue_scope", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.instrument_lwp", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.vtcm_capacity", Integer);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.ptx_ldg32", Bool);
 
 // WARNING: May cause coherency issues resulting data miscompares
 // Experimental feature that, when enabled by the runtime, bypasses the cache when using DMA. When
@@ -159,6 +160,8 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
   bool enable_equiv_terms_in_cse_tir =
       pass_ctx->GetConfig<Bool>("tir.enable_equiv_terms_in_cse_tir", Bool(false)).value();
 
+  bool ptx_ldg32 = pass_ctx->GetConfig<Bool>("tir.ptx_ldg32", Bool(false)).value();
+
   // Get any user-added passes
   Array<Array<ObjectRef>> add_lower_pass =
       pass_ctx->GetConfig<Array<Array<ObjectRef>>>("tir.add_lower_pass", Array<Array<ObjectRef>>())
@@ -257,6 +260,10 @@ Array<tvm::transform::Pass> CreatePassList(bool disable_loop_partition) {
     pass_list.push_back(tir::transform::InstrumentBoundCheckers());
   }
 
+  if (ptx_ldg32) {
+    pass_list.push_back(tir::transform::InjectPTXLDG32(true));
+  }
+
   pass_list.push_back(
       tir::transform::CommonSubexprElimTIR(!disable_cse_tir, enable_equiv_terms_in_cse_tir));
 
@@ -584,6 +591,11 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
     mixed_pass_list.push_back(tir::transform::InjectPTXAsyncCopy());
   }
 
+  bool ptx_ldg32 = pass_ctx->GetConfig<Bool>("tir.ptx_ldg32", Bool(false)).value();
+  if (ptx_ldg32) {
+    mixed_pass_list.push_back(tir::transform::InjectPTXLDG32());
+  }
+
   bool unpacked_api = mixed_mod->GetAttr<relay::Executor>(tvm::attr::kExecutor)
                           .value_or(relay::Executor::Create("graph", {}))
                           ->GetAttr<Bool>("unpacked-api")
diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc
@@ -926,6 +926,37 @@ void CodeGenCUDA::VisitExpr_(const CallNode* op, std::ostream& os) {
   } else if (op->op.same_as(builtin::ptx_wait_group())) {
     std::string N = this->PrintExpr(op->args[0]);
     this->stream << "__asm__ __volatile__(\"cp.async.wait_group " + N + ";\");\n\n";
+  } else if (op->op.same_as(builtin::ptx_ldg32())) {
+    /*
+    asm volatile (
+        "{.reg .pred p;\n"
+        " setp.ne.b32 p, %2, 0;\n"
+        // " @p ld.global.nc.f32 %0, [%1];}\n"
+        " @p ld.global.nc.L2::128B.f32 %0, [%1];}\n"
+        : "=f"(reg)
+        : "l"(addr), "r"((int)guard)
+    );
+    */
+
+    // get local
+    std::string reg = this->PrintExpr(op->args[0]);
+    // get guard
+    std::string guard = this->PrintExpr(op->args[1]);
+    const BufferLoadNode* addr_buffer = op->args[2].as<BufferLoadNode>();
+    std::string global_addr = this->PrintExpr(addr_buffer->indices[0]);
+    std::string global_buffer = this->PrintExpr(addr_buffer->buffer->data);
+    std::string local_addr = this->PrintExpr(op->args[3]);
+    this->stream << "asm volatile (\n";
+    this->stream << "\"{.reg .pred p;\\n\"\n";
+    this->stream << "\" setp.ne.b32 p, %2, 0;\\n\"\n";
+    this->stream << "\" @!p mov.b32 %0, 0;\\n\"\n";
+    this->stream << "\" @p ld.global.nc.f32 %0, [%1];}\\n\"\n";
+    // stream << "\" @p ld.global.nc.L2::128B.f32 %0, [%1];}\\n\"\n" ;
+    stream << ": \"=f\"(" << reg << "[" << local_addr << "]"
+           << ")\n";
+    stream << ": \"l\"((void*)(" << global_buffer << "+" << global_addr << ")), \"r\"((int)"
+           << guard << ")\n";
+    stream << ");\n";
   } else {
     CodeGenC::VisitExpr_(op, os);
   }
diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc
@@ -251,6 +251,11 @@ TIR_DEFINE_BUILTIN_FUNC(tvm_store_matrix_sync)
 
 TIR_DEFINE_BUILTIN_FUNC(ptx_mma).set_attr<TCallEffectKind>("TCallEffectKind",
                                                            Integer(CallEffectKind::kOpaque));
+
+// ptx_ldg32 writes its result into the local buffer passed as its first argument, so it
+// must not be treated as a pure call.
+TIR_DEFINE_BUILTIN_FUNC(ptx_ldg32).set_num_inputs(4).set_attr<TCallEffectKind>(
+    "TCallEffectKind", Integer(CallEffectKind::kOpaque));
 
 TIR_DEFINE_BUILTIN_FUNC(ptx_mma_sp)
     .set_attr<TCallEffectKind>("TCallEffectKind", Integer(CallEffectKind::kOpaque));
diff --git a/src/tir/transforms/inject_ptx_ldg32.cc b/src/tir/transforms/inject_ptx_ldg32.cc
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file inject_ptx_ldg32.cc
+ * \brief Rewrite guarded float32 global-to-local loads into the ptx_ldg32 intrinsic.
+ */
+
+#include <tvm/arith/analyzer.h>
+#include <tvm/arith/iter_affine_map.h>
+#include <tvm/runtime/registry.h>
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/op.h>
+#include <tvm/tir/stmt.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+#include "../../arith/const_fold.h"
+#include "../../arith/pattern_match.h"
+
+namespace tvm {
+namespace tir {
+
+/*!
+ * \brief Rewrite `local[i] = if_then_else(cond, global[j], 0)` stores into ptx_ldg32 loads.
+ *
+ * The guard and both indices are staged in two small local scratch buffers ("addr" and
+ * "predicate"). The store is then replaced by a store of the fallback value followed by
+ * an Evaluate(ptx_ldg32(...)) that reads the staged values.
+ *
+ * The CUDA code generated for ptx_ldg32 is a scalar `ld.global.nc.f32` that writes 0 when
+ * the guard is false, so only stores with exactly that meaning are rewritten; every other
+ * store is returned as visited.
+ */
+class PTXRewriter : public StmtMutator {
+ public:
+  /*!
+   * \brief Create the scratch buffers and emit their allocations.
+   *
+   * The buffers are created when the first Allocate node is entered, and their Allocate
+   * statements are wrapped around the first Allocate node whose visit completes.
+   */
+  Stmt VisitStmt_(const AllocateNode* allocate) final {
+    if (!has_buffer_1) {
+      has_buffer_1 = true;
+      // addr[0] -> global_addr /  addr[1] -> local_addr
+      addr_buffer = decl_buffer({IntImm(DataType::Int(32), 2)}, DataType::Int(32), "addr", "local");
+      predicate_buffer =
+          decl_buffer({IntImm(DataType::Int(32), 1)}, DataType::Bool(1), "predicate", "local");
+    }
+    Stmt result = StmtMutator::VisitStmt_(allocate);
+    if (!has_buffer_2) {
+      has_buffer_2 = true;
+      result =
+          Allocate(addr_buffer->data, addr_buffer->dtype, addr_buffer->shape, Bool(true), result);
+      result = Allocate(predicate_buffer->data, predicate_buffer->dtype, predicate_buffer->shape,
+                        Bool(true), result);
+    }
+    return result;
+  }
+
+  /*!
+   * \brief Replace a matching if_then_else store by the ptx_ldg32 statement sequence.
+   * \param store The store to inspect.
+   * \return The rewritten sequence, or the visited store when the pattern or one of its
+   *         preconditions does not hold.
+   */
+  Stmt VisitStmt_(const BufferStoreNode* store) final {
+    Stmt result = StmtMutator::VisitStmt_(store);
+    // The scratch buffers only exist once an Allocate node has been visited.
+    if (!addr_buffer.defined() || !predicate_buffer.defined()) {
+      return result;
+    }
+    const CallNode* call = store->value.as<CallNode>();
+    if (call == nullptr) {
+      return result;
+    }
+    const OpNode* op = call->op.as<OpNode>();
+    if (op == nullptr || op->name != "tir.if_then_else") {
+      return result;
+    }
+
+    // Find the load branch. If the load is the false branch the guard is negated, so the
+    // load is still issued only when its branch would have been selected.
+    PrimExpr guard = call->args[0];
+    PrimExpr fallback = call->args[2];
+    const BufferLoadNode* load = call->args[1].as<BufferLoadNode>();
+    if (load == nullptr) {
+      load = call->args[2].as<BufferLoadNode>();
+      fallback = call->args[1];
+      guard = !guard;
+    }
+    // ptx_ldg32 writes 0 when the guard is false, so any other fallback would be lost.
+    if (load == nullptr || !IsZeroConstant(fallback)) {
+      return result;
+    }
+    if (store->indices.size() != 1 || load->indices.size() != 1) {
+      return result;
+    }
+    // The generated PTX is a scalar f32 load from global memory into a local variable.
+    const DataType f32 = DataType::Float(32);
+    if (store->buffer->dtype != f32 || load->buffer->dtype != f32 || load->dtype != f32) {
+      return result;
+    }
+    if (store->buffer.scope() != "local" || load->buffer.scope() != "global") {
+      return result;
+    }
+    // Both indices are staged in the int32 "addr" buffer; this also rejects Ramp indices.
+    const DataType i32 = DataType::Int(32);
+    const PrimExpr& global_addr = load->indices[0];
+    const PrimExpr& local_addr = store->indices[0];
+    if (global_addr.dtype() != i32 || local_addr.dtype() != i32) {
+      return result;
+    }
+
+    const PrimExpr slot0 = IntImm(i32, 0);
+    const PrimExpr slot1 = IntImm(i32, 1);
+    BufferStore addr_store(addr_buffer, global_addr, {slot0});
+    BufferStore local_addr_store(addr_buffer, local_addr, {slot1});
+    BufferStore predicate_store(predicate_buffer, guard, {slot0});
+    PrimExpr staged_load = BufferLoad(load->buffer, {BufferLoad(addr_buffer, {slot0})});
+    PrimExpr staged_guard = BufferLoad(predicate_buffer, {slot0});
+    PrimExpr staged_index = BufferLoad(addr_buffer, {slot1});
+    BufferStore value_store(store->buffer, fallback, {staged_index});
+    Evaluate ptx_load(Call(store->buffer->dtype, tvm::tir::builtin::ptx_ldg32(),
+                           {store->buffer->data, staged_guard, staged_load, staged_index}));
+    Array<Stmt> tmp_seq = {addr_store, local_addr_store, predicate_store, value_store,
+                           ptx_load};
+    return SeqStmt(tmp_seq);
+  }
+
+  bool has_buffer_1 = false, has_buffer_2 = false;
+  Buffer addr_buffer, predicate_buffer;
+
+ private:
+  /*! \brief Whether expr is a floating point immediate equal to zero. */
+  static bool IsZeroConstant(const PrimExpr& expr) {
+    const FloatImmNode* imm = expr.as<FloatImmNode>();
+    return imm != nullptr && imm->value == 0.0;
+  }
+};
+
+namespace transform {
+
+/*!
+ * \brief Create the pass that applies PTXRewriter to the body of each PrimFunc.
+ * \param enable_inject_ptx_intrin When false, functions are returned unchanged.
+ * \return The pass.
+ */
+Pass InjectPTXLDG32(bool enable_inject_ptx_intrin) {
+  auto pass_func = [enable_inject_ptx_intrin](PrimFunc f, IRModule m, PassContext ctx) {
+    if (enable_inject_ptx_intrin) {
+      auto* n = f.CopyOnWrite();
+      n->body = PTXRewriter()(n->body);
+    }
+    return f;
+  };
+  return CreatePrimFuncPass(pass_func, 0, "tir.InjectPTXLDG32", {});
+}
+
+// Register the pass as the global function tir.transform.InjectPTXLDG32.
+TVM_REGISTER_GLOBAL("tir.transform.InjectPTXLDG32").set_body_typed(InjectPTXLDG32);
+
+}  // namespace transform
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/unittest/test_inject_ptx_ldg32.py b/tests/python/unittest/test_inject_ptx_ldg32.py
@@ -0,0 +1,67 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+import tvm
+from tvm.script import tir as T
+import numpy as np
+import tvm.testing
+
+
+@T.prim_func
+def vector_add(A: T.Buffer((16), "float32"), B: T.Buffer((32), "float32")) -> None:
+    T.func_attr({"global_symbol": "default_function", "tir.noalias": True})
+    bx = T.env_thread("blockIdx.x")
+    tx = T.env_thread("threadIdx.x")
+    T.launch_thread(bx, 1)
+    T.launch_thread(tx, 32)
+    A_local = T.Buffer((32), "float32", scope="local")
+    # Even tx: A_local[tx] = A[tx / 2]; odd tx: A_local[tx] = 0. Then B[tx] = A_local[tx] + 1.
+    with T.block():
+        T.reads(A[0:16])
+        T.writes(A_local[0:32])
+        A_local[tx] = T.if_then_else(tx % 2 == 0, A[tx / 2], T.float32(0), dtype="float32")
+        B[tx] = A_local[tx] + 1.0
+
+
+@tvm.testing.requires_cuda
+def test_inject_ptx_intrin():
+    f = vector_add
+    arch = tvm.contrib.nvcc.get_target_compute_version()
+    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
+    if major < 8:
+        # Require at least SM80
+        return
+    with tvm.transform.PassContext(config={"tir.ptx_ldg32": True}):
+        mod = tvm.build(f, target="cuda")
+    A_np = np.random.rand(16).astype("float32")
+    B_np = np.zeros((32)).astype("float32")
+    dev = tvm.cuda(0)
+    A_nd = tvm.nd.array(A_np, device=dev)
+    B_nd = tvm.nd.array(B_np, device=dev)
+    mod(A_nd, B_nd)
+
+    C_np = np.zeros((32)).astype("float32")
+
+    for i in range(32):
+        if i % 2 == 0:
+            C_np[i] = A_np[i // 2]
+        C_np[i] += 1.0
+
+    tvm.testing.assert_allclose(B_nd.numpy(), C_np)
+
+
+if __name__ == "__main__":
+    test_inject_ptx_intrin()