Merge branch 'main' into binyli/nccl_api

Binyang2014 · web-flow · commit 3a00896badf1 · 2026-02-12T19:05:30.000-08:00
diff --git a/python/mscclpp/language/internal/operations.py b/python/mscclpp/language/internal/operations.py
@@ -534,6 +534,7 @@ def __init__(
         self.dst_buff = dst_buff
         self.channel_ids = channel_ids
         self.channel_type = channel_type
+        self.from_packet = from_packet
         self.to_packet = to_packet
         self.with_signal = with_signal
         self.with_signal_and_flush = with_signal_and_flush
@@ -579,6 +580,25 @@ def __add__(self, other):
                 with_signal=self.with_signal,
                 with_signal_and_flush=self.with_signal_and_flush,
             )
+        elif (
+            isinstance(other, PutOperation)
+            and self.name == Instruction.read_put_packet
+            and self.name == other.name
+            and self.src_buff == other.src_buff
+            and self.channel_type == other.channel_type
+            and self.tbg_info == other.tbg_info
+        ):
+            fused_operation = PutOperation(
+                src_buff=self.src_buff,
+                dst_buff=self.dst_buff + other.dst_buff,
+                channel_ids=self.channel_ids + other.channel_ids,
+                channel_type=self.channel_type,
+                tbg_info=self.tbg_info,
+                from_packet=self.from_packet,
+                to_packet=self.to_packet,
+                with_signal=self.with_signal,
+                with_signal_and_flush=self.with_signal_and_flush,
+            )
 
         return fused_operation
 
diff --git a/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py b/python/mscclpp/language/tests/single_node/allgather_pkt_rppkt.py
@@ -0,0 +1,78 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+import argparse
+from mscclpp.language.channel import *
+from mscclpp.language.rank import *
+from mscclpp.language.general import *
+from mscclpp.language.program import *
+from mscclpp.language.collectives import *
+
+
+def allgather_example(name, gpu_size, num_threads_per_block, min_message_size, max_message_size):
+    """Build a packet-based ("LL") AllGather program around read_put_packets and print its JSON form.
+
+    Each rank packs its input chunk into scratch, puts it into every peer's scratch, then unpacks what it received.
+    """
+    chunksperloop = 1
+    # Rank r keeps its own packed chunk at scratch index scratch_offset + r; set once, used by two loops below.
+    scratch_offset = gpu_size
+    collective = AllGather(gpu_size, chunksperloop, True)
+    with CollectiveProgram(
+        name,
+        collective,
+        gpu_size,
+        protocol="LL",
+        num_threads_per_block=num_threads_per_block,
+        use_double_scratch_buffer=True,
+        min_message_size=min_message_size,
+        max_message_size=max_message_size,
+    ):
+        # One scratch buffer per rank, sized 2 * gpu_size.
+        scratch_buffer = [Buffer(gpu, 2 * gpu_size) for gpu in range(gpu_size)]
+
+        # Pack each rank's input chunk into its own slot of the local scratch buffer.
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            own_slot = scratch_offset + gpu
+            input_buffer = rank.get_input_buffer()
+            rank.copy_packets(scratch_buffer[gpu][own_slot : own_slot + 1], input_buffer[0:1], tb=0)
+
+        # Put the packed chunk of each rank into slot `gpu` of every peer's scratch buffer.
+        for gpu in range(gpu_size):
+            own_slot = scratch_offset + gpu
+            for peer in range(1, gpu_size):
+                dst_rank = (gpu + peer) % gpu_size
+                ch = MemoryChannel(dst_rank, gpu)
+                ch.read_put_packets(
+                    scratch_buffer[dst_rank][gpu : gpu + 1],
+                    scratch_buffer[gpu][own_slot : own_slot + 1],
+                    0,  # tb
+                )
+
+        # Unpack the chunks received from the peers into the matching slots of the local output buffer.
+        for gpu in range(gpu_size):
+            rank = Rank(gpu)
+            output_buffer = rank.get_output_buffer()
+            for peer in range(1, gpu_size):
+                src_rank = (gpu + peer) % gpu_size
+                rank.unpack_packets(
+                    output_buffer[src_rank : src_rank + 1],
+                    scratch_buffer[gpu][src_rank : src_rank + 1],
+                    tb=0,
+                )
+
+        print(JSON())
+
+
+# Command-line driver: parse the program parameters and hand them to allgather_example.
+parser = argparse.ArgumentParser()
+parser.add_argument("--name", type=str, help="name of the program")
+# Mandatory: allgather_example loops over range(gpu_size), which raises TypeError when the value is None.
+parser.add_argument("--num_gpus", type=int, required=True, help="number of gpus")
+parser.add_argument("--num_threads_per_block", type=int, default=1024, help="number of threads per block")
+parser.add_argument("--min_message_size", type=int, default=0, help="minimum message size")
+parser.add_argument("--max_message_size", type=int, default=2**64 - 1, help="maximum message size")
+
+args = parser.parse_args()
+allgather_example(args.name, args.num_gpus, args.num_threads_per_block, args.min_message_size, args.max_message_size)
diff --git a/python/test/executor_test.py b/python/test/executor_test.py
@@ -11,7 +11,7 @@
     env,
 )
 from mscclpp import CommGroup, GpuBuffer
-from mscclpp.utils import KernelBuilder, GpuBuffer, pack
+from mscclpp.utils import KernelBuilder, pack
 import os
 import struct
 
diff --git a/src/core/include/execution_kernel.hpp b/src/core/include/execution_kernel.hpp
@@ -298,11 +298,11 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
   ChannelType chType = op.channelType;
   if (chType == ChannelType::MEMORY) {
     size_t nPackets = size / sizeof(PacketPayload<PacketType>);
+    PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1));
     for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) {
+      PacketPayload<PacketType> data = pkts[pktIdx].read(flag_);
+      PacketType pkt(data, flag_);
       for (uint32_t idx = 0; idx < nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        PacketPayload<PacketType> data = pkts[pktIdx].read(flag_);
-        PacketType pkt(data, flag_);
         size_t offset = (scratchOffset_ + (dstOffsets[idx] << 1)) / sizeof(PacketType);
         void* remoteMemory = static_cast<char*>(memoryChannelBufferPtrs_[op.outputBufferRefs[idx].id]);
         mscclpp::write<PacketType>(remoteMemory, offset + pktIdx, pkt);
@@ -312,10 +312,8 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
     // Ensuring Data Is Ready
     size_t nPackets = size / sizeof(PacketPayload<PacketType>);
     for (size_t pktIdx = threadIdx.x; pktIdx < nPackets; pktIdx += blockDim.x) {
-      for (uint32_t idx = 0; idx < nOutput; ++idx) {
-        PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[idx] << 1));
-        pkts[pktIdx].read(flag_);
-      }
+      PacketType* pkts = (PacketType*)((char*)scratch + scratchOffset_ + (srcOffsets[0] << 1));
+      pkts[pktIdx].read(flag_);
     }
     __syncthreads();
 
@@ -325,7 +323,7 @@ MSCCLPP_DEVICE_INLINE void handleReadPutPackets(const Operation& op, void* scrat
       return;
     }
     uint32_t dstOffset = (dstOffsets[chIdx] << 1) + scratchOffset_;
-    uint32_t srcOffset = (srcOffsets[chIdx] << 1) + scratchOffset_;
+    uint32_t srcOffset = (srcOffsets[0] << 1) + scratchOffset_;
     MemoryId dstMemoryId = portChannelBufferIds_[op.outputBufferRefs[chIdx].id];
     portChannels_[channelIndexes[chIdx]].put(
         dstMemoryId, dstOffset, static_cast<MemoryId>(BufferType::SCRATCH) + localMemoryIdBegin_, srcOffset, size << 1);

Original file line number	Diff line number	Diff line change
`@@ -11,7 +11,7 @@`
`11`	`11`	`env,`
`12`	`12`	`)`
`13`	`13`	`from mscclpp import CommGroup, GpuBuffer`
`14`		`-from mscclpp.utils import KernelBuilder, GpuBuffer, pack`
	`14`	`+from mscclpp.utils import KernelBuilder, pack`
`15`	`15`	`import os`
`16`	`16`	`import struct`
`17`	`17`