Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ option(MSCCLPP_BUILD_APPS_NCCL "Build NCCL interfaces" ON)
option(MSCCLPP_USE_CUDA "Use NVIDIA/CUDA." OFF)
option(MSCCLPP_USE_ROCM "Use AMD/ROCm." OFF)
option(MSCCLPP_BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(MSCCLPP_NPKIT_FLAGS "Enable NPKIT" OFF)

if(MSCCLPP_BYPASS_GPU_CHECK)
if(MSCCLPP_USE_CUDA)
Expand Down Expand Up @@ -122,8 +123,8 @@ endif()
if(MSCCLPP_ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_ENABLE_TRACE)
endif()
if(NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${NPKIT_FLAGS})
if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()

# libmscclpp
Expand Down
4 changes: 3 additions & 1 deletion apps/nccl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@ if(MSCCLPP_USE_CUDA)
elseif(MSCCLPP_USE_ROCM)
target_compile_definitions(mscclpp_nccl_obj PRIVATE MSCCLPP_USE_ROCM)
endif()

if(MSCCLPP_NPKIT_FLAGS)
target_compile_definitions(mscclpp_nccl_obj PRIVATE ${MSCCLPP_NPKIT_FLAGS})
endif()
add_library(mscclpp_nccl SHARED)
target_link_libraries(mscclpp_nccl PUBLIC mscclpp_obj mscclpp_nccl_obj)
set_target_properties(mscclpp_nccl PROPERTIES VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
Expand Down
53 changes: 52 additions & 1 deletion apps/nccl/src/allreduce.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
#include <mscclpp/sm_channel.hpp>
#include <mscclpp/sm_channel_device.hpp>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif

#include "common.hpp"

template <typename To, typename From>
Expand Down Expand Up @@ -238,10 +242,40 @@ template <typename T>
__global__ void __launch_bounds__(1024, 1)
allreduce7(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<mscclpp::SmChannel>* smChannels,
size_t channelDataOffset, size_t channelScratchOffset, int rank, int nRanksPerNode, int worldSize,
size_t nelems, uint32_t flag) {
size_t nelems, uint32_t flag
#if defined(ENABLE_NPKIT)
,
NpKitEventCollectContext* npKitEventCollectContexts, uint64_t* cpuTimestamp) {
#else
) {
#endif
// This version of allreduce only works for single nodes
if (worldSize != nRanksPerNode) return;

#if defined(ENABLE_NPKIT)
extern __shared__ int4 NpkitSharedMem[];
NpKitEvent* event_buffer = (NpKitEvent*)((char*)NpkitSharedMem);
uint64_t event_buffer_head = 0;
#if defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
uint64_t npkit_timestamp_entry = 0;
if (threadIdx.x == 0) {
npkit_timestamp_entry = NPKIT_GET_GPU_TIMESTAMP();
}
#endif
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_CPU)
#if defined(MSCCLPP_DEVICE_HIP)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, NPKIT_LOAD_CPU_TIMESTAMP_PER_BLOCK(cpuTimestamp, blockIdx.x),
#else
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_CPU, 0, 0, *cpuTimestamp,
#endif
event_buffer, &event_buffer_head);
#endif
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_TIME_SYNC_GPU)
NpKit::CollectGpuEventShm(NPKIT_EVENT_TIME_SYNC_GPU, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif

if (sizeof(T) == 2)
nelems = (nelems * sizeof(T) + sizeof(T)) / sizeof(int);
else
Expand Down Expand Up @@ -312,6 +346,16 @@ __global__ void __launch_bounds__(1024, 1)
result[idx].x = data.x;
result[idx].y = data.y;
}
#if defined(ENABLE_NPKIT) && defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY) && \
defined(ENABLE_NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT)
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY, 0, 0, npkit_timestamp_entry, event_buffer,
&event_buffer_head);
NpKit::CollectGpuEventShm(NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT, 0, 0, NPKIT_GET_GPU_TIMESTAMP(), event_buffer,
&event_buffer_head);
#endif
#if defined(ENABLE_NPKIT)
NpKit::StoreGpuEventShm(npKitEventCollectContexts, event_buffer, event_buffer_head);
#endif
}

template <typename T>
Expand Down Expand Up @@ -470,9 +514,16 @@ cudaError_t allreduce(T* buff, T* scratch, T* resultBuff, mscclpp::DeviceHandle<
nBlocks = 56;
nThreadsPerBlock = (nelems <= 76800) ? 512 : 1024;
}
#if defined(ENABLE_NPKIT)
size_t NpkitSharedMemSize = NPKIT_SHM_NUM_EVENTS * sizeof(NpKitEvent);
allreduce7<<<nBlocks, nThreadsPerBlock, NpkitSharedMemSize, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++, NpKit::GetGpuEventCollectContexts(), NpKit::GetCpuTimestamp());
#else
allreduce7<<<nBlocks, nThreadsPerBlock, 0, stream>>>(buff, scratch, resultBuff, smChannels, channelInOffset,
channelScratchOffset, rank, nRanksPerNode, worldSize, nelems,
flag++);
#endif
} else {
int nBlocks = 35;
int nThreadsPerBlock = 512;
Expand Down
17 changes: 16 additions & 1 deletion apps/nccl/src/nccl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
#include <sstream>
#include <unordered_map>
#include <vector>

#if defined(ENABLE_NPKIT)
#include <mscclpp/npkit/npkit.hpp>
#endif
#include "allgather.hpp"
#include "allreduce.hpp"
#include "broadcast.hpp"
Expand Down Expand Up @@ -427,6 +429,12 @@ NCCL_API ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueI
}

*comm = commPtr;
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Init(rank);
}
#endif
return ncclSuccess;
}

Expand All @@ -446,6 +454,13 @@ NCCL_API ncclResult_t ncclCommDestroy(ncclComm_t comm) {
WARN("comm is nullptr");
return ncclInvalidArgument;
}
#if defined(ENABLE_NPKIT)
const char* npkitDumpDir = getenv("NPKIT_DUMP_DIR");
if (npkitDumpDir != nullptr) {
NpKit::Dump(npkitDumpDir);
NpKit::Shutdown();
}
#endif
delete comm;
return ncclSuccess;
}
Expand Down
7 changes: 5 additions & 2 deletions include/mscclpp/npkit/npkit_event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,10 @@
#define NPKIT_EVENT_EXECUTOR_INIT_ENTRY 0x19
#define NPKIT_EVENT_EXECUTOR_INIT_EXIT 0x1A

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1B
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x2E
#define NPKIT_EVENT_KERNEL_ALLREDUCE_ENTRY 0x1B
#define NPKIT_EVENT_KERNEL_ALLREDUCE_EXIT 0x1C

#define NPKIT_EVENT_EXECUTOR_OP_BASE_ENTRY 0x1D
#define NPKIT_EVENT_EXECUTOR_OP_BASE_EXIT 0x30

#endif
Loading