tvm/src/runtime/contrib/nvshmem/memory_allocator.cc at 9db3c804ab9e727d158ea5aa09ee7e73d429d3e5 · cyx-6/tvm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
#include <nvshmem.h>
#include <nvshmemx.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/packed_func.h>
#include <tvm/runtime/registry.h>

#include <thread>

#include "../../cuda/cuda_common.h"
#include "../../memory/pooled_allocator.h"

namespace tvm {
namespace runtime {

using tvm::runtime::memory::Buffer;
using tvm::runtime::memory::PooledAllocator;

/*!
 * \brief The memory allocator of NVSHMEM.
 * Overriding PooledAllocator for efficient memory management.
 *
 * Raw device memory is obtained with nvshmem_align and returned with nvshmem_free;
 * everything else is delegated to PooledAllocator.
 */
class NVSHMEMAllocator final : public PooledAllocator {
 public:
  explicit NVSHMEMAllocator() : PooledAllocator() {}

  /*! \brief Release the memory held by the allocator via PooledAllocator::ReleaseAll(). */
  ~NVSHMEMAllocator() { PooledAllocator::ReleaseAll(); }

  /*! \brief Release the memory held by the allocator via PooledAllocator::ReleaseAll(). */
  void Clear() final { PooledAllocator::ReleaseAll(); }

  /*!
   * \brief Check whether a memory scope can be served by this allocator.
   * \param mem_scope The memory scope to check.
   * \return True only when the scope is exactly "nvshmem".
   */
  bool AllowMemoryScope(const std::string& mem_scope) const final {
    // The allowed memory scope of NVSHMEM is "nvshmem";
    return mem_scope == "nvshmem";
  }

  /*!
   * \brief Return the global NVSHMEM singleton allocator.
   * \note The instance is created on first use with `new` and is never deleted here.
   */
  static NVSHMEMAllocator* Global() {
    static NVSHMEMAllocator* allocator = new NVSHMEMAllocator();
    return allocator;
  }

  /*!
   * \brief Create an NDArray whose storage is allocated with the "nvshmem" memory scope.
   * \param shape The shape of the array.
   * \param dtype The element data type of the array.
   * \param device The device to allocate on.
   * \return An NDArray whose deleter hands the buffer back to the global allocator.
   */
  NDArray Empty(ShapeTuple shape, DataType dtype, Device device) {
    // Allocate the storage before creating any heap objects: the allocation can fail
    // (e.g. the device/dtype checks in DeviceAllocDataSpace), and doing it first means
    // there is no half-built container or buffer record left behind in that case.
    Buffer allocated = PooledAllocator::Alloc(device, shape, dtype, String("nvshmem"));

    NDArray::Container* container = new NDArray::Container(nullptr, shape, dtype, device);
    container->SetDeleter([](Object* obj) {
      auto* ptr = static_cast<NDArray::Container*>(obj);
      ICHECK(ptr->manager_ctx != nullptr);
      // manager_ctx carries the heap-allocated Buffer record created below.
      Buffer* buffer = reinterpret_cast<Buffer*>(ptr->manager_ctx);
      NVSHMEMAllocator::Global()->Free(*(buffer));
      delete buffer;
      delete ptr;
    });
    // The container keeps its own copy of the buffer record so the deleter can free it.
    Buffer* buffer = new Buffer(allocated);
    container->manager_ctx = reinterpret_cast<void*>(buffer);
    container->dl_tensor.data = buffer->data;
    return NDArray(GetObjectPtr<Object>(container));
  }

 private:
  /*!
   * \brief Allocate raw memory with nvshmem_align.
   * \param dev The device to allocate on; must be a CUDA device.
   * \param size The number of bytes to allocate.
   * \param alignment The requested alignment, forwarded to nvshmem_align.
   * \param type_hint The element type; only int, unsigned int and float codes are accepted.
   * \return The pointer returned by nvshmem_align.
   */
  void* DeviceAllocDataSpace(Device dev, size_t size, size_t alignment,
                             DLDataType type_hint) final {
    ICHECK_EQ(dev.device_type, DLDeviceType::kDLCUDA)
        << "nvshmem can only allocate cuda device memory space.";
    ICHECK(type_hint.code == DLDataTypeCode::kDLInt || type_hint.code == DLDataTypeCode::kDLUInt ||
           type_hint.code == DLDataTypeCode::kDLFloat)
        << "nvshmem can only allocate tensor with int, unsigned int or float data types.";
    void* data = nvshmem_align(alignment, size);
    // A null result for a non-empty request means the allocation failed; report it here
    // instead of handing a null data pointer to the caller. A zero-byte request is
    // allowed to yield null.
    ICHECK(data != nullptr || size == 0) << "nvshmem_align failed to allocate " << size
                                         << " bytes with alignment " << alignment << ".";
    return data;
  }

  /*!
   * \brief Free memory previously obtained from DeviceAllocDataSpace.
   * \param dev The device of the memory (unused).
   * \param ptr The pointer to pass to nvshmem_free.
   */
  void DeviceFreeDataSpace(Device dev, void* ptr) final { nvshmem_free(ptr); }
};

/*!
 * \brief Create an NDArray backed by memory from the global NVSHMEM allocator.
 * \param shape The shape of the array.
 * \param dtype The element data type of the array.
 * \param device The device the array is allocated on.
 * \return The NDArray produced by NVSHMEMAllocator::Empty.
 */
NDArray NVSHMEMEmpty(ShapeTuple shape, DataType dtype, Device device) {
  NVSHMEMAllocator* allocator = NVSHMEMAllocator::Global();
  return allocator->Empty(shape, dtype, device);
}

// Register NVSHMEMEmpty as the global function "runtime.disco.nvshmem.empty".
TVM_REGISTER_GLOBAL("runtime.disco.nvshmem.empty").set_body_typed(NVSHMEMEmpty);

void NVSHMEMFinalize() {
  NVSHMEMAllocator::Global()->Clear();
  nvshmem_finalize();
}

TVM_REGISTER_GLOBAL("runtime.disco.nvshmem.finalize_nvshmem").set_body_typed(NVSHMEMFinalize);

}  // namespace runtime
}  // namespace tvm