-
Notifications
You must be signed in to change notification settings - Fork 4.3k
/
CachedBufAlloc.h
205 lines (161 loc) · 9.51 KB
/
CachedBufAlloc.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
#ifndef HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
#define HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h
#include <alpaka/alpaka.hpp>
#include "HeterogeneousCore/AlpakaInterface/interface/getDeviceCachingAllocator.h"
#include "HeterogeneousCore/AlpakaInterface/interface/getHostCachingAllocator.h"
namespace cms::alpakatools {
namespace traits {
//! Trait that provides the caching memory allocator for a given device and queue type.
//!
//! This primary template is the fallback used when no specialisation matches TDev and TQueue;
//! instantiating it fails to compile with the message below.
//! The sixth parameter defaults to `void`; the last one restricts the trait to types that alpaka
//! identifies as a device (TDev) and as a queue (TQueue).
template <typename TElem,
          typename TDim,
          typename TIdx,
          typename TDev,
          typename TQueue,
          typename TEnable = void,
          typename TCheck = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
struct CachedBufAlloc {
  // the condition is expressed in terms of the template parameter TDev
  static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
};
//! Specialisation of the caching allocator trait for the CPU device, for a generic queue type.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
  //! Allocate a host buffer with the given extent through alpaka::allocAsyncBuf, using `queue`.
  //! The `dev` argument is not used by this specialisation.
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent)
      -> alpaka::BufCpu<TElem, TDim, TIdx> {
    // non-cached, queue-ordered asynchronous host-only memory
    return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
  }
};
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
//! Caching allocator trait for pinned host memory, used together with a blocking CUDA queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtBlocking, void> {
  //! Request enough bytes for `extent` elements of type TElem from the host caching allocator
  //! associated with alpaka::QueueCudaRtBlocking, and wrap the memory in an alpaka::BufCpu.
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueCudaRtBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;
    auto& cache = getHostCachingAllocator<alpaka::QueueCudaRtBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    // number of elements in the extent, times the size of one element
    size_t const bytes = static_cast<size_t>(alpaka::getExtentProduct(extent)) * sizeof(TElem);
    void* const raw = cache.allocate(bytes, queue);

    // the buffer's deleter passes the pointer back to the allocator's free()
    auto release = [pool = &cache](TElem* p) { pool->free(p); };

    return Buffer(dev, static_cast<TElem*>(raw), std::move(release), extent);
  }
};
//! Caching allocator trait for pinned host memory, used together with a non-blocking CUDA queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
  //! Request enough bytes for `extent` elements of type TElem from the host caching allocator
  //! associated with alpaka::QueueCudaRtNonBlocking, and wrap the memory in an alpaka::BufCpu.
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueCudaRtNonBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;
    auto& cache = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    // number of elements in the extent, times the size of one element
    size_t const bytes = static_cast<size_t>(alpaka::getExtentProduct(extent)) * sizeof(TElem);
    void* const raw = cache.allocate(bytes, queue);

    // the buffer's deleter passes the pointer back to the allocator's free()
    auto release = [pool = &cache](TElem* p) { pool->free(p); };

    return Buffer(dev, static_cast<TElem*>(raw), std::move(release), extent);
  }
};
//! The caching memory allocator implementation for the CUDA device
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
  //! Allocate a device buffer with the given extent from the device caching allocator
  //! obtained for `dev` and the queue type TQueue.
  //!
  //! \param dev    device the buffer is associated with
  //! \param queue  queue passed to the allocator together with the requested size
  //! \param extent extent of the buffer, in elements
  //! \return an alpaka::BufCudaRt wrapping the allocated memory; its deleter passes the pointer
  //!         back to the allocator's free()
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev,
                                            TQueue queue,
                                            TExtent const& extent) -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);

    // all byte counts are computed in size_t: sizeof(TElem) is not narrowed to TIdx first
    size_t width = alpaka::getWidth(extent);
    size_t widthBytes = width * sizeof(TElem);
    // TODO implement pitch for TDim > 1
    // the pitch is set to the row width in bytes
    size_t pitchBytes = widthBytes;
    size_t size = alpaka::getExtentProduct(extent);
    size_t sizeBytes = size * sizeof(TElem);
    void* memPtr = allocator.allocate(sizeBytes, queue);

    // use a custom deleter to return the buffer to the CachingAllocator
    auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };

    // static_cast is the appropriate named cast for converting void* to an object pointer
    return alpaka::BufCudaRt<TElem, TDim, TIdx>(
        dev, static_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
  }
};
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
//! Caching allocator trait for pinned host memory, used together with a blocking HIP queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtBlocking, void> {
  //! Request enough bytes for `extent` elements of type TElem from the host caching allocator
  //! associated with alpaka::QueueHipRtBlocking, and wrap the memory in an alpaka::BufCpu.
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueHipRtBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;
    auto& cache = getHostCachingAllocator<alpaka::QueueHipRtBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    // number of elements in the extent, times the size of one element
    size_t const bytes = static_cast<size_t>(alpaka::getExtentProduct(extent)) * sizeof(TElem);
    void* const raw = cache.allocate(bytes, queue);

    // the buffer's deleter passes the pointer back to the allocator's free()
    auto release = [pool = &cache](TElem* p) { pool->free(p); };

    return Buffer(dev, static_cast<TElem*>(raw), std::move(release), extent);
  }
};
//! Caching allocator trait for pinned host memory, used together with a non-blocking HIP queue.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
  //! Request enough bytes for `extent` elements of type TElem from the host caching allocator
  //! associated with alpaka::QueueHipRtNonBlocking, and wrap the memory in an alpaka::BufCpu.
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueHipRtNonBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    using Buffer = alpaka::BufCpu<TElem, TDim, TIdx>;
    auto& cache = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    // number of elements in the extent, times the size of one element
    size_t const bytes = static_cast<size_t>(alpaka::getExtentProduct(extent)) * sizeof(TElem);
    void* const raw = cache.allocate(bytes, queue);

    // the buffer's deleter passes the pointer back to the allocator's free()
    auto release = [pool = &cache](TElem* p) { pool->free(p); };

    return Buffer(dev, static_cast<TElem*>(raw), std::move(release), extent);
  }
};
//! The caching memory allocator implementation for the ROCm/HIP device
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
  //! Allocate a device buffer with the given extent from the device caching allocator
  //! obtained for `dev` and the queue type TQueue.
  //!
  //! \param dev    device the buffer is associated with
  //! \param queue  queue passed to the allocator together with the requested size
  //! \param extent extent of the buffer, in elements
  //! \return an alpaka::BufHipRt wrapping the allocated memory; its deleter passes the pointer
  //!         back to the allocator's free()
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev,
                                            TQueue queue,
                                            TExtent const& extent) -> alpaka::BufHipRt<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);

    // all byte counts are computed in size_t: sizeof(TElem) is not narrowed to TIdx first
    size_t width = alpaka::getWidth(extent);
    size_t widthBytes = width * sizeof(TElem);
    // TODO implement pitch for TDim > 1
    // the pitch is set to the row width in bytes
    size_t pitchBytes = widthBytes;
    size_t size = alpaka::getExtentProduct(extent);
    size_t sizeBytes = size * sizeof(TElem);
    void* memPtr = allocator.allocate(sizeBytes, queue);

    // use a custom deleter to return the buffer to the CachingAllocator
    auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };

    // static_cast is the appropriate named cast for converting void* to an object pointer
    return alpaka::BufHipRt<TElem, TDim, TIdx>(
        dev, static_cast<TElem*>(memPtr), std::move(deleter), extent, pitchBytes);
  }
};
#endif // ALPAKA_ACC_GPU_HIP_ENABLED
} // namespace traits
//! Allocate a buffer of TElem elements with the given extent for the device `dev`, forwarding to
//! the traits::CachedBufAlloc specialisation selected by the device and queue types.
//!
//! The buffer dimensionality is taken from alpaka::Dim<TExtent>. If `extent` is omitted, a
//! default-constructed TExtent is used.
//! This overload is enabled only when TDev is an alpaka device and TQueue is an alpaka queue.
template <typename TElem,
          typename TIdx,
          typename TExtent,
          typename TQueue,
          typename TDev,
          typename = std::enable_if_t<alpaka::isDevice<TDev> and alpaka::isQueue<TQueue>>>
ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
  using TDim = alpaka::Dim<TExtent>;
  using Impl = traits::CachedBufAlloc<TElem, TDim, TIdx, TDev, TQueue>;
  return Impl::allocCachedBuf(dev, queue, extent);
}
} // namespace cms::alpakatools
#endif // HeterogeneousCore_AlpakaInterface_interface_CachedBufAlloc_h