27 #ifndef LBANN_UTILS_CUDA_HPP 28 #define LBANN_UTILS_CUDA_HPP 31 #include "lbann_config.hpp" 36 #include <thrust/detail/allocator/tagged_allocator.h> 37 #include <thrust/device_vector.h> 38 #include <thrust/memory.h> 39 #include <thrust/system/cuda/detail/par.h> 40 #include <thrust/version.h> 45 #define LBANN_CUDA_SYNC(async) \ 48 cudaError_t status_CUDA_SYNC = cudaDeviceSynchronize(); \ 49 if (status_CUDA_SYNC == cudaSuccess) \ 50 status_CUDA_SYNC = cudaGetLastError(); \ 51 if (status_CUDA_SYNC != cudaSuccess) { \ 52 LBANN_ERROR((async ? "Asynchronous " : ""), \ 54 cudaGetErrorString(status_CUDA_SYNC), \ 58 #define LBANN_CUDA_CHECK_LAST_ERROR(async) \ 60 cudaError_t status = cudaGetLastError(); \ 61 if (status != cudaSuccess) { \ 62 LBANN_ERROR((async ? "Asynchronous " : ""), \ 64 cudaGetErrorString(status), \ 68 #define FORCE_CHECK_CUDA(cuda_call) \ 72 LBANN_CUDA_SYNC(true); \ 73 cudaError_t status_CHECK_CUDA = (cuda_call); \ 74 if (status_CHECK_CUDA != cudaSuccess) { \ 75 LBANN_ERROR("CUDA error (", cudaGetErrorString(status_CHECK_CUDA), ")"); \ 77 LBANN_CUDA_SYNC(false); \ 79 #define FORCE_CHECK_CUDA_NOSYNC(cuda_call) \ 81 cudaError_t status_CHECK_CUDA = (cuda_call); \ 82 if (status_CHECK_CUDA != cudaSuccess) { \ 83 LBANN_ERROR("CUDA error (", cudaGetErrorString(status_CHECK_CUDA), ")"); \ 87 #define CHECK_CUDA(cuda_call) FORCE_CHECK_CUDA(cuda_call); 89 #define CHECK_CUDA(cuda_call) FORCE_CHECK_CUDA_NOSYNC(cuda_call) 90 #endif // #ifdef LBANN_DEBUG 95 constexpr cudaMemcpyKind GPU_MEMCPY_DEVICE_TO_DEVICE = cudaMemcpyDeviceToDevice;
106 event_wrapper(
const event_wrapper& other);
107 event_wrapper& operator=(
const event_wrapper& other);
110 void record(cudaStream_t stream);
116 cudaEvent_t& get_event();
126 cudaStream_t m_stream;
134 Graph(cudaGraph_t graph =
nullptr);
140 Graph& operator=(Graph);
141 friend void swap(Graph& first, Graph& second);
144 void reset(cudaGraph_t graph =
nullptr);
146 cudaGraph_t release();
148 cudaGraph_t
get()
const noexcept;
150 operator cudaGraph_t() const noexcept;
160 begin_capture(cudaStream_t stream,
161 cudaStreamCaptureMode mode = cudaStreamCaptureModeGlobal);
163 static Graph end_capture(cudaStream_t stream);
166 cudaGraph_t graph_{
nullptr};
170 class ExecutableGraph
174 ExecutableGraph(cudaGraphExec_t graph_exec =
nullptr);
175 ExecutableGraph(cudaGraph_t graph);
179 ExecutableGraph(
const ExecutableGraph&) =
delete;
180 ExecutableGraph(ExecutableGraph&&);
181 ExecutableGraph& operator=(ExecutableGraph);
182 friend void swap(ExecutableGraph& first, ExecutableGraph& second);
185 void reset(cudaGraphExec_t graph =
nullptr);
187 cudaGraphExec_t release();
189 cudaGraphExec_t
get()
const noexcept;
191 operator cudaGraphExec_t() const noexcept;
194 void launch(cudaStream_t stream) const;
201 void update(cudaGraph_t graph);
204 cudaGraphExec_t graph_exec_{
nullptr};
212 template <
typename TensorDataType>
213 void copy_tensor(cudaStream_t stream,
214 const std::vector<size_t>& dims,
215 const TensorDataType* input,
216 const std::vector<size_t>& input_strides,
217 TensorDataType* output,
218 const std::vector<size_t>& output_strides);
220 void mem_copy_async(
void* output,
224 cudaStream_t stream);
232 using execute_on_stream
233 #if THRUST_MAJOR_VERSION > 1 || THRUST_MINOR_VERSION >= 9 234 = ::thrust::cuda_cub::execute_on_stream;
235 #elif THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION == 8 236 = ::thrust::system::cuda::detail::execute_on_stream;
239 static_assert(
false,
"Thrust 1.8 or newer is required");
246 template <
typename T = El::byte>
247 class allocator :
public ::thrust::detail::tagged_allocator<
250 ::thrust::pointer<T, execute_on_stream>>
254 typedef ::thrust::detail::tagged_allocator<
257 ::thrust::pointer<T, execute_on_stream>>
259 typedef typename parent_class::value_type value_type;
260 typedef typename parent_class::pointer pointer;
261 typedef typename parent_class::size_type size_type;
262 typedef typename parent_class::system_type system_type;
265 allocator(cudaStream_t stream = hydrogen::cuda::GetDefaultStream());
267 pointer allocate(size_type size);
271 void deallocate(pointer buffer, size_type size = 0);
273 system_type& system();
277 cudaStream_t m_stream;
279 system_type m_system;
283 template <
typename T>
284 using vector = ::thrust::device_vector<T, allocator<T>>;
290 #endif // LBANN_HAS_CUDA 292 #endif // LBANN_UTILS_CUDA_HPP