LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
gpu/cuda.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_UTILS_CUDA_HPP
28 #define LBANN_UTILS_CUDA_HPP
29 
31 #include "lbann_config.hpp"
32 
33 #ifdef LBANN_HAS_CUDA
34 
35 #include <cuda.h>
36 #include <thrust/detail/allocator/tagged_allocator.h>
37 #include <thrust/device_vector.h>
38 #include <thrust/memory.h>
39 #include <thrust/system/cuda/detail/par.h>
40 #include <thrust/version.h>
41 
42 // -------------------------------------------------------------
43 // Error utility macros
44 // -------------------------------------------------------------
45 #define LBANN_CUDA_SYNC(async) \
46  do { \
47  /* Synchronize GPU and check for errors. */ \
48  cudaError_t status_CUDA_SYNC = cudaDeviceSynchronize(); \
49  if (status_CUDA_SYNC == cudaSuccess) \
50  status_CUDA_SYNC = cudaGetLastError(); \
51  if (status_CUDA_SYNC != cudaSuccess) { \
52  LBANN_ERROR((async ? "Asynchronous " : ""), \
53  "CUDA error (", \
54  cudaGetErrorString(status_CUDA_SYNC), \
55  ")"); \
56  } \
57  } while (0)
58 #define LBANN_CUDA_CHECK_LAST_ERROR(async) \
59  do { \
60  cudaError_t status = cudaGetLastError(); \
61  if (status != cudaSuccess) { \
62  LBANN_ERROR((async ? "Asynchronous " : ""), \
63  "CUDA error (", \
64  cudaGetErrorString(status), \
65  ")"); \
66  } \
67  } while (0)
68 #define FORCE_CHECK_CUDA(cuda_call) \
69  do { \
70  /* Call CUDA API routine, synchronizing before and */ \
71  /* after to check for errors. */ \
72  LBANN_CUDA_SYNC(true); \
73  cudaError_t status_CHECK_CUDA = (cuda_call); \
74  if (status_CHECK_CUDA != cudaSuccess) { \
75  LBANN_ERROR("CUDA error (", cudaGetErrorString(status_CHECK_CUDA), ")"); \
76  } \
77  LBANN_CUDA_SYNC(false); \
78  } while (0)
79 #define FORCE_CHECK_CUDA_NOSYNC(cuda_call) \
80  do { \
81  cudaError_t status_CHECK_CUDA = (cuda_call); \
82  if (status_CHECK_CUDA != cudaSuccess) { \
83  LBANN_ERROR("CUDA error (", cudaGetErrorString(status_CHECK_CUDA), ")"); \
84  } \
85  } while (0)
86 #ifdef LBANN_DEBUG
87 #define CHECK_CUDA(cuda_call) FORCE_CHECK_CUDA(cuda_call);
88 #else
89 #define CHECK_CUDA(cuda_call) FORCE_CHECK_CUDA_NOSYNC(cuda_call)
90 #endif // #ifdef LBANN_DEBUG
91 
92 namespace lbann {
93 namespace cuda {
94 
95 constexpr cudaMemcpyKind GPU_MEMCPY_DEVICE_TO_DEVICE = cudaMemcpyDeviceToDevice;
96 
97 // -------------------------------------------------------------
98 // Wrapper classes
99 // -------------------------------------------------------------
100 
102 class event_wrapper
103 {
104 public:
105  event_wrapper();
106  event_wrapper(const event_wrapper& other);
107  event_wrapper& operator=(const event_wrapper& other);
108  ~event_wrapper();
110  void record(cudaStream_t stream);
112  bool query() const;
114  void synchronize();
116  cudaEvent_t& get_event();
117 
118 private:
122  cudaEvent_t m_event;
126  cudaStream_t m_stream;
127 };
128 
130 class Graph
131 {
132 
133 public:
134  Graph(cudaGraph_t graph = nullptr);
135  ~Graph();
136 
137  // Copy-and-swap idiom
138  Graph(const Graph&);
139  Graph(Graph&&);
140  Graph& operator=(Graph);
141  friend void swap(Graph& first, Graph& second);
142 
144  void reset(cudaGraph_t graph = nullptr);
146  cudaGraph_t release();
148  cudaGraph_t get() const noexcept;
150  operator cudaGraph_t() const noexcept;
151 
156  void create();
157 
159  static void
160  begin_capture(cudaStream_t stream,
161  cudaStreamCaptureMode mode = cudaStreamCaptureModeGlobal);
163  static Graph end_capture(cudaStream_t stream);
164 
165 private:
166  cudaGraph_t graph_{nullptr};
167 };
168 
170 class ExecutableGraph
171 {
172 
173 public:
174  ExecutableGraph(cudaGraphExec_t graph_exec = nullptr);
175  ExecutableGraph(cudaGraph_t graph);
176  ~ExecutableGraph();
177 
178  // Copy-and-swap idiom
179  ExecutableGraph(const ExecutableGraph&) = delete;
180  ExecutableGraph(ExecutableGraph&&);
181  ExecutableGraph& operator=(ExecutableGraph);
182  friend void swap(ExecutableGraph& first, ExecutableGraph& second);
183 
185  void reset(cudaGraphExec_t graph = nullptr);
187  cudaGraphExec_t release();
189  cudaGraphExec_t get() const noexcept;
191  operator cudaGraphExec_t() const noexcept;
192 
194  void launch(cudaStream_t stream) const;
195 
201  void update(cudaGraph_t graph);
202 
203 private:
204  cudaGraphExec_t graph_exec_{nullptr};
205 };
206 
207 // -------------------------------------------------------------
208 // Helper functions for tensor operations
209 // -------------------------------------------------------------
210 
212 template <typename TensorDataType>
213 void copy_tensor(cudaStream_t stream,
214  const std::vector<size_t>& dims,
215  const TensorDataType* input,
216  const std::vector<size_t>& input_strides,
217  TensorDataType* output,
218  const std::vector<size_t>& output_strides);
219 
220 void mem_copy_async(void* output,
221  const void* input,
222  const size_t count,
223  cudaMemcpyKind kind,
224  cudaStream_t stream);
225 
226 // -------------------------------------------------------------
227 // Utilities for Thrust
228 // -------------------------------------------------------------
229 namespace thrust {
230 
232 using execute_on_stream
233 #if THRUST_MAJOR_VERSION > 1 || THRUST_MINOR_VERSION >= 9
234  = ::thrust::cuda_cub::execute_on_stream; // >= 1.9.1
235 #elif THRUST_MAJOR_VERSION == 1 && THRUST_MINOR_VERSION == 8
236  = ::thrust::system::cuda::detail::execute_on_stream;
237 #else
238  = std::nullptr_t;
239 static_assert(false, "Thrust 1.8 or newer is required");
240 #endif
241 
246 template <typename T = El::byte>
247 class allocator : public ::thrust::detail::tagged_allocator<
248  T,
249  execute_on_stream,
250  ::thrust::pointer<T, execute_on_stream>>
251 {
252 public:
253  // Convenient typedefs
254  typedef ::thrust::detail::tagged_allocator<
255  T,
256  execute_on_stream,
257  ::thrust::pointer<T, execute_on_stream>>
258  parent_class;
259  typedef typename parent_class::value_type value_type;
260  typedef typename parent_class::pointer pointer;
261  typedef typename parent_class::size_type size_type;
262  typedef typename parent_class::system_type system_type;
263 
265  allocator(cudaStream_t stream = hydrogen::cuda::GetDefaultStream());
267  pointer allocate(size_type size);
271  void deallocate(pointer buffer, size_type size = 0);
273  system_type& system();
274 
275 private:
277  cudaStream_t m_stream;
279  system_type m_system;
280 };
281 
283 template <typename T>
284 using vector = ::thrust::device_vector<T, allocator<T>>;
285 
286 } // namespace thrust
287 } // namespace cuda
288 } // namespace lbann
289 
290 #endif // LBANN_HAS_CUDA
291 
292 #endif // LBANN_UTILS_CUDA_HPP