LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
impl/cuda.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 // Headers for NVCC
28 #ifdef __CUDACC__
29 #include <thrust/system/cuda/execution_policy.h>
30 #ifdef HYDROGEN_HAVE_CUB
31 #include "cub/block/block_reduce.cuh"
32 #endif // HYDROGEN_HAVE_CUB
33 #include <cuda_fp16.h>
34 #include <math_constants.h>
35 #endif // __CUDACC__
36 
37 namespace lbann {
38 
39 // -------------------------------------------------------------
40 // Device functions
41 // -------------------------------------------------------------
42 #ifdef __CUDACC__
43 
44 // Atomic add function
45 #if __CUDA_ARCH__ >= 530
46 __device__ __forceinline__ __half gpu_lib::atomic_add(__half* address,
47  __half val)
48 {
49 #if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)
50  return atomicAdd(address, val);
51 #else
52  unsigned int* address_as_uint = (unsigned int*)address;
53  unsigned int old = *address_as_uint;
54  __half* old_as_half = (__half*)&old;
55  unsigned int assumed;
56  unsigned int updated;
57  __half* updated_as_half = (__half*)&updated;
58  do {
59  assumed = old;
60  updated = old;
61  *updated_as_half += val;
62  old = atomicCAS(address_as_uint, assumed, updated);
63  } while (assumed != old);
64  return *old_as_half;
65 #endif // __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__)
66 }
67 #endif // __CUDA_ARCH__ >= 530
68 __device__ __forceinline__ float gpu_lib::atomic_add(float* address, float val)
69 {
70  return atomicAdd(address, val);
71 }
72 __device__ __forceinline__ double gpu_lib::atomic_add(double* address,
73  double val)
74 {
75 #if __CUDA_ARCH__ >= 600
76  return atomicAdd(address, val);
77 #else
78  unsigned long long int* address_as_ull = (unsigned long long int*)address;
79  unsigned long long int old = *address_as_ull, assumed;
80  do {
81  assumed = old;
82  old = atomicCAS(address_as_ull,
83  assumed,
84  __double_as_longlong(val + __longlong_as_double(assumed)));
85  } while (assumed != old);
86  return __longlong_as_double(old);
87 #endif // __CUDA_ARCH__ < 600
88 }
89 
90 // Block reduction
91 template <size_t bdimx, size_t bdimy, size_t bdimz, class T>
92 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
93 {
94 #ifdef HYDROGEN_HAVE_CUB
95  constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
96  using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
97  __shared__ typename BlockReduce::TempStorage workspace;
98  val = BlockReduce(workspace).Sum(val);
99 #else
100  const size_t tid =
101  threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
102  constexpr size_t bsize = bdimx * bdimy * bdimz;
103  __shared__ DataType shared_max_vals[bsize];
104  shared_max_vals[tid] = val;
105  for (size_t stride = bsize / 2; stride > 0; stride /= 2) {
106  __syncthreads();
107  if (tid < stride) {
108  shared_max_vals[tid] =
109  shared_max_vals[tid] + shared_max_vals[tid + stride];
110  }
111  }
112  if (tid == 0) {
113  val = shared_max_vals[0];
114  }
115 #endif // HYDROGEN_HAVE_CUB
116  return val;
117 }
118 template <size_t bdimx, size_t bdimy, size_t bdimz, class T, class Op>
119 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
120 {
121 #ifdef HYDROGEN_HAVE_CUB
122  constexpr auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
123  using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
124  __shared__ typename BlockReduce::TempStorage workspace;
125  val = BlockReduce(workspace).Reduce(val, Op());
126 #else
127  Op op;
128  const size_t tid =
129  threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
130  constexpr size_t bsize = bdimx * bdimy * bdimz;
131  __shared__ DataType shared_max_vals[bsize];
132  shared_max_vals[tid] = val;
133  for (size_t stride = bsize / 2; stride > 0; stride /= 2) {
134  __syncthreads();
135  if (tid < stride) {
136  shared_max_vals[tid] =
137  op(shared_max_vals[tid], shared_max_vals[tid + stride]);
138  }
139  }
140  if (tid == 0) {
141  val = shared_max_vals[0];
142  }
143 #endif // HYDROGEN_HAVE_CUB
144  return val;
145 }
146 
147 // Unary math functions
148 #if __CUDA_ARCH__ >= 530
149 template <>
150 __device__ __forceinline__ bool gpu_lib::isfinite(__half const& x)
151 {
152  return !(::__isnan(x) || ::__hisinf(x));
153 }
154 template <>
155 __device__ __forceinline__ bool gpu_lib::isinf(__half const& x)
156 {
157  return ::__hisinf(x);
158 }
159 template <>
160 __device__ __forceinline__ bool gpu_lib::isnan(__half const& x)
161 {
162  return ::__hisnan(x);
163 }
164 
165 // This support is far from complete!
166 #define WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(func) \
167  __device__ __forceinline__ __half gpu_lib::func(__half const& x) \
168  { \
169  return ::h##func(x); \
170  }
171 
172 // FIXME (trb): This is maybe not the best long-term solution, but it
173 // might be the best we can do without really digging into
174 // half-precision implementation.
175 #define WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(func) \
176  __device__ __forceinline__ __half gpu_lib::func(__half const& x) \
177  { \
178  return func(float(x)); \
179  }
180 
181 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(round)
182 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(ceil)
183 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(floor)
184 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(sqrt)
185 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(rsqrt)
186 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(exp)
187 // WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(expm1)
188 //
189 // FIXME (trb): This is not going to be as accurate as a native expm1
190 // implementation could be:
191 __device__ __forceinline__ __half gpu_lib::expm1(__half const& x)
192 {
193  return ::__hsub(::hexp(x), ::__float2half(1.f));
194 }
195 
196 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(log)
197 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(log1p)
198 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(cos)
199 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(sin)
200 
201 // WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(tan)
202 //
203 // FIXME (trb): This just uses the trig identity. Probably less
204 // accurate than a native implementation.
205 __device__ __forceinline__ __half gpu_lib::tan(__half const& x)
206 {
207  return ::__hdiv(::hsin(x), ::hcos(x));
208 }
209 
210 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acos)
211 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asin)
212 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atan)
213 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(cosh)
214 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(sinh)
215 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(tanh)
216 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acosh)
217 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asinh)
218 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atanh)
219 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(erf)
220 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(erfinv)
221 #undef WRAP_UNARY_CUDA_HALF_MATH_FUNCTION
222 
223 // Binary math functions
224 __device__ __forceinline__ __half gpu_lib::min(const __half& x, const __half& y)
225 {
226  return ::__hle(x, y) ? x : y;
227 }
228 
229 __device__ __forceinline__ __half gpu_lib::max(const __half& x, const __half& y)
230 {
231  return ::__hle(x, y) ? y : x;
232 }
233 #endif // __CUDA_ARCH__ >= 530
234 
235 // Numeric limits
236 #ifdef __CUDACC_RELAXED_CONSTEXPR__
237 template <typename T>
238 constexpr __device__ __forceinline__ T min()
239 {
240  return std::numeric_limits<T>::min();
241 }
242 template <typename T>
243 constexpr __device__ __forceinline__ T max()
244 {
245  return std::numeric_limits<T>::min();
246 }
247 template <typename T>
248 constexpr __device__ __forceinline__ T epsilon()
249 {
250  return std::numeric_limits<T>::epsilon();
251 }
252 template <typename T>
253 __device__ __forceinline__ T infinity()
254 {
255  return std::numeric_limits<T>::infinity();
256 }
257 #else // __CUDACC_RELAXED_CONSTEXPR__
258 #define SPECIFIERS \
259  template <> \
260  __device__ __forceinline__
261 SPECIFIERS constexpr float gpu_lib::min<float>() { return FLT_MIN; }
262 SPECIFIERS constexpr double gpu_lib::min<double>() { return DBL_MIN; }
263 SPECIFIERS constexpr int gpu_lib::min<int>() { return INT_MIN; }
264 SPECIFIERS constexpr long int gpu_lib::min<long int>() { return LONG_MIN; }
265 SPECIFIERS constexpr long long int gpu_lib::min<long long int>()
266 {
267  return LLONG_MIN;
268 }
269 SPECIFIERS constexpr float gpu_lib::max<float>() { return FLT_MAX; }
270 SPECIFIERS constexpr double gpu_lib::max<double>() { return DBL_MAX; }
271 SPECIFIERS constexpr int gpu_lib::max<int>() { return INT_MAX; }
272 SPECIFIERS constexpr long int gpu_lib::max<long int>() { return LONG_MAX; }
273 SPECIFIERS constexpr long long int gpu_lib::max<long long int>()
274 {
275  return LLONG_MAX;
276 }
277 SPECIFIERS constexpr float gpu_lib::epsilon<float>() { return FLT_EPSILON; }
278 SPECIFIERS constexpr double gpu_lib::epsilon<double>() { return DBL_EPSILON; }
279 SPECIFIERS float gpu_lib::infinity<float>() { return CUDART_INF_F; }
280 SPECIFIERS double gpu_lib::infinity<double>() { return CUDART_INF; }
281 #undef SPECIFIERS
282 #endif // __CUDACC_RELAXED_CONSTEXPR__
283 
284 namespace cuda {
285 
286 // -------------------------------------------------------------
287 // Utilities for Thrust
288 // -------------------------------------------------------------
289 #ifndef DOXYGEN_SHOULD_SKIP_THIS
290 
291 namespace thrust {
292 
293 template <typename T>
294 allocator<T>::allocator(cudaStream_t stream)
295  : m_stream(stream), m_system(stream)
296 {}
297 
298 template <typename T>
299 typename allocator<T>::pointer
300 allocator<T>::allocate(allocator<T>::size_type size)
301 {
302  value_type* buffer = nullptr;
303  if (size > 0) {
304 #ifdef HYDROGEN_HAVE_CUB
305  auto& memory_pool = El::cub::MemoryPool();
306  CHECK_CUDA(memory_pool.DeviceAllocate(reinterpret_cast<void**>(&buffer),
307  size * sizeof(value_type),
308  m_stream));
309 #else
310  CHECK_CUDA(cudaMalloc(&buffer, size * sizeof(value_type)));
311 #endif // HYDROGEN_HAVE_CUB
312  }
313  return pointer(buffer);
314 }
315 
316 template <typename T>
317 void allocator<T>::deallocate(allocator<T>::pointer buffer,
318  allocator<T>::size_type size)
319 {
320  auto&& ptr = buffer.get();
321  if (ptr != nullptr) {
322 #ifdef HYDROGEN_HAVE_CUB
323  auto& memory_pool = El::cub::MemoryPool();
324  CHECK_CUDA(memory_pool.DeviceFree(ptr));
325 #else
326  CHECK_CUDA(cudaFree(ptr));
327 #endif // HYDROGEN_HAVE_CUB
328  }
329 }
330 
331 template <typename T>
332 typename allocator<T>::system_type& allocator<T>::system()
333 {
334  return m_system;
335 }
336 
337 } // namespace thrust
338 #endif // !DOXYGEN_SHOULD_SKIP_THIS
339 
340 } // namespace cuda
341 
342 #endif // __CUDACC__
343 
344 } // namespace lbann