LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
impl/rocm.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 // Headers for HIP
28 #ifdef __HIPCC__
29 #include <thrust/system/hip/execution_policy.h>
30 #ifdef HYDROGEN_HAVE_CUB
31 #include "hipcub/block/block_reduce.hpp"
32 #endif // HYDROGEN_HAVE_CUB
33 #include <hip/hip_fp16.h>
34 #include <limits>
35 #endif // __HIPCC__
36 
37 namespace lbann {
38 
39 // -------------------------------------------------------------
40 // Device functions
41 // -------------------------------------------------------------
42 #ifdef __HIPCC__
43 
44 // Atomic add function
45 __device__ __forceinline__ __half gpu_lib::atomic_add(__half* address,
46  __half val)
47 {
48  unsigned int* address_as_uint = (unsigned int*)address;
49  unsigned int old = *address_as_uint;
50  __half* old_as_half = (__half*)&old;
51  unsigned int assumed;
52  unsigned int updated;
53  __half* updated_as_half = (__half*)&updated;
54  do {
55  assumed = old;
56  updated = old;
57  *updated_as_half += val;
58  old = atomicCAS(address_as_uint, assumed, updated);
59  } while (assumed != old);
60  return *old_as_half;
61 }
62 __device__ __forceinline__ float gpu_lib::atomic_add(float* address, float val)
63 {
64  return atomicAdd(address, val);
65 }
66 __device__ __forceinline__ double gpu_lib::atomic_add(double* address,
67  double val)
68 {
69  unsigned long long int* address_as_ull = (unsigned long long int*)address;
70  unsigned long long int old = *address_as_ull, assumed;
71  do {
72  assumed = old;
73  old = atomicCAS(address_as_ull,
74  assumed,
75  __double_as_longlong(val + __longlong_as_double(assumed)));
76 
77  // Note: uses integer comparison to avoid hang in case of NaN (since NaN !=
78  // NaN)
79  } while (assumed != old);
80  return __longlong_as_double(old);
81 }
82 
83 // Block reduction
84 template <size_t bdimx, size_t bdimy, size_t bdimz, class T>
85 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
86 {
87 #ifdef HYDROGEN_HAVE_CUB
88  constexpr auto reduce_algo = hipcub::BLOCK_REDUCE_WARP_REDUCTIONS;
89  using BlockReduce = hipcub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
90  __shared__ typename BlockReduce::TempStorage workspace;
91  val = BlockReduce(workspace).Sum(val);
92 #else
93  const size_t tid =
94  threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
95  constexpr size_t bsize = bdimx * bdimy * bdimz;
96  //__shared__ T shared_max_vals[bsize];
97  __shared__ char shared_max_vals_buffer[bsize * sizeof(T)];
98  T* shared_max_vals = reinterpret_cast<T*>(shared_max_vals);
99  shared_max_vals[tid] = val;
100  for (size_t stride = bsize / 2; stride > 0; stride /= 2) {
101  __syncthreads();
102  if (tid < stride) {
103  shared_max_vals[tid] =
104  shared_max_vals[tid] + shared_max_vals[tid + stride];
105  }
106  }
107  if (tid == 0) {
108  val = shared_max_vals[0];
109  }
110 #endif // HYDROGEN_HAVE_CUB
111  return val;
112 }
113 template <size_t bdimx, size_t bdimy, size_t bdimz, class T, class Op>
114 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
115 {
116 #ifdef HYDROGEN_HAVE_CUB
117  constexpr auto reduce_algo = hipcub::BLOCK_REDUCE_WARP_REDUCTIONS;
118  using BlockReduce = hipcub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
119  __shared__ typename BlockReduce::TempStorage workspace;
120  val = BlockReduce(workspace).Reduce(val, Op());
121 #else
122  Op op;
123  const size_t tid =
124  threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
125  constexpr size_t bsize = bdimx * bdimy * bdimz;
126  //__shared__ DataType shared_max_vals[bsize];
127  __shared__ char shared_max_vals_buffer[bsize * sizeof(T)];
128  T* shared_max_vals = reinterpret_cast<T*>(shared_max_vals);
129  shared_max_vals[tid] = val;
130  for (size_t stride = bsize / 2; stride > 0; stride /= 2) {
131  __syncthreads();
132  if (tid < stride) {
133  shared_max_vals[tid] =
134  op(shared_max_vals[tid], shared_max_vals[tid + stride]);
135  }
136  }
137  if (tid == 0) {
138  val = shared_max_vals[0];
139  }
140 #endif // HYDROGEN_HAVE_CUB
141  return val;
142 }
143 
144 // Unary math functions
145 // This support is far from complete!
146 #define WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(func) \
147  __device__ __forceinline__ __half gpu_lib::func(__half const& x) \
148  { \
149  return ::h##func(x); \
150  }
151 
152 // FIXME (trb): This is maybe not the best long-term solution, but it
153 // might be the best we can do without really digging into
154 // half-precision implementation.
155 #define WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(func) \
156  __device__ __forceinline__ __half gpu_lib::func(__half const& x) \
157  { \
158  return ::func(float(x)); \
159  }
160 
161 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(round)
162 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(ceil)
163 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(floor)
164 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(sqrt)
165 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(rsqrt)
166 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(exp)
167 
168 // WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(expm1)
169 //
170 // FIXME (trb): This is not going to be as accurate as a native expm1
171 // implementation could be:
172 __device__ __forceinline__ __half gpu_lib::expm1(__half const& x)
173 {
174  return ::__hsub(::hexp(x), ::__float2half(1.f));
175 }
176 
177 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(log)
178 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(log1p)
179 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(cos)
180 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(sin)
181 
182 // WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(tan)
183 //
184 // FIXME (trb): This just uses the trig identity. Probably less
185 // accurate than a native implementation.
186 __device__ __forceinline__ __half gpu_lib::tan(__half const& x)
187 {
188  return ::__hdiv(::hsin(x), ::hcos(x));
189 }
190 
191 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acos)
192 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asin)
193 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atan)
194 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(cosh)
195 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(sinh)
196 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(tanh)
197 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acosh)
198 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asinh)
199 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atanh)
200 #undef WRAP_UNARY_ROCM_HALF_MATH_FUNCTION
201 
202 // Binary math functions
203 __device__ __forceinline__ __half gpu_lib::min(const __half& x, const __half& y)
204 {
205  return ::__hle(x, y) ? x : y;
206 }
207 
208 __device__ __forceinline__ __half gpu_lib::max(const __half& x, const __half& y)
209 {
210  return ::__hle(x, y) ? y : x;
211 }
212 
213 // Numeric limits
214 template <typename T>
215 constexpr __device__ __forceinline__ T gpu_lib::min()
216 {
217  return std::numeric_limits<T>::min();
218 }
219 template <typename T>
220 constexpr __device__ __forceinline__ T gpu_lib::max()
221 {
222  return std::numeric_limits<T>::max();
223 }
224 template <typename T>
225 constexpr __device__ __forceinline__ T gpu_lib::epsilon()
226 {
227  return std::numeric_limits<T>::epsilon();
228 }
229 template <typename T>
230 __device__ __forceinline__ T gpu_lib::infinity()
231 {
232  return std::numeric_limits<T>::infinity();
233 }
234 
235 namespace rocm {
236 
237 // -------------------------------------------------------------
238 // Utilities for Thrust
239 // -------------------------------------------------------------
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS
241 
242 namespace thrust {
243 
244 template <typename T>
245 allocator<T>::allocator(hipStream_t stream) : m_stream(stream), m_system(stream)
246 {}
247 
248 template <typename T>
249 typename allocator<T>::pointer
250 allocator<T>::allocate(allocator<T>::size_type size)
251 {
252  value_type* buffer = nullptr;
253  if (size > 0) {
254 #ifdef HYDROGEN_HAVE_CUB
255  auto& memory_pool = El::cub::MemoryPool();
256  CHECK_ROCM(memory_pool.DeviceAllocate(reinterpret_cast<void**>(&buffer),
257  size * sizeof(value_type),
258  m_stream));
259 #else
260  CHECK_ROCM(hipMalloc(&buffer, size * sizeof(value_type)));
261 #endif // HYDROGEN_HAVE_CUB
262  }
263  return pointer(buffer);
264 }
265 
266 template <typename T>
267 void allocator<T>::deallocate(allocator<T>::pointer buffer,
268  allocator<T>::size_type size)
269 {
270  auto&& ptr = buffer.get();
271  if (ptr != nullptr) {
272 #ifdef HYDROGEN_HAVE_CUB
273  auto& memory_pool = El::cub::MemoryPool();
274  CHECK_ROCM(memory_pool.DeviceFree(ptr));
275 #else
276  CHECK_ROCM(hipFree(ptr));
277 #endif // HYDROGEN_HAVE_CUB
278  }
279 }
280 
281 template <typename T>
282 typename allocator<T>::system_type& allocator<T>::system()
283 {
284  return m_system;
285 }
286 
287 } // namespace thrust
288 #endif // !DOXYGEN_SHOULD_SKIP_THIS
289 
290 } // namespace rocm
291 
292 #endif // __HIPCC__
293 
294 } // namespace lbann