LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
impl/gpu_lib.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 namespace lbann {
28 namespace gpu_lib {
29 #if defined LBANN_HAS_CUDA
30 using namespace cuda;
31 #elif defined LBANN_HAS_ROCM
32 using namespace rocm;
33 #endif // LBANN_HAS_CUDA
34 
35 // -------------------------------------------------------------
36 // Device properties
37 // -------------------------------------------------------------
38 
39 inline void clip_grid_dims(dim3& grid_dims)
40 {
41  const auto max_grid_dims_ = max_grid_dims();
42  grid_dims.x = std::min(grid_dims.x, max_grid_dims_.x);
43  grid_dims.y = std::min(grid_dims.y, max_grid_dims_.y);
44  grid_dims.z = std::min(grid_dims.z, max_grid_dims_.z);
45 }
46 
47 // -------------------------------------------------------------
48 // Device functions
49 // -------------------------------------------------------------
50 #if defined __CUDACC__ || defined __HIPCC__
51 
52 // Unary math functions
53 #define WRAP_UNARY_MATH_FUNCTION(func) \
54  __device__ __forceinline__ float func(const float& x) \
55  { \
56  return ::func##f(x); \
57  } \
58  __device__ __forceinline__ double func(const double& x) \
59  { \
60  return ::func(x); \
61  }
62 template <typename T>
63 __device__ __forceinline__ T abs(const T& x)
64 {
65  return x >= static_cast<T>(0) ? x : -x;
66 }
67 __device__ __forceinline__ float abs(const float& x) { return ::fabsf(x); }
68 __device__ __forceinline__ double abs(const double& x) { return ::fabs(x); }
69 WRAP_UNARY_MATH_FUNCTION(round)
70 WRAP_UNARY_MATH_FUNCTION(ceil)
71 WRAP_UNARY_MATH_FUNCTION(floor)
72 WRAP_UNARY_MATH_FUNCTION(sqrt)
73 WRAP_UNARY_MATH_FUNCTION(rsqrt)
74 WRAP_UNARY_MATH_FUNCTION(exp)
75 WRAP_UNARY_MATH_FUNCTION(expm1)
76 WRAP_UNARY_MATH_FUNCTION(log)
77 WRAP_UNARY_MATH_FUNCTION(log1p)
78 WRAP_UNARY_MATH_FUNCTION(cos)
79 WRAP_UNARY_MATH_FUNCTION(sin)
80 WRAP_UNARY_MATH_FUNCTION(tan)
81 WRAP_UNARY_MATH_FUNCTION(acos)
82 WRAP_UNARY_MATH_FUNCTION(asin)
83 WRAP_UNARY_MATH_FUNCTION(atan)
84 WRAP_UNARY_MATH_FUNCTION(cosh)
85 WRAP_UNARY_MATH_FUNCTION(sinh)
86 WRAP_UNARY_MATH_FUNCTION(tanh)
87 WRAP_UNARY_MATH_FUNCTION(acosh)
88 WRAP_UNARY_MATH_FUNCTION(asinh)
89 WRAP_UNARY_MATH_FUNCTION(atanh)
90 WRAP_UNARY_MATH_FUNCTION(erf)
91 WRAP_UNARY_MATH_FUNCTION(erfinv)
92 #undef WRAP_UNARY_MATH_FUNCTION
93 
94 template <typename T>
95 __device__ __forceinline__ bool isfinite(T const& x)
96 {
97  return ::isfinite(x);
98 }
99 template <typename T>
100 __device__ __forceinline__ bool isinf(T const& x)
101 {
102  return ::isinf(x);
103 }
104 template <typename T>
105 __device__ __forceinline__ bool isnan(T const& x)
106 {
107  return ::isnan(x);
108 }
109 
110 // Binary math functions
111 #define WRAP_BINARY_MATH_FUNCTION(func) \
112  __device__ __forceinline__ float func(const float& x, const float& y) \
113  { \
114  return ::func##f(x, y); \
115  } \
116  __device__ __forceinline__ double func(const double& x, const double& y) \
117  { \
118  return ::func(x, y); \
119  }
120 template <typename T>
121 __device__ __forceinline__ T min(const T& x, const T& y)
122 {
123  return y < x ? y : x;
124 }
125 __device__ __forceinline__ float min(const float& x, const float& y)
126 {
127  return ::fminf(x, y);
128 }
129 __device__ __forceinline__ double min(const double& x, const double& y)
130 {
131  return ::fmin(x, y);
132 }
133 template <typename T>
134 __device__ __forceinline__ T max(const T& x, const T& y)
135 {
136  return y > x ? y : x;
137 }
138 __device__ __forceinline__ float max(const float& x, const float& y)
139 {
140  return ::fmaxf(x, y);
141 }
142 __device__ __forceinline__ double max(const double& x, const double& y)
143 {
144  return ::fmax(x, y);
145 }
146 __device__ __forceinline__ float mod(const float& x, const float& y)
147 {
148  return ::fmodf(x, y);
149 }
150 __device__ __forceinline__ double mod(const double& x, const double& y)
151 {
152  return ::fmod(x, y);
153 }
154 WRAP_BINARY_MATH_FUNCTION(pow)
155 #undef WRAP_BINARY_MATH_FUNCTION
156 
157 __device__ __forceinline__ __half pow(const __half& x, const __half& y)
158 {
159  return pow(float(x), float(y));
160 }
161 
162 __device__ __forceinline__ __half mod(const __half& x, const __half& y)
163 {
164  return mod(float(x), float(y));
165 }
166 
167 // FIXME (TRB): I think this is right? Borrowed the values from the
168 // sourceforge half library.
169 template <>
170 __device__ __forceinline__ __half min<__half>()
171 {
172  return __short_as_half(0x0400);
173 }
174 template <>
175 __device__ __forceinline__ __half max<__half>()
176 {
177  return __short_as_half(0x7BFF);
178 }
179 template <>
180 __device__ __forceinline__ __half epsilon<__half>()
181 {
182  return __short_as_half(0x1400);
183 }
184 template <>
185 __device__ __forceinline__ __half infinity<__half>()
186 {
187  return __short_as_half(0x7C00);
188 }
189 
190 // Array member functions
191 template <typename T, size_t N>
192 __host__ __device__ __forceinline__ size_t array<T, N>::size() const
193 {
194  return N;
195 }
196 template <typename T, size_t N>
197 __host__ __device__ __forceinline__ T& array<T, N>::operator[](size_t i)
198 {
199  return vals[i];
200 }
201 template <typename T, size_t N>
202 __host__ __device__ __forceinline__ const T&
203 array<T, N>::operator[](size_t i) const
204 {
205  return vals[i];
206 }
207 
208 #endif // __CUDACC__ || __HIPCC__
209 
210 // -------------------------------------------------------------
211 // Helper functions for entrywise operations
212 // -------------------------------------------------------------
213 #if defined __CUDACC__ || defined __HIPCC__
214 
215 namespace apply_entrywise_operator_impl {
216 
223 template <template <typename> class UnaryOperator, typename TensorDataType>
224 __global__ void unary_1d_kernel(size_t size,
225  const TensorDataType* __restrict__ input,
226  TensorDataType* __restrict__ output)
227 {
228  const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
229  const size_t nthreads = blockDim.x * gridDim.x;
230  UnaryOperator<TensorDataType> op;
231  for (size_t i = gid; i < size; i += nthreads) {
232  output[i] = op(input[i]);
233  }
234 }
235 
242 template <template <typename> class UnaryOperator, typename TensorDataType>
243 __global__ void unary_2d_kernel(size_t height,
244  size_t width,
245  const TensorDataType* __restrict__ input,
246  size_t input_ldim,
247  TensorDataType* __restrict__ output,
248  size_t output_ldim)
249 {
250  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
251  const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
252  const size_t nthreadsx = blockDim.x * gridDim.x;
253  const size_t nthreadsy = blockDim.y * gridDim.y;
254  UnaryOperator<TensorDataType> op;
255  for (size_t j = gidy; j < width; j += nthreadsy) {
256  for (size_t i = gidx; i < height; i += nthreadsx) {
257  const auto& x = input[i + j * input_ldim];
258  auto& y = output[i + j * output_ldim];
259  y = op(x);
260  }
261  }
262 }
263 
270 template <template <typename> class BinaryOperator, typename TensorDataType>
271 __global__ void binary_1d_kernel(size_t size,
272  const TensorDataType* __restrict__ input1,
273  const TensorDataType* __restrict__ input2,
274  TensorDataType* __restrict__ output)
275 {
276  const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
277  const size_t nthreads = blockDim.x * gridDim.x;
278  BinaryOperator<TensorDataType> op;
279  for (size_t i = gid; i < size; i += nthreads) {
280  output[i] = op(input1[i], input2[i]);
281  }
282 }
283 
290 template <template <typename> class BinaryOperator, typename TensorDataType>
291 __global__ void binary_2d_kernel(size_t height,
292  size_t width,
293  const TensorDataType* __restrict__ input1,
294  size_t input1_ldim,
295  const TensorDataType* __restrict__ input2,
296  size_t input2_ldim,
297  TensorDataType* __restrict__ output,
298  size_t output_ldim)
299 {
300  const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
301  const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
302  const size_t nthreadsx = blockDim.x * gridDim.x;
303  const size_t nthreadsy = blockDim.y * gridDim.y;
304  BinaryOperator<TensorDataType> op;
305  for (size_t j = gidy; j < width; j += nthreadsy) {
306  for (size_t i = gidx; i < height; i += nthreadsx) {
307  const auto& x1 = input1[i + j * input1_ldim];
308  const auto& x2 = input2[i + j * input2_ldim];
309  auto& y = output[i + j * output_ldim];
310  y = op(x1, x2);
311  }
312  }
313 }
314 
315 } // namespace apply_entrywise_operator_impl
316 
322 template <template <typename> class UnaryOp, typename TensorDataType>
324  const El::AbstractMatrix<TensorDataType>& input,
325  El::AbstractMatrix<TensorDataType>& output)
326 {
327 
328  // Check that input and output are valid
329  if (input.GetDevice() != El::Device::GPU) {
330  LBANN_ERROR("input is not on GPU");
331  }
332  else if (output.GetDevice() != El::Device::GPU) {
333  LBANN_ERROR("output is not on GPU");
334  }
335  else if (input.Height() != output.Height() ||
336  input.Width() != output.Width()) {
337  LBANN_ERROR("input matrix dimensions "
338  "(",
339  input.Height(),
340  " x ",
341  input.Width(),
342  ")"
343  "don't match output matrix dimensions "
344  "(",
345  output.Height(),
346  " x ",
347  output.Width(),
348  ")");
349  }
350 
351  // Return immediately if no compute is required
352  if (output.IsEmpty()) {
353  return;
354  }
355 
356  // Launch GPU kernel
357  if (input.Contiguous() && output.Contiguous()) {
358  dim3 block_dims, grid_dims;
359  block_dims.x = 256;
360  grid_dims.x =
361  (output.Height() * output.Width() + block_dims.x - 1) / block_dims.x;
362  gpu_lib::clip_grid_dims(grid_dims);
363  auto multisync =
364  El::MakeMultiSync(gpu::get_sync_info(output), gpu::get_sync_info(input));
365  hydrogen::gpu::LaunchKernel(
366  apply_entrywise_operator_impl::unary_1d_kernel<UnaryOp, TensorDataType>,
367  grid_dims,
368  block_dims,
369  0,
370  multisync,
371  output.Height() * output.Width(),
372  input.LockedBuffer(),
373  output.Buffer());
374  }
375  else {
376  dim3 block_dims, grid_dims;
377  block_dims.x = 256;
378  block_dims.y = 1;
379  grid_dims.x = (output.Height() + block_dims.x - 1) / block_dims.x;
380  grid_dims.y = (output.Width() + block_dims.y - 1) / block_dims.y;
381  gpu_lib::clip_grid_dims(grid_dims);
382  auto multisync =
383  El::MakeMultiSync(gpu::get_sync_info(output), gpu::get_sync_info(input));
384  hydrogen::gpu::LaunchKernel(
385  apply_entrywise_operator_impl::unary_2d_kernel<UnaryOp, TensorDataType>,
386  grid_dims,
387  block_dims,
388  0,
389  multisync,
390  input.Height(),
391  input.Width(),
392  input.LockedBuffer(),
393  input.LDim(),
394  output.Buffer(),
395  output.LDim());
396  }
397 }
398 
404 template <template <typename> class BinaryOp, typename TensorDataType>
406  const El::AbstractMatrix<TensorDataType>& input1,
407  const El::AbstractMatrix<TensorDataType>& input2,
408  El::AbstractMatrix<TensorDataType>& output)
409 {
410 
411  // Check that input and output are valid
412  if (input1.GetDevice() != El::Device::GPU ||
413  input2.GetDevice() != El::Device::GPU) {
414  LBANN_ERROR("input is not on GPU");
415  }
416  else if (output.GetDevice() != El::Device::GPU) {
417  LBANN_ERROR("output is not on GPU");
418  }
419  else if (input1.Height() != input2.Height() ||
420  input1.Width() != input2.Width() ||
421  input1.Height() != output.Height() ||
422  input1.Width() != output.Width()) {
423  LBANN_ERROR("input matrix dimensions "
424  "(",
425  input1.Height(),
426  " x ",
427  input1.Width(),
428  ", ",
429  input2.Height(),
430  " x ",
431  input2.Width(),
432  ")"
433  "don't match output matrix dimensions "
434  "(",
435  output.Height(),
436  " x ",
437  output.Width(),
438  ")");
439  }
440 
441  // Return immediately if no compute is required
442  if (output.IsEmpty()) {
443  return;
444  }
445 
446  // Launch GPU kernel
447  if (input1.Contiguous() && input2.Contiguous() && output.Contiguous()) {
448  dim3 block_dims, grid_dims;
449  block_dims.x = 256;
450  grid_dims.x =
451  (output.Height() * output.Width() + block_dims.x - 1) / block_dims.x;
452  gpu_lib::clip_grid_dims(grid_dims);
453  auto multisync = El::MakeMultiSync(gpu::get_sync_info(output),
454  gpu::get_sync_info(input1),
455  gpu::get_sync_info(input2));
456  hydrogen::gpu::LaunchKernel(
457  apply_entrywise_operator_impl::binary_1d_kernel<BinaryOp, TensorDataType>,
458  grid_dims,
459  block_dims,
460  0,
461  multisync,
462  output.Height() * output.Width(),
463  input1.LockedBuffer(),
464  input2.LockedBuffer(),
465  output.Buffer());
466  }
467  else {
468  dim3 block_dims, grid_dims;
469  block_dims.x = 256;
470  block_dims.y = 1;
471  grid_dims.x = (output.Height() + block_dims.x - 1) / block_dims.x;
472  grid_dims.y = (output.Width() + block_dims.y - 1) / block_dims.y;
473  gpu_lib::clip_grid_dims(grid_dims);
474  auto multisync = El::MakeMultiSync(gpu::get_sync_info(output),
475  gpu::get_sync_info(input1),
476  gpu::get_sync_info(input2));
477  hydrogen::gpu::LaunchKernel(
478  apply_entrywise_operator_impl::binary_2d_kernel<BinaryOp, TensorDataType>,
479  grid_dims,
480  block_dims,
481  0,
482  multisync,
483  output.Height(),
484  output.Width(),
485  input1.LockedBuffer(),
486  input1.LDim(),
487  input2.LockedBuffer(),
488  input2.LDim(),
489  output.Buffer(),
490  output.LDim());
491  }
492 }
493 
498 template <template <typename> class UnaryOperator, typename TensorDataType>
500  const El::AbstractDistMatrix<TensorDataType>& input,
501  El::AbstractDistMatrix<TensorDataType>& output)
502 {
503  if (input.Height() != output.Height() || input.Width() != output.Width()) {
504  LBANN_ERROR("input matrix dimensions "
505  "(",
506  input.Height(),
507  " x ",
508  input.Width(),
509  ")"
510  "don't match output matrix dimensions "
511  "(",
512  output.Height(),
513  " x ",
514  output.Width(),
515  ")");
516  }
517  else if (input.DistData() != output.DistData()) {
518  LBANN_ERROR("input and output matrix distributions don't match");
519  }
520  apply_entrywise_unary_operator<UnaryOperator>(input.LockedMatrix(),
521  output.Matrix());
522 }
523 
528 template <template <typename> class BinaryOperator, typename TensorDataType>
530  const El::AbstractDistMatrix<TensorDataType>& input1,
531  const El::AbstractDistMatrix<TensorDataType>& input2,
532  El::AbstractDistMatrix<TensorDataType>& output)
533 {
534  if (input1.Height() != input2.Height() || input1.Width() != input2.Width() ||
535  input1.Height() != output.Height() || input1.Width() != output.Width()) {
536  LBANN_ERROR("input matrix dimensions "
537  "(",
538  input1.Height(),
539  " x ",
540  input1.Width(),
541  ", ",
542  input2.Height(),
543  " x ",
544  input2.Width(),
545  ")"
546  "don't match output matrix dimensions "
547  "(",
548  output.Height(),
549  " x ",
550  output.Width(),
551  ")");
552  }
553  else if (input1.DistData() != input2.DistData() ||
554  input1.DistData() != output.DistData()) {
555  LBANN_ERROR("input and output matrix distributions don't match");
556  }
557  apply_entrywise_binary_operator<BinaryOperator>(input1.LockedMatrix(),
558  input2.LockedMatrix(),
559  output.Matrix());
560 }
561 
562 #endif // __CUDACC__ || __HIPCC__
563 
564 } // namespace gpu_lib
565 } // namespace lbann
void apply_entrywise_binary_operator(const El::AbstractMatrix< TensorDataType > &input1, const El::AbstractMatrix< TensorDataType > &input2, El::AbstractMatrix< TensorDataType > &output)
void clip_grid_dims(dim3 &grid_dims)
#define LBANN_ERROR(...)
Definition: exception.hpp:37
void apply_entrywise_unary_operator(const El::AbstractMatrix< TensorDataType > &input, El::AbstractMatrix< TensorDataType > &output)
El::SyncInfo< El::Device::GPU > get_sync_info(El::AbstractMatrix< TensorDataType > const &m)
Get a SyncInfo from an AbstractMatrix.