29 #if defined LBANN_HAS_CUDA 31 #elif defined LBANN_HAS_ROCM 33 #endif // LBANN_HAS_CUDA 41 const auto max_grid_dims_ = max_grid_dims();
42 grid_dims.x = std::min(grid_dims.x, max_grid_dims_.x);
43 grid_dims.y = std::min(grid_dims.y, max_grid_dims_.y);
44 grid_dims.z = std::min(grid_dims.z, max_grid_dims_.z);
50 #if defined __CUDACC__ || defined __HIPCC__ 53 #define WRAP_UNARY_MATH_FUNCTION(func) \ 54 __device__ __forceinline__ float func(const float& x) \ 56 return ::func##f(x); \ 58 __device__ __forceinline__ double func(const double& x) \ 63 __device__ __forceinline__ T abs(
const T& x)
65 return x >=
static_cast<T
>(0) ? x : -x;
67 __device__ __forceinline__
float abs(
const float& x) { return ::fabsf(x); }
68 __device__ __forceinline__
double abs(
const double& x) { return ::fabs(x); }
69 WRAP_UNARY_MATH_FUNCTION(round)
70 WRAP_UNARY_MATH_FUNCTION(ceil)
71 WRAP_UNARY_MATH_FUNCTION(floor)
72 WRAP_UNARY_MATH_FUNCTION(sqrt)
73 WRAP_UNARY_MATH_FUNCTION(rsqrt)
74 WRAP_UNARY_MATH_FUNCTION(exp)
75 WRAP_UNARY_MATH_FUNCTION(expm1)
76 WRAP_UNARY_MATH_FUNCTION(log)
77 WRAP_UNARY_MATH_FUNCTION(log1p)
78 WRAP_UNARY_MATH_FUNCTION(cos)
79 WRAP_UNARY_MATH_FUNCTION(sin)
80 WRAP_UNARY_MATH_FUNCTION(tan)
81 WRAP_UNARY_MATH_FUNCTION(acos)
82 WRAP_UNARY_MATH_FUNCTION(asin)
83 WRAP_UNARY_MATH_FUNCTION(atan)
84 WRAP_UNARY_MATH_FUNCTION(cosh)
85 WRAP_UNARY_MATH_FUNCTION(sinh)
86 WRAP_UNARY_MATH_FUNCTION(tanh)
87 WRAP_UNARY_MATH_FUNCTION(acosh)
88 WRAP_UNARY_MATH_FUNCTION(asinh)
89 WRAP_UNARY_MATH_FUNCTION(atanh)
90 WRAP_UNARY_MATH_FUNCTION(erf)
91 WRAP_UNARY_MATH_FUNCTION(erfinv)
92 #undef WRAP_UNARY_MATH_FUNCTION 95 __device__ __forceinline__
bool isfinite(T
const& x)
100 __device__ __forceinline__
bool isinf(T
const& x)
104 template <
typename T>
105 __device__ __forceinline__
bool isnan(T
const& x)
111 #define WRAP_BINARY_MATH_FUNCTION(func) \ 112 __device__ __forceinline__ float func(const float& x, const float& y) \ 114 return ::func##f(x, y); \ 116 __device__ __forceinline__ double func(const double& x, const double& y) \ 118 return ::func(x, y); \ 120 template <
typename T>
121 __device__ __forceinline__ T min(
const T& x,
const T& y)
123 return y < x ? y : x;
125 __device__ __forceinline__
float min(
const float& x,
const float& y)
127 return ::fminf(x, y);
129 __device__ __forceinline__
double min(
const double& x,
const double& y)
133 template <
typename T>
134 __device__ __forceinline__ T max(
const T& x,
const T& y)
136 return y > x ? y : x;
138 __device__ __forceinline__
float max(
const float& x,
const float& y)
140 return ::fmaxf(x, y);
142 __device__ __forceinline__
double max(
const double& x,
const double& y)
146 __device__ __forceinline__
float mod(
const float& x,
const float& y)
148 return ::fmodf(x, y);
150 __device__ __forceinline__
double mod(
const double& x,
const double& y)
154 WRAP_BINARY_MATH_FUNCTION(pow)
155 #undef WRAP_BINARY_MATH_FUNCTION 157 __device__ __forceinline__ __half pow(
const __half& x,
const __half& y)
159 return pow(
float(x),
float(y));
162 __device__ __forceinline__ __half mod(
const __half& x,
const __half& y)
164 return mod(
float(x),
float(y));
170 __device__ __forceinline__ __half min<__half>()
172 return __short_as_half(0x0400);
175 __device__ __forceinline__ __half max<__half>()
177 return __short_as_half(0x7BFF);
180 __device__ __forceinline__ __half epsilon<__half>()
182 return __short_as_half(0x1400);
185 __device__ __forceinline__ __half infinity<__half>()
187 return __short_as_half(0x7C00);
191 template <
typename T,
size_t N>
192 __host__ __device__ __forceinline__
size_t array<T, N>::size()
const 196 template <
typename T,
size_t N>
197 __host__ __device__ __forceinline__ T& array<T, N>::operator[](
size_t i)
201 template <
typename T,
size_t N>
202 __host__ __device__ __forceinline__
const T&
203 array<T, N>::operator[](
size_t i)
const 208 #endif // __CUDACC__ || __HIPCC__ 213 #if defined __CUDACC__ || defined __HIPCC__ 215 namespace apply_entrywise_operator_impl {
223 template <
template <
typename>
class UnaryOperator,
typename TensorDataType>
224 __global__
void unary_1d_kernel(
size_t size,
225 const TensorDataType* __restrict__ input,
226 TensorDataType* __restrict__ output)
228 const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
229 const size_t nthreads = blockDim.x * gridDim.x;
230 UnaryOperator<TensorDataType> op;
231 for (
size_t i = gid; i < size; i += nthreads) {
232 output[i] = op(input[i]);
242 template <
template <
typename>
class UnaryOperator,
typename TensorDataType>
243 __global__
void unary_2d_kernel(
size_t height,
245 const TensorDataType* __restrict__ input,
247 TensorDataType* __restrict__ output,
250 const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
251 const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
252 const size_t nthreadsx = blockDim.x * gridDim.x;
253 const size_t nthreadsy = blockDim.y * gridDim.y;
254 UnaryOperator<TensorDataType> op;
255 for (
size_t j = gidy; j < width; j += nthreadsy) {
256 for (
size_t i = gidx; i < height; i += nthreadsx) {
257 const auto& x = input[i + j * input_ldim];
258 auto& y = output[i + j * output_ldim];
270 template <
template <
typename>
class BinaryOperator,
typename TensorDataType>
271 __global__
void binary_1d_kernel(
size_t size,
272 const TensorDataType* __restrict__ input1,
273 const TensorDataType* __restrict__ input2,
274 TensorDataType* __restrict__ output)
276 const size_t gid = threadIdx.x + blockIdx.x * blockDim.x;
277 const size_t nthreads = blockDim.x * gridDim.x;
278 BinaryOperator<TensorDataType> op;
279 for (
size_t i = gid; i < size; i += nthreads) {
280 output[i] = op(input1[i], input2[i]);
290 template <
template <
typename>
class BinaryOperator,
typename TensorDataType>
291 __global__
void binary_2d_kernel(
size_t height,
293 const TensorDataType* __restrict__ input1,
295 const TensorDataType* __restrict__ input2,
297 TensorDataType* __restrict__ output,
300 const size_t gidx = threadIdx.x + blockIdx.x * blockDim.x;
301 const size_t gidy = threadIdx.y + blockIdx.y * blockDim.y;
302 const size_t nthreadsx = blockDim.x * gridDim.x;
303 const size_t nthreadsy = blockDim.y * gridDim.y;
304 BinaryOperator<TensorDataType> op;
305 for (
size_t j = gidy; j < width; j += nthreadsy) {
306 for (
size_t i = gidx; i < height; i += nthreadsx) {
307 const auto& x1 = input1[i + j * input1_ldim];
308 const auto& x2 = input2[i + j * input2_ldim];
309 auto& y = output[i + j * output_ldim];
322 template <
template <
typename>
class UnaryOp,
typename TensorDataType>
324 const El::AbstractMatrix<TensorDataType>& input,
325 El::AbstractMatrix<TensorDataType>& output)
329 if (input.GetDevice() != El::Device::GPU) {
332 else if (output.GetDevice() != El::Device::GPU) {
335 else if (input.Height() != output.Height() ||
336 input.Width() != output.Width()) {
343 "don't match output matrix dimensions " 352 if (output.IsEmpty()) {
357 if (input.Contiguous() && output.Contiguous()) {
358 dim3 block_dims, grid_dims;
361 (output.Height() * output.Width() + block_dims.x - 1) / block_dims.x;
365 hydrogen::gpu::LaunchKernel(
366 apply_entrywise_operator_impl::unary_1d_kernel<UnaryOp, TensorDataType>,
371 output.Height() * output.Width(),
372 input.LockedBuffer(),
376 dim3 block_dims, grid_dims;
379 grid_dims.x = (output.Height() + block_dims.x - 1) / block_dims.x;
380 grid_dims.y = (output.Width() + block_dims.y - 1) / block_dims.y;
384 hydrogen::gpu::LaunchKernel(
385 apply_entrywise_operator_impl::unary_2d_kernel<UnaryOp, TensorDataType>,
392 input.LockedBuffer(),
404 template <
template <
typename>
class BinaryOp,
typename TensorDataType>
406 const El::AbstractMatrix<TensorDataType>& input1,
407 const El::AbstractMatrix<TensorDataType>& input2,
408 El::AbstractMatrix<TensorDataType>& output)
412 if (input1.GetDevice() != El::Device::GPU ||
413 input2.GetDevice() != El::Device::GPU) {
416 else if (output.GetDevice() != El::Device::GPU) {
419 else if (input1.Height() != input2.Height() ||
420 input1.Width() != input2.Width() ||
421 input1.Height() != output.Height() ||
422 input1.Width() != output.Width()) {
433 "don't match output matrix dimensions " 442 if (output.IsEmpty()) {
447 if (input1.Contiguous() && input2.Contiguous() && output.Contiguous()) {
448 dim3 block_dims, grid_dims;
451 (output.Height() * output.Width() + block_dims.x - 1) / block_dims.x;
456 hydrogen::gpu::LaunchKernel(
457 apply_entrywise_operator_impl::binary_1d_kernel<BinaryOp, TensorDataType>,
462 output.Height() * output.Width(),
463 input1.LockedBuffer(),
464 input2.LockedBuffer(),
468 dim3 block_dims, grid_dims;
471 grid_dims.x = (output.Height() + block_dims.x - 1) / block_dims.x;
472 grid_dims.y = (output.Width() + block_dims.y - 1) / block_dims.y;
477 hydrogen::gpu::LaunchKernel(
478 apply_entrywise_operator_impl::binary_2d_kernel<BinaryOp, TensorDataType>,
485 input1.LockedBuffer(),
487 input2.LockedBuffer(),
498 template <
template <
typename>
class UnaryOperator,
typename TensorDataType>
500 const El::AbstractDistMatrix<TensorDataType>& input,
501 El::AbstractDistMatrix<TensorDataType>& output)
503 if (input.Height() != output.Height() || input.Width() != output.Width()) {
510 "don't match output matrix dimensions " 517 else if (input.DistData() != output.DistData()) {
518 LBANN_ERROR(
"input and output matrix distributions don't match");
520 apply_entrywise_unary_operator<UnaryOperator>(input.LockedMatrix(),
528 template <
template <
typename>
class BinaryOperator,
typename TensorDataType>
530 const El::AbstractDistMatrix<TensorDataType>& input1,
531 const El::AbstractDistMatrix<TensorDataType>& input2,
532 El::AbstractDistMatrix<TensorDataType>& output)
534 if (input1.Height() != input2.Height() || input1.Width() != input2.Width() ||
535 input1.Height() != output.Height() || input1.Width() != output.Width()) {
546 "don't match output matrix dimensions " 553 else if (input1.DistData() != input2.DistData() ||
554 input1.DistData() != output.DistData()) {
555 LBANN_ERROR(
"input and output matrix distributions don't match");
557 apply_entrywise_binary_operator<BinaryOperator>(input1.LockedMatrix(),
558 input2.LockedMatrix(),
562 #endif // __CUDACC__ || __HIPCC__ void apply_entrywise_binary_operator(const El::AbstractMatrix< TensorDataType > &input1, const El::AbstractMatrix< TensorDataType > &input2, El::AbstractMatrix< TensorDataType > &output)
void clip_grid_dims(dim3 &grid_dims)
void apply_entrywise_unary_operator(const El::AbstractMatrix< TensorDataType > &input, El::AbstractMatrix< TensorDataType > &output)
El::SyncInfo< El::Device::GPU > get_sync_info(El::AbstractMatrix< TensorDataType > const &m)
Get a SyncInfo from an AbstractMatrix.