27 #ifndef LBANN_UTILS_GPULIB_HPP 28 #define LBANN_UTILS_GPULIB_HPP 35 #if defined LBANN_HAS_CUDA 37 #elif defined LBANN_HAS_ROCM 39 #endif // LBANN_HAS_CUDA 55 #if defined __CUDACC__ || defined __HIPCC__ 58 __device__ __forceinline__ __half atomic_add(__half* address, __half val);
59 __device__ __forceinline__
float atomic_add(
float* address,
float val);
60 __device__ __forceinline__
double atomic_add(
double* address,
double val);
74 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T>
75 __device__ __forceinline__ T block_reduce(T val);
91 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T,
class Op>
92 __device__ __forceinline__ T block_reduce(T val);
95 #define DECLARE_UNARY_MATH_FUNC_WITH_TYPE(func, type) \ 96 __device__ __forceinline__ type func(type const& x) 97 #define DECLARE_UNARY_MATH_FUNC(func) \ 98 DECLARE_UNARY_MATH_FUNC_WITH_TYPE(func, __half); \ 99 DECLARE_UNARY_MATH_FUNC_WITH_TYPE(func, float); \ 100 DECLARE_UNARY_MATH_FUNC_WITH_TYPE(func, double) 101 template <
typename T>
102 __device__ __forceinline__ T abs(
const T& x);
103 __device__ __forceinline__
float abs(
float const& x);
104 __device__ __forceinline__
double abs(
double const& x);
105 DECLARE_UNARY_MATH_FUNC(round);
106 DECLARE_UNARY_MATH_FUNC(ceil);
107 DECLARE_UNARY_MATH_FUNC(floor);
108 DECLARE_UNARY_MATH_FUNC(sqrt);
109 DECLARE_UNARY_MATH_FUNC(rsqrt);
110 DECLARE_UNARY_MATH_FUNC(exp);
111 DECLARE_UNARY_MATH_FUNC(expm1);
112 DECLARE_UNARY_MATH_FUNC(log);
113 DECLARE_UNARY_MATH_FUNC(log1p);
114 DECLARE_UNARY_MATH_FUNC(cos);
115 DECLARE_UNARY_MATH_FUNC(sin);
116 DECLARE_UNARY_MATH_FUNC(tan);
117 DECLARE_UNARY_MATH_FUNC(acos);
118 DECLARE_UNARY_MATH_FUNC(asin);
119 DECLARE_UNARY_MATH_FUNC(atan);
120 DECLARE_UNARY_MATH_FUNC(cosh);
121 DECLARE_UNARY_MATH_FUNC(sinh);
122 DECLARE_UNARY_MATH_FUNC(tanh);
123 DECLARE_UNARY_MATH_FUNC(acosh);
124 DECLARE_UNARY_MATH_FUNC(asinh);
125 DECLARE_UNARY_MATH_FUNC(atanh);
126 DECLARE_UNARY_MATH_FUNC(erf);
127 DECLARE_UNARY_MATH_FUNC(erfinv);
128 template <
typename T>
129 __device__ __forceinline__
bool isfinite(
const T& x);
130 template <
typename T>
131 __device__ __forceinline__
bool isinf(
const T& x);
132 template <
typename T>
133 __device__ __forceinline__
bool isnan(
const T& x);
134 #undef DECLARE_UNARY_MATH_FUNC 135 #undef DECLARE_UNARY_MATH_FUNC_WITH_TYPE 138 #define DECLARE_BINARY_UNARY_MATH_FUNC_WITH_TYPE(func, type) \ 139 __device__ __forceinline__ type func(type const& x, type const& y) 140 #define DECLARE_BINARY_UNARY_MATH_FUNC(func) \ 141 DECLARE_BINARY_UNARY_MATH_FUNC_WITH_TYPE(func, __half); \ 142 DECLARE_BINARY_UNARY_MATH_FUNC_WITH_TYPE(func, float); \ 143 DECLARE_BINARY_UNARY_MATH_FUNC_WITH_TYPE(func, double) 144 template <
typename T>
145 __device__ __forceinline__ T min(
const T& x,
const T& y);
146 DECLARE_BINARY_UNARY_MATH_FUNC(min);
147 template <
typename T>
148 __device__ __forceinline__ T max(
const T& x,
const T& y);
149 DECLARE_BINARY_UNARY_MATH_FUNC(max);
150 DECLARE_BINARY_UNARY_MATH_FUNC(mod);
151 DECLARE_BINARY_UNARY_MATH_FUNC(pow);
152 #undef DECLARE_BINARY_UNARY_MATH_FUNC 153 #undef DECLARE_BINARY_UNARY_MATH_FUNC_WITH_TYPE 156 template <
typename T>
157 constexpr __device__ __forceinline__ T min();
158 template <
typename T>
159 constexpr __device__ __forceinline__ T max();
160 template <
typename T>
161 constexpr __device__ __forceinline__ T epsilon();
162 template <
typename T>
163 __device__ __forceinline__ T infinity();
166 template <
typename T,
size_t N>
170 __host__ __device__ __forceinline__
size_t size()
const;
171 __host__ __device__ __forceinline__ T& operator[](
size_t i);
172 __host__ __device__ __forceinline__
const T& operator[](
size_t i)
const;
175 #endif // __CUDACC__ || __HIPCC__ 181 #if defined __CUDACC__ || defined __HIPCC__ 187 template <
template <
typename>
class UnaryOperator,
typename TensorDataType>
189 const El::AbstractMatrix<TensorDataType>& input,
190 El::AbstractMatrix<TensorDataType>& output);
196 template <
template <
typename>
class BinaryOperator,
typename TensorDataType>
198 const El::AbstractMatrix<TensorDataType>& input1,
199 const El::AbstractMatrix<TensorDataType>& input2,
200 El::AbstractMatrix<TensorDataType>& output);
206 template <
template <
typename>
class UnaryOperator,
typename TensorDataType>
208 const El::AbstractDistMatrix<TensorDataType>& input,
209 El::AbstractDistMatrix<TensorDataType>& output);
215 template <
template <
typename>
class BinaryOperator,
typename TensorDataType>
217 const El::AbstractDistMatrix<TensorDataType>& input1,
218 const El::AbstractDistMatrix<TensorDataType>& input2,
219 El::AbstractDistMatrix<TensorDataType>& output);
221 #endif // __CUDACC__ || __HIPCC__ 231 #endif // LBANN_HAS_GPU 232 #endif // LBANN_UTILS_GPULIB_HPP void apply_entrywise_binary_operator(const El::AbstractMatrix< TensorDataType > &input1, const El::AbstractMatrix< TensorDataType > &input2, El::AbstractMatrix< TensorDataType > &output)
void clip_grid_dims(dim3 &grid_dims)
void apply_entrywise_unary_operator(const El::AbstractMatrix< TensorDataType > &input, El::AbstractMatrix< TensorDataType > &output)