29 #include <thrust/system/cuda/execution_policy.h> 30 #ifdef HYDROGEN_HAVE_CUB 31 #include "cub/block/block_reduce.cuh" 32 #endif // HYDROGEN_HAVE_CUB 33 #include <cuda_fp16.h> 34 #include <math_constants.h> 45 #if __CUDA_ARCH__ >= 530 46 __device__ __forceinline__ __half gpu_lib::atomic_add(__half* address,
49 #if __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__) 50 return atomicAdd(address, val);
52 unsigned int* address_as_uint = (
unsigned int*)address;
53 unsigned int old = *address_as_uint;
54 __half* old_as_half = (__half*)&old;
57 __half* updated_as_half = (__half*)&updated;
61 *updated_as_half += val;
62 old = atomicCAS(address_as_uint, assumed, updated);
63 }
while (assumed != old);
65 #endif // __CUDA_ARCH__ >= 700 || !defined(__CUDA_ARCH__) 67 #endif // __CUDA_ARCH__ >= 530 68 __device__ __forceinline__
float gpu_lib::atomic_add(
float* address,
float val)
70 return atomicAdd(address, val);
72 __device__ __forceinline__
double gpu_lib::atomic_add(
double* address,
75 #if __CUDA_ARCH__ >= 600 76 return atomicAdd(address, val);
78 unsigned long long int* address_as_ull = (
unsigned long long int*)address;
79 unsigned long long int old = *address_as_ull, assumed;
82 old = atomicCAS(address_as_ull,
84 __double_as_longlong(val + __longlong_as_double(assumed)));
85 }
while (assumed != old);
86 return __longlong_as_double(old);
87 #endif // __CUDA_ARCH__ < 600 91 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T>
92 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
94 #ifdef HYDROGEN_HAVE_CUB 95 constexpr
auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
96 using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
97 __shared__
typename BlockReduce::TempStorage workspace;
98 val = BlockReduce(workspace).Sum(val);
101 threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
102 constexpr
size_t bsize = bdimx * bdimy * bdimz;
103 __shared__ DataType shared_max_vals[bsize];
104 shared_max_vals[tid] = val;
105 for (
size_t stride = bsize / 2; stride > 0; stride /= 2) {
108 shared_max_vals[tid] =
109 shared_max_vals[tid] + shared_max_vals[tid + stride];
113 val = shared_max_vals[0];
115 #endif // HYDROGEN_HAVE_CUB 118 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T,
class Op>
119 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
121 #ifdef HYDROGEN_HAVE_CUB 122 constexpr
auto reduce_algo = cub::BLOCK_REDUCE_WARP_REDUCTIONS;
123 using BlockReduce = cub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
124 __shared__
typename BlockReduce::TempStorage workspace;
125 val = BlockReduce(workspace).Reduce(val, Op());
129 threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
130 constexpr
size_t bsize = bdimx * bdimy * bdimz;
131 __shared__ DataType shared_max_vals[bsize];
132 shared_max_vals[tid] = val;
133 for (
size_t stride = bsize / 2; stride > 0; stride /= 2) {
136 shared_max_vals[tid] =
137 op(shared_max_vals[tid], shared_max_vals[tid + stride]);
141 val = shared_max_vals[0];
143 #endif // HYDROGEN_HAVE_CUB 148 #if __CUDA_ARCH__ >= 530 150 __device__ __forceinline__
bool gpu_lib::isfinite(__half
const& x)
152 return !(::__isnan(x) || ::__hisinf(x));
155 __device__ __forceinline__
bool gpu_lib::isinf(__half
const& x)
157 return ::__hisinf(x);
160 __device__ __forceinline__
bool gpu_lib::isnan(__half
const& x)
162 return ::__hisnan(x);
166 #define WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(func) \ 167 __device__ __forceinline__ __half gpu_lib::func(__half const& x) \ 169 return ::h##func(x); \ 175 #define WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(func) \ 176 __device__ __forceinline__ __half gpu_lib::func(__half const& x) \ 178 return func(float(x)); \ 181 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(round)
182 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(ceil)
183 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(floor)
184 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(sqrt)
185 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(rsqrt)
186 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(exp)
191 __device__ __forceinline__ __half gpu_lib::expm1(__half
const& x)
193 return ::__hsub(::hexp(x), ::__float2half(1.f));
196 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(log)
197 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(log1p)
198 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(cos)
199 WRAP_UNARY_CUDA_HALF_MATH_FUNCTION(sin)
205 __device__ __forceinline__ __half gpu_lib::tan(__half
const& x)
207 return ::__hdiv(::hsin(x), ::hcos(x));
210 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acos)
211 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asin)
212 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atan)
213 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(cosh)
214 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(sinh)
215 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(tanh)
216 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acosh)
217 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asinh)
218 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atanh)
219 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(erf)
220 WRAP_UNARY_CUDA_HALF_CAST_TO_FLOAT_MATH_FUNCTION(erfinv)
221 #undef WRAP_UNARY_CUDA_HALF_MATH_FUNCTION 224 __device__ __forceinline__ __half gpu_lib::min(
const __half& x,
const __half& y)
226 return ::__hle(x, y) ? x : y;
229 __device__ __forceinline__ __half gpu_lib::max(
const __half& x,
const __half& y)
231 return ::__hle(x, y) ? y : x;
233 #endif // __CUDA_ARCH__ >= 530 236 #ifdef __CUDACC_RELAXED_CONSTEXPR__ 237 template <
typename T>
238 constexpr __device__ __forceinline__ T min()
240 return std::numeric_limits<T>::min();
242 template <
typename T>
243 constexpr __device__ __forceinline__ T max()
245 return std::numeric_limits<T>::min();
247 template <
typename T>
248 constexpr __device__ __forceinline__ T epsilon()
250 return std::numeric_limits<T>::epsilon();
252 template <
typename T>
253 __device__ __forceinline__ T infinity()
255 return std::numeric_limits<T>::infinity();
257 #else // __CUDACC_RELAXED_CONSTEXPR__ 260 __device__ __forceinline__ 261 SPECIFIERS constexpr
float gpu_lib::min<float>() {
return FLT_MIN; }
262 SPECIFIERS constexpr
double gpu_lib::min<double>() {
return DBL_MIN; }
263 SPECIFIERS constexpr
int gpu_lib::min<int>() {
return INT_MIN; }
264 SPECIFIERS constexpr
long int gpu_lib::min<long int>() {
return LONG_MIN; }
265 SPECIFIERS constexpr
long long int gpu_lib::min<long long int>()
269 SPECIFIERS constexpr
float gpu_lib::max<float>() {
return FLT_MAX; }
270 SPECIFIERS constexpr
double gpu_lib::max<double>() {
return DBL_MAX; }
271 SPECIFIERS constexpr
int gpu_lib::max<int>() {
return INT_MAX; }
272 SPECIFIERS constexpr
long int gpu_lib::max<long int>() {
return LONG_MAX; }
273 SPECIFIERS constexpr
long long int gpu_lib::max<long long int>()
277 SPECIFIERS constexpr
float gpu_lib::epsilon<float>() {
return FLT_EPSILON; }
278 SPECIFIERS constexpr
double gpu_lib::epsilon<double>() {
return DBL_EPSILON; }
279 SPECIFIERS
float gpu_lib::infinity<float>() {
return CUDART_INF_F; }
280 SPECIFIERS
double gpu_lib::infinity<double>() {
return CUDART_INF; }
282 #endif // __CUDACC_RELAXED_CONSTEXPR__ 289 #ifndef DOXYGEN_SHOULD_SKIP_THIS 293 template <
typename T>
294 allocator<T>::allocator(cudaStream_t stream)
295 : m_stream(stream), m_system(stream)
298 template <
typename T>
299 typename allocator<T>::pointer
300 allocator<T>::allocate(allocator<T>::size_type size)
302 value_type* buffer =
nullptr;
304 #ifdef HYDROGEN_HAVE_CUB 305 auto& memory_pool = El::cub::MemoryPool();
306 CHECK_CUDA(memory_pool.DeviceAllocate(reinterpret_cast<void**>(&buffer),
307 size *
sizeof(value_type),
310 CHECK_CUDA(cudaMalloc(&buffer, size *
sizeof(value_type)));
311 #endif // HYDROGEN_HAVE_CUB 313 return pointer(buffer);
316 template <
typename T>
317 void allocator<T>::deallocate(allocator<T>::pointer buffer,
318 allocator<T>::size_type size)
320 auto&& ptr = buffer.get();
321 if (ptr !=
nullptr) {
322 #ifdef HYDROGEN_HAVE_CUB 323 auto& memory_pool = El::cub::MemoryPool();
324 CHECK_CUDA(memory_pool.DeviceFree(ptr));
326 CHECK_CUDA(cudaFree(ptr));
327 #endif // HYDROGEN_HAVE_CUB 331 template <
typename T>
332 typename allocator<T>::system_type& allocator<T>::system()
338 #endif // !DOXYGEN_SHOULD_SKIP_THIS