29 #include <thrust/system/hip/execution_policy.h> 30 #ifdef HYDROGEN_HAVE_CUB 31 #include "hipcub/block/block_reduce.hpp" 32 #endif // HYDROGEN_HAVE_CUB 33 #include <hip/hip_fp16.h> 45 __device__ __forceinline__ __half gpu_lib::atomic_add(__half* address,
48 unsigned int* address_as_uint = (
unsigned int*)address;
49 unsigned int old = *address_as_uint;
50 __half* old_as_half = (__half*)&old;
53 __half* updated_as_half = (__half*)&updated;
57 *updated_as_half += val;
58 old = atomicCAS(address_as_uint, assumed, updated);
59 }
while (assumed != old);
62 __device__ __forceinline__
float gpu_lib::atomic_add(
float* address,
float val)
64 return atomicAdd(address, val);
66 __device__ __forceinline__
double gpu_lib::atomic_add(
double* address,
69 unsigned long long int* address_as_ull = (
unsigned long long int*)address;
70 unsigned long long int old = *address_as_ull, assumed;
73 old = atomicCAS(address_as_ull,
75 __double_as_longlong(val + __longlong_as_double(assumed)));
79 }
while (assumed != old);
80 return __longlong_as_double(old);
84 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T>
85 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
87 #ifdef HYDROGEN_HAVE_CUB 88 constexpr
auto reduce_algo = hipcub::BLOCK_REDUCE_WARP_REDUCTIONS;
89 using BlockReduce = hipcub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
90 __shared__
typename BlockReduce::TempStorage workspace;
91 val = BlockReduce(workspace).Sum(val);
94 threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
95 constexpr
size_t bsize = bdimx * bdimy * bdimz;
97 __shared__
char shared_max_vals_buffer[bsize *
sizeof(T)];
98 T* shared_max_vals =
reinterpret_cast<T*
>(shared_max_vals);
99 shared_max_vals[tid] = val;
100 for (
size_t stride = bsize / 2; stride > 0; stride /= 2) {
103 shared_max_vals[tid] =
104 shared_max_vals[tid] + shared_max_vals[tid + stride];
108 val = shared_max_vals[0];
110 #endif // HYDROGEN_HAVE_CUB 113 template <
size_t bdimx,
size_t bdimy,
size_t bdimz,
class T,
class Op>
114 __device__ __forceinline__ T gpu_lib::block_reduce(T val)
116 #ifdef HYDROGEN_HAVE_CUB 117 constexpr
auto reduce_algo = hipcub::BLOCK_REDUCE_WARP_REDUCTIONS;
118 using BlockReduce = hipcub::BlockReduce<T, bdimx, reduce_algo, bdimy, bdimz>;
119 __shared__
typename BlockReduce::TempStorage workspace;
120 val = BlockReduce(workspace).Reduce(val, Op());
124 threadIdx.x + threadIdx.y * bdimx + threadIdx.z * bdimx * bdimy;
125 constexpr
size_t bsize = bdimx * bdimy * bdimz;
127 __shared__
char shared_max_vals_buffer[bsize *
sizeof(T)];
128 T* shared_max_vals =
reinterpret_cast<T*
>(shared_max_vals);
129 shared_max_vals[tid] = val;
130 for (
size_t stride = bsize / 2; stride > 0; stride /= 2) {
133 shared_max_vals[tid] =
134 op(shared_max_vals[tid], shared_max_vals[tid + stride]);
138 val = shared_max_vals[0];
140 #endif // HYDROGEN_HAVE_CUB 146 #define WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(func) \ 147 __device__ __forceinline__ __half gpu_lib::func(__half const& x) \ 149 return ::h##func(x); \ 155 #define WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(func) \ 156 __device__ __forceinline__ __half gpu_lib::func(__half const& x) \ 158 return ::func(float(x)); \ 161 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(round)
162 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(ceil)
163 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(floor)
164 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(sqrt)
165 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(rsqrt)
166 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(exp)
172 __device__ __forceinline__ __half gpu_lib::expm1(__half
const& x)
174 return ::__hsub(::hexp(x), ::__float2half(1.f));
177 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(log)
178 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(log1p)
179 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(cos)
180 WRAP_UNARY_ROCM_HALF_MATH_FUNCTION(sin)
186 __device__ __forceinline__ __half gpu_lib::tan(__half
const& x)
188 return ::__hdiv(::hsin(x), ::hcos(x));
191 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acos)
192 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asin)
193 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atan)
194 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(cosh)
195 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(sinh)
196 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(tanh)
197 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(acosh)
198 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(asinh)
199 WRAP_UNARY_ROCM_HALF_CAST_TO_FLOAT_MATH_FUNCTION(atanh)
200 #undef WRAP_UNARY_ROCM_HALF_MATH_FUNCTION 203 __device__ __forceinline__ __half gpu_lib::min(
const __half& x,
const __half& y)
205 return ::__hle(x, y) ? x : y;
208 __device__ __forceinline__ __half gpu_lib::max(
const __half& x,
const __half& y)
210 return ::__hle(x, y) ? y : x;
214 template <
typename T>
215 constexpr __device__ __forceinline__ T gpu_lib::min()
217 return std::numeric_limits<T>::min();
219 template <
typename T>
220 constexpr __device__ __forceinline__ T gpu_lib::max()
222 return std::numeric_limits<T>::max();
224 template <
typename T>
225 constexpr __device__ __forceinline__ T gpu_lib::epsilon()
227 return std::numeric_limits<T>::epsilon();
229 template <
typename T>
230 __device__ __forceinline__ T gpu_lib::infinity()
232 return std::numeric_limits<T>::infinity();
240 #ifndef DOXYGEN_SHOULD_SKIP_THIS 244 template <
typename T>
245 allocator<T>::allocator(hipStream_t stream) : m_stream(stream), m_system(stream)
248 template <
typename T>
249 typename allocator<T>::pointer
250 allocator<T>::allocate(allocator<T>::size_type size)
252 value_type* buffer =
nullptr;
254 #ifdef HYDROGEN_HAVE_CUB 255 auto& memory_pool = El::cub::MemoryPool();
256 CHECK_ROCM(memory_pool.DeviceAllocate(reinterpret_cast<void**>(&buffer),
257 size *
sizeof(value_type),
260 CHECK_ROCM(hipMalloc(&buffer, size *
sizeof(value_type)));
261 #endif // HYDROGEN_HAVE_CUB 263 return pointer(buffer);
266 template <
typename T>
267 void allocator<T>::deallocate(allocator<T>::pointer buffer,
268 allocator<T>::size_type size)
270 auto&& ptr = buffer.get();
271 if (ptr !=
nullptr) {
272 #ifdef HYDROGEN_HAVE_CUB 273 auto& memory_pool = El::cub::MemoryPool();
274 CHECK_ROCM(memory_pool.DeviceFree(ptr));
276 CHECK_ROCM(hipFree(ptr));
277 #endif // HYDROGEN_HAVE_CUB 281 template <
typename T>
282 typename allocator<T>::system_type& allocator<T>::system()
288 #endif // !DOXYGEN_SHOULD_SKIP_THIS