26 #ifndef LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED 27 #define LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED 42 #include <unordered_map> 104 template <typename DataT>
106 El::Matrix<DataT,
El::
Device::GPU>& out) const;
115 template <typename DataT>
117 El::Matrix<DataT,
El::
Device::GPU>& out) const;
129 inline static std::unordered_map<std::
string, cutensorTensorDescriptor_t>
133 static std::vector<int32_t>
make_modes(
size_t const ndims);
134 template <typename DataT>
137 template <typename DataT>
138 static cutensorTensorDescriptor_t
151 : m_perm{std::move(
perm)},
198 std::vector<int32_t> modes(ndims + 1);
199 std::iota(begin(modes), end(modes), static_cast<int>(
'a'));
203 template <
typename DataT>
205 El::Matrix<DataT, El::Device::GPU>
const& mat,
208 auto const& dims = dims_in.
get();
209 std::ostringstream oss;
210 oss << mat.Height() <<
"," << mat.Width() <<
"," << mat.LDim() <<
";" 212 for (
size_t ii = 1; ii < dims.size(); ++ii)
213 oss <<
"," << dims[ii];
214 oss <<
";" << lbann::TypeName<DataT>();
218 template <
typename DataT>
220 El::Matrix<DataT, El::Device::GPU>
const& mat,
228 std::vector<int64_t> extents = dims.
get();
229 extents.push_back(mat.Width());
232 strides.get().push_back(mat.LDim());
234 cutensorTensorDescriptor_t desc;
239 strides.get().data(),
241 CUTENSOR_OP_IDENTITY));
254 template <
typename DataT>
256 El::Matrix<DataT, El::Device::GPU>
const& in,
257 El::Matrix<DataT, El::Device::GPU>& out)
const 262 auto const one = El::To<CUDAScalar<DataT>>(1.f);
264 El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
276 CUDAScalarType<DataT>,
277 static_cast<El::SyncInfo<El::Device::GPU>
>(multisync).Stream()));
280 template <
typename DataT>
282 El::Matrix<DataT, El::Device::GPU>
const& in,
283 El::Matrix<DataT, El::Device::GPU>& out)
const 297 auto const one = El::To<CUDAScalar<DataT>>(1.f);
299 El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
310 CUDAScalarType<DataT>,
311 static_cast<El::SyncInfo<El::Device::GPU>
>(multisync).Stream()));
324 #endif // LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED ModesType const & output_modes() const noexcept
auto get_strides(ColMajorDims< DimT > const &dims)
Compute packed strides of the given dimensions.
auto permute_impl(std::vector< IndexT > const &in, std::vector< PermT > const &perm)
void inverse_permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Apply the inverse permutation to the tensor.
static cutensorHandle_t * get_handle_ptr()
#define CHECK_CUTENSOR(cmd)
#define LBANN_ASSERT_DEBUG(cond)
DimsType const & input_dims() const noexcept
ModesType const & input_modes() const noexcept
StridesType output_strides() const
static std::unordered_map< std::string, cutensorTensorDescriptor_t > m_desc_map
Keep track of descriptors so we don't have to repeatedly rebuild them.
static std::string get_desc_key(El::Matrix< DataT, El::Device::GPU > const &mat, DimsType const &dims)
static cutensorTensorDescriptor_t get_descriptor(El::Matrix< DataT, El::Device::GPU > const &mat, DimsType const &dims)
StridesType input_strides() const
void permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Permute the tensor.
constexpr El::Device Device
std::vector< int32_t > ModesType
cuTENSOR_PermuteImpl(ColMajorPerm perm)
DimsType const & output_dims() const noexcept
std::vector< T > & get() noexcept
auto size() const noexcept
void swap(cuTENSOR_PermuteImpl &other)
cuTENSOR-based implementation of tensor permute.
static std::vector< int32_t > make_modes(size_t const ndims)
bool is_valid(RowMajorPerm const &perm)
auto permute_dims(RowMajorDims< IndexT > const &in, RowMajorPerm const &perm)
void set_dims(DimsType input_dims)
Set up the dimensions.
ColMajorPerm const & perm() const noexcept