26 #ifndef LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED 27 #define LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED 40 #include <unordered_map> 44 #define LBANN_CHECK_CUTT(cmd) \ 46 cuttResult _check_cutt_err_result = (cmd); \ 47 if (CUTT_SUCCESS != _check_cutt_err_result) { \ 48 LBANN_ERROR("cuTT operation \"" #cmd "\" FAILED (", \ 49 cutt_err_string(_check_cutt_err_result), \ 59 case CUTT_INVALID_PLAN:
60 return "Invalid plan handle";
61 case CUTT_INVALID_PARAMETER:
62 return "Invalid input parameter";
63 case CUTT_INVALID_DEVICE:
64 return "Execution tried on device different than where plan was created";
65 case CUTT_INTERNAL_ERROR:
66 return "Internal error";
67 case CUTT_UNDEFINED_ERROR:
68 return "Undefined error";
70 return "<Unknown error value>";
130 template <typename DataT>
132 El::Matrix<DataT,
El::
Device::GPU>& out) const;
141 template <typename DataT>
143 El::Matrix<DataT,
El::
Device::GPU>& out) const;
160 template <typename DataT>
166 El::Matrix<DataT,
El::
Device::GPU> const& out) const;
168 template <typename DataT>
174 El::Matrix<DataT,
El::
Device::GPU> const& out) const;
176 template <typename DataT>
178 El::Matrix<DataT,
El::
Device::GPU> const& out) const;
180 template <typename DataT>
186 El::Matrix<DataT,
El::
Device::GPU>& out) const;
188 template <typename DataT>
194 El::Matrix<DataT,
El::
Device::GPU>& out) const;
231 std::cerr << e.
what();
257 template <
typename DataT>
263 El::Matrix<DataT, El::Device::GPU>
const& in,
264 El::Matrix<DataT, El::Device::GPU>
const& out)
const 270 auto const key = in.Width();
271 if (plan_map.count(key) == 0UL) {
272 std::vector<int> permutation(perm.
get()), dimensions(in_dims.
get());
273 permutation.push_back(static_cast<int>(perm.
size()));
274 dimensions.push_back(in.Width());
275 cuttHandle plan = 0U;
281 out.GetSyncInfo().Stream()));
282 plan_map.emplace(key, plan);
284 return plan_map[key];
287 template <
typename DataT>
292 El::Matrix<DataT, El::Device::GPU>
const& in,
293 El::Matrix<DataT, El::Device::GPU>
const& out)
const 295 std::vector<int> permutation(perm.
get()), dimensions(in_dims.
get());
302 out.GetSyncInfo().Stream()));
306 template <
typename DataT>
308 El::Matrix<DataT, El::Device::GPU>
const& in,
309 El::Matrix<DataT, El::Device::GPU>
const& out)
const 311 return in.LDim() == in.Height() && out.LDim() == out.Height() &&
315 template <
typename DataT>
317 El::Matrix<DataT, El::Device::GPU>& out)
const 319 if (in.Width() == El::Int{0})
333 template <
typename DataT>
335 El::Matrix<DataT, El::Device::GPU>
const& in,
336 El::Matrix<DataT, El::Device::GPU>& out)
const 338 if (in.Width() == El::Int{0})
357 template <
typename DataT>
363 El::Matrix<DataT, El::Device::GPU>
const& in,
364 El::Matrix<DataT, El::Device::GPU>& out)
const 367 El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
368 auto const plan =
get_mb_plan(plan_map, perm, in_dims, out_dims, in, out);
370 cuttExecute(plan, const_cast<DataT*>(in.LockedBuffer()), out.Buffer()));
373 template <
typename DataT>
379 El::Matrix<DataT, El::Device::GPU>
const& in,
380 El::Matrix<DataT, El::Device::GPU>& out)
const 383 El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
384 if (sample_plan == 0U)
387 DataT*
const in_buf =
const_cast<DataT*
>(in.LockedBuffer());
388 DataT*
const out_buf = out.Buffer();
390 auto const batch_size = in.Width();
391 auto const in_stride = in.LDim();
392 auto const out_stride = out.LDim();
393 for (El::Int sample = 0; sample < batch_size; ++sample) {
395 in_buf + sample * in_stride,
396 out_buf + sample * out_stride));
409 #undef LBANN_CHECK_CUTT 410 #endif // LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED void set_dims(DimsType input_dims)
Setup the dimensions.
cuttHandle get_mb_plan(PlanMap &plan_map, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
cuttHandle get_sample_plan(ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
#define LBANN_ASSERT_DEBUG(cond)
std::unordered_map< BatchSizeT, Plan > PlanMap
The base exception for LBANN errors.
char const * what() const noexcept override
cuTT-based implementation of tensor permute.
ColMajorPerm const & perm() const noexcept
constexpr El::Device Device
cuTT_PermuteImpl(ColMajorPerm perm)
void permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Permute the tensor.
void do_sample_permute(Plan &plan, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
void do_mb_permute(PlanMap &plan_map, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
std::vector< T > & get() noexcept
auto size() const noexcept
bool is_mb_permutable(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
DimsType const & output_dims() const noexcept
void inverse_permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Apply the inverse permutation to the tensor.
static char const * cutt_err_string(cuttResult err) noexcept
#define LBANN_CHECK_CUTT(cmd)
~cuTT_PermuteImpl() noexcept
RowMajorPerm invert(RowMajorPerm const &in)
bool is_valid(RowMajorPerm const &perm)
DimsType const & input_dims() const noexcept
auto permute_dims(RowMajorDims< IndexT > const &in, RowMajorPerm const &perm)
void swap(cuTT_PermuteImpl &other)