LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
cutt_permuteimpl.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 #ifndef LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED
27 #define LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED
28 
29 #include "lbann/base.hpp" // Elemental support.
31 #include "lbann/utils/typename.hpp"
32 
33 #include "tensor_dims_utils.hpp"
34 
35 #include <cutt.h>
36 
37 #include <iterator>
38 #include <sstream>
39 #include <string>
40 #include <unordered_map>
41 #include <utility>
42 #include <vector>
43 
// Evaluate the cuTT call `cmd` exactly once and check its cuttResult.
// If the result is anything other than CUTT_SUCCESS, the failure is
// reported through LBANN_ERROR with the stringified call text and the
// message returned by cutt_err_string(). The do { ... } while (0)
// wrapper lets the macro be used as a single statement (e.g. as the
// body of an unbraced `if`). This macro is #undef'd at the end of the
// header, so it is only available inside this file.
44 #define LBANN_CHECK_CUTT(cmd) \
45  do { \
46  cuttResult _check_cutt_err_result = (cmd); \
47  if (CUTT_SUCCESS != _check_cutt_err_result) { \
48  LBANN_ERROR("cuTT operation \"" #cmd "\" FAILED (", \
49  cutt_err_string(_check_cutt_err_result), \
50  ")"); \
51  } \
52  } while (0)
53 
54 static inline char const* cutt_err_string(cuttResult err) noexcept
55 {
56  switch (err) {
57  case CUTT_SUCCESS:
58  return "Success";
59  case CUTT_INVALID_PLAN:
60  return "Invalid plan handle";
61  case CUTT_INVALID_PARAMETER:
62  return "Invalid input parameter";
63  case CUTT_INVALID_DEVICE:
64  return "Execution tried on device different than where plan was created";
65  case CUTT_INTERNAL_ERROR:
66  return "Internal error";
67  case CUTT_UNDEFINED_ERROR:
68  return "Undefined error";
69  default:
70  return "<Unknown error value>";
71  }
72 }
73 
74 namespace lbann {
75 
94 {
95 public:
97 
98 public:
100 
103  ~cuTT_PermuteImpl() noexcept;
105 
106 
108  ColMajorPerm const& perm() const noexcept;
109 
110  DimsType const& input_dims() const noexcept;
111  DimsType const& output_dims() const noexcept;
112 
114 
115 
122 
130  template <typename DataT>
131  void permute(El::Matrix<DataT, El::Device::GPU> const& in,
132  El::Matrix<DataT, El::Device::GPU>& out) const;
133 
141  template <typename DataT>
142  void inverse_permute(El::Matrix<DataT, El::Device::GPU> const& in,
143  El::Matrix<DataT, El::Device::GPU>& out) const;
144 
146 
147  void swap(cuTT_PermuteImpl& other);
150 
151 private:
152  using BatchSizeT = El::Int;
153  using Plan = cuttHandle;
154  using PlanMap = std::unordered_map<BatchSizeT, Plan>;
155  // The key here corresponds to the minibatch size. This is chosen to
156  // be robust to variable batch sizes beyond the simple last-batch
157  // "remainder", however unlikely any other case may be.
158 
159 private:
160  template <typename DataT>
161  cuttHandle get_mb_plan(PlanMap& plan_map,
162  ColMajorPerm const& perm,
163  DimsType const& in_dims,
164  DimsType const& out_dims,
165  El::Matrix<DataT, El::Device::GPU> const& in,
166  El::Matrix<DataT, El::Device::GPU> const& out) const;
167 
168  template <typename DataT>
169  cuttHandle
170  get_sample_plan(ColMajorPerm const& perm,
171  DimsType const& in_dims,
172  DimsType const& out_dims,
173  El::Matrix<DataT, El::Device::GPU> const& in,
174  El::Matrix<DataT, El::Device::GPU> const& out) const;
175 
176  template <typename DataT>
177  bool is_mb_permutable(El::Matrix<DataT, El::Device::GPU> const& in,
178  El::Matrix<DataT, El::Device::GPU> const& out) const;
179 
180  template <typename DataT>
181  void do_mb_permute(PlanMap& plan_map,
182  ColMajorPerm const& perm,
183  DimsType const& in_dims,
184  DimsType const& out_dims,
185  El::Matrix<DataT, El::Device::GPU> const& in,
186  El::Matrix<DataT, El::Device::GPU>& out) const;
187 
188  template <typename DataT>
189  void do_sample_permute(Plan& plan,
190  ColMajorPerm const& perm,
191  DimsType const& in_dims,
192  DimsType const& out_dims,
193  El::Matrix<DataT, El::Device::GPU> const& in,
194  El::Matrix<DataT, El::Device::GPU>& out) const;
195 
196 private:
201 
202  // Plan memoization -- lazily constructed.
205  mutable Plan m_sample_fwd_plan = 0U;
206  mutable Plan m_sample_inv_plan = 0U;
207 }; // class cuTT_PermuteImpl
208 
// Constructor (member-initializer list and body). The permutation is
// moved into m_perm and m_inv_perm is derived from the stored m_perm via
// invert().
// NOTE(review): m_inv_perm reads m_perm during initialization, so this
// relies on m_perm being declared before m_inv_perm in the class; the
// member declarations are not visible in this listing -- confirm.
// NOTE(review): the signature line and listing lines 212-213 of the
// body are absent from this extract.
210  : m_perm{std::move(perm)}, m_inv_perm{invert(m_perm)}
211 {
214 }
215 
// Destructor body (declared `noexcept` in the class): destroys every
// memoized cuTT plan -- all entries of both per-batch-size maps and both
// per-sample plans. A zero handle is treated as "never created" and is
// skipped.
// NOTE(review): the signature line is absent from this extract.
217 {
218  try {
219  for (auto& [_, plan] : m_fwd_plans)
220  if (plan)
221  LBANN_CHECK_CUTT(cuttDestroy(plan));
222  for (auto& [_, plan] : m_inv_plans)
223  if (plan)
224  LBANN_CHECK_CUTT(cuttDestroy(plan));
225  if (m_sample_fwd_plan)
226  LBANN_CHECK_CUTT(cuttDestroy(m_sample_fwd_plan));
227  if (m_sample_inv_plan)
228  LBANN_CHECK_CUTT(cuttDestroy(m_sample_inv_plan));
229  }
 // A failed cuttDestroy is presumably surfaced as lbann::exception by
 // LBANN_CHECK_CUTT (LBANN_ERROR is defined elsewhere -- confirm). The
 // message is printed and the process is terminated explicitly; the
 // first failure therefore stops the remaining cleanup.
230  catch (lbann::exception const& e) {
231  std::cerr << e.what();
232  std::terminate();
233  }
234 }
235 
236 inline auto cuTT_PermuteImpl::perm() const noexcept -> ColMajorPerm const&
237 {
238  return m_perm;
239 }
240 
241 inline auto cuTT_PermuteImpl::input_dims() const noexcept -> DimsType const&
242 {
243  return m_input_dims;
244 }
245 
246 inline auto cuTT_PermuteImpl::output_dims() const noexcept -> DimsType const&
247 {
248  return m_output_dims;
249 }
250 
// set_dims body: takes ownership of the new input dimensions by moving
// them into m_input_dims.
// NOTE(review): the signature line and listing line 254 are absent from
// this extract; m_output_dims is presumably recomputed on the missing
// line -- confirm against the repository.
// NOTE(review): nothing visible here discards plans that were already
// memoized for the previous dimensions -- confirm whether callers may
// invoke this after a permute.
252 {
253  m_input_dims = std::move(input_dims);
255 }
256 
257 template <typename DataT>
259  PlanMap& plan_map,
260  ColMajorPerm const& perm,
261  DimsType const& in_dims,
262  DimsType const& out_dims,
263  El::Matrix<DataT, El::Device::GPU> const& in,
264  El::Matrix<DataT, El::Device::GPU> const& out) const
265 {
266  LBANN_ASSERT_DEBUG(in.Width() == out.Width());
267  LBANN_ASSERT_DEBUG(perm.size() == in_dims.size() &&
268  perm.size() == out_dims.size());
269 
270  auto const key = in.Width();
271  if (plan_map.count(key) == 0UL) {
272  std::vector<int> permutation(perm.get()), dimensions(in_dims.get());
273  permutation.push_back(static_cast<int>(perm.size()));
274  dimensions.push_back(in.Width());
275  cuttHandle plan = 0U;
276  LBANN_CHECK_CUTT(cuttPlan(&plan,
277  dimensions.size(),
278  dimensions.data(),
279  permutation.data(),
280  sizeof(DataT),
281  out.GetSyncInfo().Stream()));
282  plan_map.emplace(key, plan);
283  }
284  return plan_map[key];
285 }
286 
// Create a new cuTT plan that permutes a single sample.
//
// The plan is built from copies of `perm` and `in_dims`, uses
// sizeof(DataT) as the element size, and is created on the stream
// obtained from `out`. `out_dims` and `in` are not used by the visible
// body. The returned handle is not cached here; do_sample_permute stores
// it in the plan reference it was given.
// NOTE(review): the line carrying the return type and function name
// (listing line 288) is absent from this extract.
287 template <typename DataT>
289  ColMajorPerm const& perm,
290  DimsType const& in_dims,
291  DimsType const& out_dims,
292  El::Matrix<DataT, El::Device::GPU> const& in,
293  El::Matrix<DataT, El::Device::GPU> const& out) const
294 {
 // Local copies supply the int arrays handed to cuttPlan.
295  std::vector<int> permutation(perm.get()), dimensions(in_dims.get());
296  Plan plan = 0UL;
297  LBANN_CHECK_CUTT(cuttPlan(&plan,
298  dimensions.size(),
299  dimensions.data(),
300  permutation.data(),
301  sizeof(DataT),
302  out.GetSyncInfo().Stream()));
303  return plan;
304 }
305 
// Decide whether the whole minibatch can be permuted with one plan.
//
// Returns true only when, for both matrices, the leading dimension
// equals the height (no gap between consecutive columns) and the input
// has more than one column. permute() and inverse_permute() use the
// result to choose between the minibatch path and the per-sample path.
// NOTE(review): the line carrying the return type and function name
// (listing line 307) is absent from this extract.
306 template <typename DataT>
308  El::Matrix<DataT, El::Device::GPU> const& in,
309  El::Matrix<DataT, El::Device::GPU> const& out) const
310 {
311  return in.LDim() == in.Height() && out.LDim() == out.Height() &&
312  in.Width() > 1;
313 }
314 
315 template <typename DataT>
316 void cuTT_PermuteImpl::permute(El::Matrix<DataT, El::Device::GPU> const& in,
317  El::Matrix<DataT, El::Device::GPU>& out) const
318 {
319  if (in.Width() == El::Int{0})
320  return;
321 
322  if (is_mb_permutable(in, out))
324  else
326  m_inv_perm,
327  m_input_dims,
329  in,
330  out);
331 }
332 
// Apply the inverse permutation to the samples of `in`, writing to
// `out`.
//
// An empty minibatch (in.Width() == 0) is a no-op. Otherwise
// is_mb_permutable() selects one of two calls; in the visible lines both
// receive m_inv_perm, with m_input_dims as the last dimension argument
// before `in` and `out`.
// NOTE(review): the function-name line (listing 334) and the lines that
// name the two callees and their remaining arguments (listing 342, 344,
// 349, 351) are absent from this extract.
333 template <typename DataT>
335  El::Matrix<DataT, El::Device::GPU> const& in,
336  El::Matrix<DataT, El::Device::GPU>& out) const
337 {
338  if (in.Width() == El::Int{0})
339  return;
340 
341  if (is_mb_permutable(in, out))
343  m_inv_perm,
345  m_input_dims,
346  in,
347  out);
348  else
350  m_inv_perm,
352  m_input_dims,
353  in,
354  out);
355 }
356 
// Permute the whole minibatch with a single cuttExecute call.
//
// The plan is obtained from get_mb_plan(), which memoizes it in
// `plan_map` by minibatch size.
// NOTE(review): the function-name line (listing 358) and listing line
// 369 (the opening of the statement that wraps cuttExecute) are absent
// from this extract.
357 template <typename DataT>
359  PlanMap& plan_map,
360  ColMajorPerm const& perm,
361  DimsType const& in_dims,
362  DimsType const& out_dims,
363  El::Matrix<DataT, El::Device::GPU> const& in,
364  El::Matrix<DataT, El::Device::GPU>& out) const
365 {
 // Built from the sync info of both matrices and held until scope exit.
 // NOTE(review): presumably orders work between the two streams --
 // El::MakeMultiSync is defined elsewhere; confirm.
366  auto multisync =
367  El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
368  auto const plan = get_mb_plan(plan_map, perm, in_dims, out_dims, in, out);
 // The input buffer is read through LockedBuffer(); constness is cast
 // away only to match the pointer type passed to cuttExecute.
370  cuttExecute(plan, const_cast<DataT*>(in.LockedBuffer()), out.Buffer()));
371 }
372 
// Permute the minibatch one sample (column) at a time.
//
// `sample_plan` is created on first use through get_sample_plan() and
// written back through the reference, so later calls with the same
// reference reuse it. Each column is then processed by its own
// cuttExecute call, with the input and output pointers advanced by the
// respective matrix's leading dimension.
// NOTE(review): the line carrying the return type and function name
// (listing line 374) is absent from this extract.
373 template <typename DataT>
375  Plan& sample_plan,
376  ColMajorPerm const& perm,
377  DimsType const& in_dims,
378  DimsType const& out_dims,
379  El::Matrix<DataT, El::Device::GPU> const& in,
380  El::Matrix<DataT, El::Device::GPU>& out) const
381 {
 // Built from the sync info of both matrices and held until scope exit.
 // NOTE(review): presumably orders work between the two streams --
 // El::MakeMultiSync is defined elsewhere; confirm.
382  auto multisync =
383  El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
 // A zero handle means "no plan yet": build it lazily.
384  if (sample_plan == 0U)
385  sample_plan = get_sample_plan(perm, in_dims, out_dims, in, out);
386 
 // Constness is cast away only to match the pointer type passed to
 // cuttExecute; the input is obtained through LockedBuffer().
387  DataT* const in_buf = const_cast<DataT*>(in.LockedBuffer());
388  DataT* const out_buf = out.Buffer();
389 
390  auto const batch_size = in.Width();
391  auto const in_stride = in.LDim();
392  auto const out_stride = out.LDim();
 // Column `sample` starts `sample * LDim()` elements into each buffer.
393  for (El::Int sample = 0; sample < batch_size; ++sample) {
394  LBANN_CHECK_CUTT(cuttExecute(sample_plan,
395  in_buf + sample * in_stride,
396  out_buf + sample * out_stride));
397  }
398 }
399 
401 {
402  std::swap(m_perm, other.m_perm);
403  std::swap(m_inv_perm, other.m_inv_perm);
404  std::swap(m_input_dims, other.m_input_dims);
405  std::swap(m_output_dims, other.m_output_dims);
406 }
407 
408 } // namespace lbann
409 #undef LBANN_CHECK_CUTT
410 #endif // LBANN_SRC_LAYERS_TRANSFORM_CUTT_PERMUTEIMPL_HPP_INCLUDED
void set_dims(DimsType input_dims)
Setup the dimensions.
cuttHandle get_mb_plan(PlanMap &plan_map, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
cuttHandle get_sample_plan(ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
#define LBANN_ASSERT_DEBUG(cond)
Definition: exception.hpp:104
std::unordered_map< BatchSizeT, Plan > PlanMap
The base exception for LBANN errors.
Definition: exception.hpp:118
char const * what() const noexcept override
cuTT-based implementation of tensor permute.
ColMajorPerm const & perm() const noexcept
constexpr El::Device Device
cuTT_PermuteImpl(ColMajorPerm perm)
void permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Permute the tensor.
void do_sample_permute(Plan &plan, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
void do_mb_permute(PlanMap &plan_map, ColMajorPerm const &perm, DimsType const &in_dims, DimsType const &out_dims, El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
std::vector< T > & get() noexcept
auto size() const noexcept
bool is_mb_permutable(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > const &out) const
DimsType const & output_dims() const noexcept
void inverse_permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Apply the inverse permutation to the tensor.
static char const * cutt_err_string(cuttResult err) noexcept
#define LBANN_CHECK_CUTT(cmd)
RowMajorPerm invert(RowMajorPerm const &in)
bool is_valid(RowMajorPerm const &perm)
DimsType const & input_dims() const noexcept
auto permute_dims(RowMajorDims< IndexT > const &in, RowMajorPerm const &perm)
void swap(cuTT_PermuteImpl &other)