LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
cutensor_permuteimpl.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 #ifndef LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED
27 #define LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED
28 
29 #include "lbann/base.hpp" // Elemental support.
31 #include "lbann/utils/typename.hpp"
32 
33 #include "tensor_dims_utils.hpp"
34 
35 // This is only separated to make it easier if we "formally accept"
36 // cuTENSOR like we have cuDNN or cuBLAS, etc.
37 #include "cutensor_support.hpp"
38 
#include <iterator>
#include <numeric>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
45 
46 namespace lbann {
47 
56 {
57 public:
60  using ModesType = std::vector<int32_t>;
61 
62 public:
64 
67 
69 
70 
72  ColMajorPerm const& perm() const noexcept;
73 
74  DimsType const& input_dims() const noexcept;
75  DimsType const& output_dims() const noexcept;
76 
77  ModesType const& input_modes() const noexcept;
78  ModesType const& output_modes() const noexcept;
79 
80  StridesType input_strides() const;
82 
84 
85 
92 
104  template <typename DataT>
105  void permute(El::Matrix<DataT, El::Device::GPU> const& in,
106  El::Matrix<DataT, El::Device::GPU>& out) const;
107 
115  template <typename DataT>
116  void inverse_permute(El::Matrix<DataT, El::Device::GPU> const& in,
117  El::Matrix<DataT, El::Device::GPU>& out) const;
118 
120 
121  void swap(cuTENSOR_PermuteImpl& other);
124 
125 private:
129  inline static std::unordered_map<std::string, cutensorTensorDescriptor_t>
131 
132 private:
133  static std::vector<int32_t> make_modes(size_t const ndims);
134  template <typename DataT>
135  static std::string get_desc_key(El::Matrix<DataT, El::Device::GPU> const& mat,
136  DimsType const& dims);
137  template <typename DataT>
138  static cutensorTensorDescriptor_t
139  get_descriptor(El::Matrix<DataT, El::Device::GPU> const& mat,
140  DimsType const& dims);
141 
142 private:
148 }; // class cuTENSOR_PermuteImpl
149 
151  : m_perm{std::move(perm)},
154 {
156 }
157 
158 inline auto cuTENSOR_PermuteImpl::perm() const noexcept -> ColMajorPerm const&
159 {
160  return m_perm;
161 }
162 
163 inline auto cuTENSOR_PermuteImpl::input_dims() const noexcept -> DimsType const&
164 {
165  return m_input_dims;
166 }
167 
168 inline auto cuTENSOR_PermuteImpl::output_dims() const noexcept
169  -> DimsType const&
170 {
171  return m_output_dims;
172 }
173 
174 inline auto cuTENSOR_PermuteImpl::input_modes() const noexcept
175  -> ModesType const&
176 {
177  return m_input_modes;
178 }
179 
180 inline auto cuTENSOR_PermuteImpl::output_modes() const noexcept
181  -> ModesType const&
182 {
183  return m_output_modes;
184 }
185 
187 {
188  return get_strides(m_input_dims);
189 }
190 
192 {
193  return get_strides(m_output_dims);
194 }
195 
196 inline auto cuTENSOR_PermuteImpl::make_modes(size_t const ndims) -> ModesType
197 {
198  std::vector<int32_t> modes(ndims + 1); // Add the sample dim.
199  std::iota(begin(modes), end(modes), static_cast<int>('a'));
200  return modes;
201 }
202 
203 template <typename DataT>
205  El::Matrix<DataT, El::Device::GPU> const& mat,
206  DimsType const& dims_in)
207 {
208  auto const& dims = dims_in.get();
209  std::ostringstream oss;
210  oss << mat.Height() << "," << mat.Width() << "," << mat.LDim() << ";"
211  << dims.front();
212  for (size_t ii = 1; ii < dims.size(); ++ii)
213  oss << "," << dims[ii];
214  oss << ";" << lbann::TypeName<DataT>();
215  return oss.str();
216 }
217 
218 template <typename DataT>
219 cutensorTensorDescriptor_t cuTENSOR_PermuteImpl::get_descriptor(
220  El::Matrix<DataT, El::Device::GPU> const& mat,
221  DimsType const& dims)
222 {
223  auto key = get_desc_key(mat, dims); // captures Width to account for
224  // minibatch size and LDim to
225  // account for stride.
226  auto iter = m_desc_map.find(key);
227  if (iter == end(m_desc_map)) {
228  std::vector<int64_t> extents = dims.get();
229  extents.push_back(mat.Width()); // Don't forget MB size
230 
231  auto strides = get_strides(dims);
232  strides.get().push_back(mat.LDim()); // Don't forget sample stride.
233 
234  cutensorTensorDescriptor_t desc;
235  CHECK_CUTENSOR(cutensorInitTensorDescriptor(get_handle_ptr(),
236  &desc,
237  extents.size(),
238  extents.data(),
239  strides.get().data(),
240  CUDAType<DataT>,
241  CUTENSOR_OP_IDENTITY));
242  m_desc_map.emplace(std::move(key), desc);
243  return desc;
244  }
245  return iter->second;
246 }
247 
249 {
250  m_input_dims = std::move(input_dims);
252 }
253 
254 template <typename DataT>
256  El::Matrix<DataT, El::Device::GPU> const& in,
257  El::Matrix<DataT, El::Device::GPU>& out) const
258 {
259  auto const in_desc = get_descriptor(in, m_input_dims);
260  auto const out_desc = get_descriptor(out, m_output_dims);
261 
262  auto const one = El::To<CUDAScalar<DataT>>(1.f);
263  auto multisync =
264  El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
265 
266  // This permutation is input_modes -> output_modes.
267  CHECK_CUTENSOR(cutensorPermutation(
268  get_handle_ptr(),
269  &one,
270  in.LockedBuffer(),
271  &in_desc,
272  m_input_modes.data(),
273  out.Buffer(),
274  &out_desc,
275  m_output_modes.data(),
276  CUDAScalarType<DataT>,
277  static_cast<El::SyncInfo<El::Device::GPU>>(multisync).Stream()));
278 }
279 
280 template <typename DataT>
282  El::Matrix<DataT, El::Device::GPU> const& in,
283  El::Matrix<DataT, El::Device::GPU>& out) const
284 {
285  // This permutation is output_modes -> input_modes. Use some aliases
286  // to help.
287  auto const& in_dims = m_output_dims;
288  auto const& out_dims = m_input_dims;
289 
290  auto const& in_modes = m_output_modes;
291  auto const& out_modes = m_input_modes;
292 
293  // This part matches the regular "permute" function
294  auto const in_desc = get_descriptor(in, in_dims);
295  auto const out_desc = get_descriptor(out, out_dims);
296 
297  auto const one = El::To<CUDAScalar<DataT>>(1.f);
298  auto multisync =
299  El::MakeMultiSync(El::SyncInfoFromMatrix(out), El::SyncInfoFromMatrix(in));
300 
301  CHECK_CUTENSOR(cutensorPermutation(
302  get_handle_ptr(),
303  &one,
304  in.LockedBuffer(),
305  &in_desc,
306  in_modes.data(),
307  out.Buffer(),
308  &out_desc,
309  out_modes.data(),
310  CUDAScalarType<DataT>,
311  static_cast<El::SyncInfo<El::Device::GPU>>(multisync).Stream()));
312 }
313 
315 {
316  std::swap(m_perm, other.m_perm);
317  std::swap(m_input_dims, other.m_input_dims);
318  std::swap(m_output_dims, other.m_output_dims);
319  std::swap(m_input_modes, other.m_input_modes);
320  std::swap(m_output_modes, other.m_output_modes);
321 }
322 
323 } // namespace lbann
324 #endif // LBANN_SRC_LAYERS_TRANSFORM_CUTENSOR_PERMUTEIMPL_HPP_INCLUDED
ModesType const & output_modes() const noexcept
auto get_strides(ColMajorDims< DimT > const &dims)
Compute packed strides of the given dimensions.
auto permute_impl(std::vector< IndexT > const &in, std::vector< PermT > const &perm)
void inverse_permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Apply the inverse permutation to the tensor.
static cutensorHandle_t * get_handle_ptr()
#define CHECK_CUTENSOR(cmd)
#define LBANN_ASSERT_DEBUG(cond)
Definition: exception.hpp:104
DimsType const & input_dims() const noexcept
ModesType const & input_modes() const noexcept
static std::unordered_map< std::string, cutensorTensorDescriptor_t > m_desc_map
Keep track of descriptors so we don't have to repeatedly rebuild them.
static std::string get_desc_key(El::Matrix< DataT, El::Device::GPU > const &mat, DimsType const &dims)
static cutensorTensorDescriptor_t get_descriptor(El::Matrix< DataT, El::Device::GPU > const &mat, DimsType const &dims)
void permute(El::Matrix< DataT, El::Device::GPU > const &in, El::Matrix< DataT, El::Device::GPU > &out) const
Permute the tensor.
constexpr El::Device Device
std::vector< int32_t > ModesType
DimsType const & output_dims() const noexcept
std::vector< T > & get() noexcept
auto size() const noexcept
void swap(cuTENSOR_PermuteImpl &other)
cuTENSOR-based implementation of tensor permute.
static std::vector< int32_t > make_modes(size_t const ndims)
bool is_valid(RowMajorPerm const &perm)
auto permute_dims(RowMajorDims< IndexT > const &in, RowMajorPerm const &perm)
void set_dims(DimsType input_dims)
Setup the dimensions.
ColMajorPerm const & perm() const noexcept