LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
serialize_matrices_impl.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 #pragma once
27 #ifndef LBANN_UTILS_SERIALIZATION_SERIALIZE_MATRICES_IMPL_HPP_
28 #define LBANN_UTILS_SERIALIZATION_SERIALIZE_MATRICES_IMPL_HPP_
29 
31 
32 #include <El/blas_like/level1/Copy/Translate.hpp>
33 #include <El/blas_like/level1/Copy/TranslateBetweenGrids.hpp>
34 
35 // These really belong in Elemental; let's just extend that.
36 namespace El {
37 
38 template <typename ArchiveT, typename T>
39 void save(ArchiveT& ar, ::El::AbstractMatrix<T> const& mat)
40 {
41  switch (mat.GetDevice()) {
42  case ::El::Device::CPU:
43  save(ar, static_cast<::El::Matrix<T, ::El::Device::CPU> const&>(mat));
44  break;
45 #ifdef LBANN_HAS_GPU
46  case ::El::Device::GPU:
47  save(ar, static_cast<::El::Matrix<T, ::El::Device::GPU> const&>(mat));
48  break;
49 #endif // LBANN_HAS_GPU
50  default:
51  LBANN_ERROR("Unknown device.");
52  }
53 }
54 
55 template <typename ArchiveT,
56  typename T,
57  ::El::Device D,
59 void save(ArchiveT& ar, ::El::Matrix<T, D> const& mat)
60 {
61  LBANN_ASSERT(!mat.Viewing());
62  ar(::cereal::make_nvp("height", mat.Height()),
63  ::cereal::make_nvp("width", mat.Width()));
64 }
65 
66 namespace details {
67 template <typename ArchiveT,
68  typename T,
70 void do_save(ArchiveT& ar, ::El::Matrix<T, ::El::Device::CPU> const& mat)
71 {
72  LBANN_ASSERT(!mat.Viewing());
73  ar(mat.Height(), mat.Width());
74  if (mat.Contiguous()) {
75  ar(::cereal::binary_data(mat.LockedBuffer(),
76  mat.LDim() * mat.Width() * sizeof(T)));
77  }
78  else {
79  for (::El::Int col = 0; col < mat.Width(); ++col)
80  ar(::cereal::binary_data(mat.LockedBuffer() + col * mat.LDim(),
81  mat.Height() * sizeof(T)));
82  }
83 }
84 
85 #ifdef LBANN_HAS_GPU
86 template <typename ArchiveT,
87  typename T,
89 void do_save(ArchiveT& ar, ::El::Matrix<T, ::El::Device::GPU> const& mat)
90 {
91  ::El::Matrix<T, ::El::Device::CPU> cpu_mat(mat);
92  do_save(ar, cpu_mat);
93 }
94 #endif // LBANN_HAS_GPU
95 } // namespace details
96 
98 template <typename ArchiveT,
99  typename T,
100  ::El::Device D,
102 void save(ArchiveT& ar, ::El::Matrix<T, D> const& mat)
103 {
104  LBANN_ASSERT(!mat.Viewing());
105  details::do_save(ar, mat);
106 }
107 
// Save a matrix through an archive object that provides save_on_root().
//
// Asserts that the matrix is not a view, then hands it to
// ar.save_on_root().
//
// NOTE(review): listing line 109 -- the function name and the
// declaration of the `ar` parameter -- is missing from this recovered
// listing, so the archive type cannot be read from this file. Presumably
// this is a `save` overload for a rooted output archive; restore the
// line from the original header before compiling.
108 template <typename ArchiveT, typename T, ::El::Device D>
110  ::El::Matrix<T, D> const& mat)
111 {
112  LBANN_ASSERT(!mat.Viewing());
113  // Forward to the underlying archive on Root.
114  ar.save_on_root(mat);
115 }
116 
// Load a matrix whose device is only known at run time.
//
// Inspects mat.GetDevice() and forwards to the load() overload for the
// matching concrete ::El::Matrix<T, D> type. The GPU case is only
// compiled when LBANN_HAS_GPU is defined. Any other device value ends in
// LBANN_ERROR("Unknown device.").
//
// NOTE(review): listing line 119 (the last template parameter) is
// missing from this recovered listing, leaving the template parameter
// list unterminated. Which constraint it held cannot be determined from
// this file -- restore it from the original header.
117 template <typename ArchiveT,
118  typename T,
120 void load(ArchiveT& archive, ::El::AbstractMatrix<T>& mat)
121 {
122  switch (mat.GetDevice()) {
123  case ::El::Device::CPU:
124  load(archive, static_cast<::El::Matrix<T, ::El::Device::CPU>&>(mat));
125  break;
126 #ifdef LBANN_HAS_GPU
127  case ::El::Device::GPU:
128  load(archive, static_cast<::El::Matrix<T, ::El::Device::GPU>&>(mat));
129  break;
130 #endif // LBANN_HAS_GPU
131  default:
132  LBANN_ERROR("Unknown device.");
133  }
134 }
135 
136 template <typename ArchiveT,
137  typename T,
138  ::El::Device D,
140 void load(ArchiveT& archive, ::El::Matrix<T, D>& mat)
141 {
142  LBANN_ASSERT(!mat.Viewing());
143  ::El::Int height, width;
144  archive(CEREAL_NVP(height), CEREAL_NVP(width));
145  mat.Resize(height, width);
146 }
147 
148 template <typename ArchiveT,
149  typename T,
151 void load(ArchiveT& archive, ::El::Matrix<T, ::El::Device::CPU>& mat)
152 {
153  LBANN_ASSERT(!mat.Viewing());
154  ::El::Int height, width;
155  archive(CEREAL_NVP(height), CEREAL_NVP(width));
156  mat.Resize(height, width);
157  archive(::cereal::binary_data(mat.Buffer(),
158  mat.Height() * mat.Width() * sizeof(T)));
159 }
160 
#if defined LBANN_HAS_GPU
// Load a GPU matrix from a non-text-based archive.
//
// Loads into a temporary CPU matrix through the CPU load() overload and
// then copies the result into `mat` with ::El::Copy. The matrix must not
// be a view.
//
// NOTE(review): the final template parameter was missing from the
// listing this file was recovered from; it is restored as the
// non-text-archive SFINAE helper to match the CPU overload -- confirm
// against the declaration in the matching header.
template <typename ArchiveT,
          typename T,
          ::lbann::utils::WhenNotTextArchive<ArchiveT>>
void load(ArchiveT& archive, ::El::Matrix<T, ::El::Device::GPU>& mat)
{
  LBANN_ASSERT(!mat.Viewing());
  ::El::Matrix<T, ::El::Device::CPU> cpu_mat;
  load(archive, cpu_mat);
  ::El::Copy(cpu_mat, mat);
}
#endif // defined LBANN_HAS_GPU
173 
// Load a matrix through an archive object that provides load_on_root(),
// then replicate it to the other processes.
//
// Steps, as written below: ar.load_on_root(mat); broadcast the height and
// the width from ar.root() over ar.grid().Comm(); resize the matrix on
// every process for which ar.am_root() is false; broadcast the matrix
// itself over the same communicator from the same root.
//
// NOTE(review): listing line 175 -- the function name and the
// declaration of the `ar` parameter -- is missing from this recovered
// listing, so the archive type cannot be read from this file. Presumably
// this is a `load` overload for a rooted input archive; restore the line
// from the original header before compiling.
174 template <typename ArchiveT, typename T, ::El::Device D>
176  ::El::Matrix<T, D>& mat)
177 {
178  LBANN_ASSERT(!mat.Viewing());
179 
180  // Restore the local matrix, then handle the Bcast
181  ar.load_on_root(mat);
182 
183  // First broadcast the size information.
184  auto height = mat.Height();
185  auto width = mat.Width();
186  ::El::mpi::Broadcast(height,
187  ar.root(),
188  ar.grid().Comm(),
189  ::El::SyncInfo<::El::Device::CPU>{});
190  ::El::mpi::Broadcast(width,
191  ar.root(),
192  ar.grid().Comm(),
193  ::El::SyncInfo<::El::Device::CPU>{});
194  // Resize _should_ be a no-op if the size doesn't change, but I'm
195  // not actually 100% sure.
196  if (!ar.am_root())
197  mat.Resize(height, width);
198 
199  // Finally the matrix data.
200  ::El::Broadcast(static_cast<::El::AbstractMatrix<T>&>(mat),
201  ar.grid().Comm(),
202  ar.root());
203 }
204 
205 // DistMatrix
206 
207 template <typename ArchiveT,
208  typename T,
210 void save(ArchiveT& ar, ::El::AbstractDistMatrix<T> const& mat)
211 {
212  LBANN_ASSERT(!mat.Viewing());
213  ar(::cereal::make_nvp("global_height", mat.Height()),
214  ::cereal::make_nvp("global_width", mat.Width()));
215 }
216 
217 template <typename ArchiveT,
218  typename T,
220 void load(ArchiveT& ar, ::El::AbstractDistMatrix<T>& mat)
221 {
222  LBANN_ASSERT(!mat.Viewing());
223  ::El::Int global_height, global_width;
224  ar(::cereal::make_nvp("global_height", global_height),
225  ::cereal::make_nvp("global_width", global_width));
226  mat.Resize(global_height, global_width);
227 }
228 
229 template <typename ArchiveT,
230  typename T,
232 void save(ArchiveT& ar, ::El::AbstractDistMatrix<T> const& mat)
233 {
234  LBANN_ASSERT(!mat.Viewing());
235  // Binary archives don't use NVPs, so there's no point in making
236  // them here.
237  ar(mat.Height(), mat.Width(), mat.LockedMatrix());
238 }
239 
240 template <typename ArchiveT,
241  typename T,
243 void load(ArchiveT& ar, ::El::AbstractDistMatrix<T>& mat)
244 {
245  LBANN_ASSERT(!mat.Viewing());
246  ::El::Int global_height, global_width;
247  ar(global_height, global_width);
248  mat.Resize(global_height, global_width);
249 #ifdef LBANN_DEBUG
250  ::El::Matrix<T, ::El::Device::CPU> mat_cpu;
251  ar(mat_cpu);
252  LBANN_ASSERT(mat_cpu.Height() == mat.LocalHeight());
253  LBANN_ASSERT(mat_cpu.Width() == mat.LocalWidth());
254  mat.Matrix() = mat_cpu;
255 #else
256  ar(mat.Matrix());
257 #endif
258 }
259 
// Write the global dimensions of a distributed matrix as the name-value
// pairs "global_height" and "global_width". No entries are written.
//
// NOTE(review): listing lines 262-263 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is the text-based
// `save` overload for a rooted output archive; restore the lines from
// the original header before compiling.
260 template <typename ArchiveT,
261  typename T,
264  ::El::AbstractDistMatrix<T> const& mat)
265 {
266  ar(::cereal::make_nvp("global_height", mat.Height()),
267  ::cereal::make_nvp("global_width", mat.Width()));
268 }
269 
// Read the name-value pairs "global_height" and "global_width" and
// resize the distributed matrix to those dimensions. No entries are read.
//
// NOTE(review): listing lines 272-273 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is the text-based
// `load` overload for a rooted input archive; restore the lines from the
// original header before compiling.
270 template <typename ArchiveT,
271  typename T,
274  ::El::AbstractDistMatrix<T>& mat)
275 {
276  El::Int height, width;
277  ar(::cereal::make_nvp("global_height", height),
278  ::cereal::make_nvp("global_width", width));
279  mat.Resize(height, width);
280 }
281 
// Save a distributed matrix by first gathering it into a CIRC,CIRC CPU
// matrix that lives on the archive's grid and root.
//
// `circ_mat` is constructed from `mat`; `circ_mat_ar` is constructed on
// ar.grid() with root ar.root(). If both report equal DistData() the
// data is moved across, otherwise it is transferred with
// ::El::copy::Translate. The result is passed to save().
//
// NOTE(review): listing lines 284-285 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is the
// non-text `save` overload for a rooted output archive; restore the
// lines from the original header before compiling.
282 template <typename ArchiveT,
283  typename T,
286  ::El::AbstractDistMatrix<T> const& mat)
287 {
288  LBANN_ASSERT(!mat.Viewing());
289  using CircMatType = ::El::
290  DistMatrix<T, ::El::CIRC, ::El::CIRC, ::El::ELEMENT, ::El::Device::CPU>;
291  CircMatType circ_mat(mat);
292  CircMatType circ_mat_ar(ar.grid(), ar.root());
293  if (circ_mat.DistData() == circ_mat_ar.DistData()) {
294  circ_mat_ar = std::move(circ_mat);
295  }
296  else {
297  ::El::copy::Translate(circ_mat, circ_mat_ar);
298  }
299  save(ar, circ_mat_ar);
300 }
301 
// Load a distributed matrix by reading into a CIRC,CIRC CPU matrix on
// the archive's grid and root, then redistributing into `mat`.
//
// `circ_mat_ar` (on ar.grid(), root ar.root()) is filled by load().
// `circ_mat` is constructed on mat.Grid() with root mat.Root(). If both
// report equal DistData() the data is moved across, otherwise it is
// transferred with ::El::copy::Translate. Finally ::El::Copy distributes
// `circ_mat` into `mat`.
//
// NOTE(review): listing lines 304-305 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is the
// non-text `load` overload for a rooted input archive; restore the
// lines from the original header before compiling.
302 template <typename ArchiveT,
303  typename T,
306  ::El::AbstractDistMatrix<T>& mat)
307 {
308  LBANN_ASSERT(!mat.Viewing());
309  using CircMatType = ::El::
310  DistMatrix<T, ::El::CIRC, ::El::CIRC, ::El::ELEMENT, ::El::Device::CPU>;
311 
312  // Do the root process read.
313  CircMatType circ_mat(mat.Grid(), mat.Root());
314  CircMatType circ_mat_ar(ar.grid(), ar.root());
315  load(ar, circ_mat_ar);
316  if (circ_mat.DistData() == circ_mat_ar.DistData()) {
317  circ_mat = std::move(circ_mat_ar);
318  }
319  else {
320  ::El::copy::Translate(circ_mat_ar, circ_mat);
321  }
322 
323  // Distribute the data
324  ::El::Copy(circ_mat, mat);
325 }
326 
// Save a CIRC,CIRC distributed matrix that already lives on the
// archive's grid and root.
//
// Asserts that the matrix is not a view and that its Grid() and Root()
// equal ar.grid() and ar.root(). Writes "global_height" and
// "global_width", then passes the matrix from mat.LockedMatrix(), wrapped
// as the name-value pair "matrix_data", to save().
//
// NOTE(review): listing lines 329-330 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is a `save`
// overload for a rooted output archive; restore the lines from the
// original header before compiling.
327 template <typename ArchiveT,
328  typename T,
331  ::El::DistMatrix<T, ::El::CIRC, ::El::CIRC> const& mat)
332 {
333  LBANN_ASSERT(!mat.Viewing());
334  LBANN_ASSERT(mat.Grid() == ar.grid());
335  LBANN_ASSERT(mat.Root() == ar.root());
336  ar(::cereal::make_nvp("global_height", mat.Height()),
337  ::cereal::make_nvp("global_width", mat.Width()));
338  save(ar, ::cereal::make_nvp("matrix_data", mat.LockedMatrix()));
339 }
340 
// Load a CIRC,CIRC distributed matrix that already lives on the
// archive's grid and root.
//
// Asserts that the matrix is not a view and that its Grid() and Root()
// equal ar.grid() and ar.root(). Reads "global_height" and
// "global_width", resizes the matrix, and then restores mat.Matrix()
// through ar.load_on_root().
//
// NOTE(review): listing lines 343-344 -- the last template parameter,
// the function name, and the declaration of the `ar` parameter -- are
// missing from this recovered listing. Presumably this is a `load`
// overload for a rooted input archive; restore the lines from the
// original header before compiling.
341 template <typename ArchiveT,
342  typename T,
345  ::El::DistMatrix<T, ::El::CIRC, ::El::CIRC>& mat)
346 {
347  LBANN_ASSERT(!mat.Viewing());
348  LBANN_ASSERT(mat.Grid() == ar.grid());
349  LBANN_ASSERT(mat.Root() == ar.root());
350 
351  // Restore the height/width using the usual mechanism, but WAIT on
352  // the matrix, since the local matrix of CIRC,CIRC matrix is not
353  // Bcast.
354  ::El::Int height, width;
355  ar(::cereal::make_nvp("global_height", height),
356  ::cereal::make_nvp("global_width", width));
357 
358  // Restore the matrix data on the root process.
359  mat.Resize(height, width);
360  ar.load_on_root(mat.Matrix());
361 }
362 
363 } // namespace El
364 
365 // Dealing with smart pointers and object construction
366 
367 namespace cereal {
368 
369 template <typename DataT,
370  ::El::Dist CDist,
371  ::El::Dist RDist,
372  ::El::DistWrap Wrap,
373  ::El::Device D>
374 template <
375  typename ArchiveT,
376  ::h2::meta::EnableWhen<::lbann::utils::IsBuiltinArchive<ArchiveT>, int>>
377 void LoadAndConstruct<::El::DistMatrix<DataT, CDist, RDist, Wrap, D>>::
378  load_and_construct(ArchiveT& ar, cereal::construct<DistMatrixType>& construct)
379 {
380  // Construct the matrix on the right grid.
382  construct(g, /*root=*/0);
383 
384  // Use the regular load function to restore its state. NOTE: do
385  // *not* use ArchiveT::operator() here because it trys to open a
386  // new scope, which can cause a variety of errors depending on the
387  // underlying archive type.
388  load(ar, *construct.ptr());
389 }
390 
// Construct a DistMatrix in place on the archive's grid and restore its
// state.
//
// The matrix is constructed on ar.grid() with root 0 and then filled by
// the regular load() overload for the archive type.
//
// NOTE(review): listing line 398 -- the function name and the
// declaration of the `ar` parameter -- is missing from this recovered
// listing, so the archive type cannot be read from this file. Presumably
// this is `load_and_construct` for a rooted input archive; restore the
// line from the original source before compiling.
391 template <typename DataT,
392  ::El::Dist CDist,
393  ::El::Dist RDist,
394  ::El::DistWrap Wrap,
395  ::El::Device D>
396 template <typename ArchiveT>
397 void LoadAndConstruct<::El::DistMatrix<DataT, CDist, RDist, Wrap, D>>::
399  cereal::construct<DistMatrixType>& construct)
400 {
401  construct(ar.grid(), /*root=*/0);
402  load(ar, *construct.ptr());
403 }
404 
405 } // namespace cereal
406 
407 #endif // LBANN_UTILS_SERIALIZATION_SERIALIZE_MATRICES_IMPL_HPP_
El::Grid Grid
Definition: base.hpp:126
#define LBANN_ERROR(...)
Definition: exception.hpp:37
EnableWhen<!IsTextArchive< ArchiveT > &&IsBuiltinArchive< ArchiveT >, ResultT > WhenNotTextArchive
SFINAE helper for splitting text-based and non-text-based serialization functions.
El::Grid const & grid() const noexcept
Grid const & get_current_grid() noexcept
Get the current grid being used for deserialization.
constexpr El::Device Device
#define LBANN_ASSERT(cond)
Definition: exception.hpp:97
void load(ArchiveT &archive, ::El::AbstractMatrix< T > &mat)
void save(ArchiveT &ar, ::El::AbstractMatrix< T > const &mat)
Save a matrix to a text-based archive.
EnableWhen< IsTextArchive< ArchiveT > &&IsBuiltinArchive< ArchiveT >, ResultT > WhenTextArchive
SFINAE helper for splitting text-based and non-text-based serialization functions.
void do_save(ArchiveT &ar, ::El::Matrix< T, ::El::Device::CPU > const &mat)
Save a CPU matrix to a non-text-based archive.
El::Grid const & grid() const noexcept
::distconv::tensor::Distribution Dist