LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
kfac_util.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the License.
26 
27 #ifndef LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED
28 #define LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED
29 
30 #include "lbann/base.hpp"
36 
37 // Forward declarations
38 namespace lbann {
39 class KFAC;
40 template <El::Device Device>
41 class kfac_block;
42 } // namespace lbann
43 
44 namespace lbann {
45 
46 namespace kfac {
47 
48 #if defined AL_HAS_NCCL
49 using BackendT = ::Al::NCCLBackend;
50 #elif defined AL_HAS_HOST_TRANSFER
51 using BackendT = ::Al::HostTransferBackend;
52 #else
53 using BackendT = ::Al::MPIBackend;
54 #endif
55 
56 using ReqT = typename BackendT::req_type;
57 
59 {
60  ALL, // Apply round-robin assignment to all of the layers. May cause load
61  // imbalance.
62  EACH, // Apply round-robin assignment to each type of layer. May
63  // not work well for small networks.
64  ROOT, // Use only the root GPU. This is only for testing.
65 };
66 
68 {
69  ALLREDUCE, // Use lbann_comm::allreduce
70  REDUCE_SCATTER, // Use El::ReduceScatter
71  REDUCE, // Use El::Reduce for each block
72 };
73 
75 {
76  ALLREDUCE, // Use lbann_comm::allreduce
77  ALLGATHER, // Use an allgather collective
78  BROADCAST // Use El::Broadcast for each block
79 };
80 
82 template <El::Device Device>
83 void get_matrix_inverse(El::AbstractMatrix<DataType>& Ainv,
84  El::AbstractMatrix<DataType>& Linv,
85  const El::AbstractMatrix<DataType>& A,
86  bool report_time,
87  DataType damping,
88  DataType damping_bn_err,
89  bool is_bn,
90  const El::SyncInfo<Device>& sync_info);
91 
93 template <El::Device Device>
94 void get_matrix_inverse_eigen(El::AbstractMatrix<DataType>& Ainv,
95  El::AbstractMatrix<DataType>& Linv,
96  const El::AbstractMatrix<DataType>& A,
97  bool report_time,
98  DataType damping,
99  DataType damping_bn_err,
100  bool is_bn,
101  const El::SyncInfo<Device>& sync_info);
102 
104 template <El::Device Device>
105 std::string get_matrix_stat(const El::Matrix<DataType, Device>& X,
106  const char* name);
107 
109 template <El::Device Device>
110 void allreduce_lower_tri(El::AbstractMatrix<DataType>& A,
111  El::AbstractMatrix<DataType>& AL,
112  lbann_comm* comm,
113  const El::SyncInfo<Device>& sync_info);
114 
117 
119 template <El::Device Device>
121  const std::vector<std::pair<size_t, El::AbstractMatrix<DataType>*>>& blocks,
122  El::Matrix<DataType, Device>& global_buffer,
123  lbann_comm* comm,
125 
127 std::pair<bool, bool> is_allgather_buffer_required(kfac_allgather_mode mode);
128 
130 template <El::Device Device>
131 void allgather_blocks(
132  const std::vector<std::pair<size_t, El::AbstractMatrix<DataType>*>>& blocks,
133  El::Matrix<DataType, Device>& send_buffer,
134  El::Matrix<DataType, Device>& recv_buffer,
135  lbann_comm* comm,
136  kfac_allgather_mode mode);
137 
139 template <El::Device Device>
141  const std::vector<std::shared_ptr<kfac_block<Device>>>& blocks,
142  El::Matrix<DataType, Device>& global_buffer,
143  lbann_comm* comm);
144 
146 template <El::Device Device>
148  const std::vector<std::shared_ptr<kfac_block<Device>>>& blocks,
149  El::Matrix<double, El::Device::CPU>& global_buffer,
150  lbann_comm* comm);
151 
153 template <El::Device Device>
154 void add_to_diagonal(El::Matrix<DataType, Device>& A,
155  DataType value,
156  DataType value_bn_err,
157  bool is_bn,
158  const El::SyncInfo<Device>& sync_info);
159 
161 template <El::Device Device>
162 void make_diagonal(El::Matrix<DataType, Device>& A,
163  El::Matrix<DataType, Device>& B,
164  DataType value,
165  DataType value_bn_err,
166  bool is_bn,
167  const El::SyncInfo<Device>& sync_info);
168 
170 template <El::Device Device>
171 void get_matrix_entrywise_inverse(El::Matrix<DataType, Device>& input,
172  El::Matrix<DataType, Device>& output,
173  const El::SyncInfo<Device>& sync_info);
174 
176 template <El::Device Device>
177 void fill_upper_tri(El::Matrix<DataType, Device>& A,
178  const El::SyncInfo<Device>& sync_info);
179 
183 template <El::Device Device>
184 void update_kronecker_average(El::Matrix<DataType, Device>& Aave,
185  const El::Matrix<DataType, Device>& A,
186  size_t count,
187  double decay,
188  const El::SyncInfo<Device>& sync_info);
189 
193 template <El::Device Device>
194 void identity(El::Matrix<DataType, Device>& A,
195  const El::SyncInfo<Device>& sync_info);
196 
198 template <El::Device Device>
199 void pack_lower_tri(El::Matrix<DataType, Device>& L,
200  const El::Matrix<DataType, Device>& A,
201  const El::SyncInfo<Device>& sync_info);
202 
204 template <El::Device Device>
205 void unpack_lower_tri(El::Matrix<DataType, Device>& A,
206  const El::Matrix<DataType, Device>& L,
207  const El::SyncInfo<Device>& sync_info);
208 
209 template <typename T, El::Device Device>
211  const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
212  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
213  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& subset,
214  std::vector<ReqT>& Requests);
215 
216 template <typename T, El::Device Device>
218  const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
219  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
220  El::Int featureSize,
221  El::Int currentBatchSize,
222  std::vector<ReqT>& Requests);
223 
224 template <typename T, El::Device Device>
226  const El::DistMatrix<T, El::STAR, El::STAR, El::ELEMENT, Device>& A,
227  El::DistMatrix<T, El::STAR, El::STAR, El::ELEMENT, Device>& B,
228  std::vector<ReqT>& Requests);
229 
230 template <typename T, El::Device Device>
232  const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
233  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
234  std::vector<ReqT>& Requests);
235 
236 template <typename T, El::Device Device>
238  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device> const& A,
239  El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B);
240 
241 } // namespace kfac
242 } // namespace lbann
243 
244 #endif // LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED
void add_to_diagonal(El::Matrix< DataType, Device > &A, DataType value, DataType value_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Add the damping value to the diagonal elements of A.
void allgather_inverse_matrices(const std::vector< std::shared_ptr< kfac_block< Device >>> &blocks, El::Matrix< DataType, Device > &global_buffer, lbann_comm *comm)
Perform allgather for inverse matrices.
void unpack_lower_tri(El::Matrix< DataType, Device > &A, const El::Matrix< DataType, Device > &L, const El::SyncInfo< Device > &sync_info)
Unpack the lower triangle of a symmetric matrix.
::Al::MPIBackend BackendT
Definition: kfac_block.hpp:46
void allreduce_lower_tri(El::AbstractMatrix< DataType > &A, El::AbstractMatrix< DataType > &AL, lbann_comm *comm, const El::SyncInfo< Device > &sync_info)
Perform all-reduce on the lower triangle of a symmetric matrix.
kfac_reduce_scatter_mode
Definition: kfac_util.hpp:67
void TranslateBetweenGridsKFACAsync(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, std::vector< ReqT > &Requests)
void TranslateBetweenGridsVC(El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > const &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B)
bool is_reduce_scatter_buffer_required(kfac_reduce_scatter_mode mode)
Get whether a global buffer is needed.
void identity(El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Substitute the identity matrix. TODO: Replace with El::Identity<El::Device::GPU> once it gets support...
std::string get_matrix_stat(const El::Matrix< DataType, Device > &X, const char *name)
Gets statistics of a given matrix.
void allgather_inverse_matrices_sizes(const std::vector< std::shared_ptr< kfac_block< Device >>> &blocks, El::Matrix< double, El::Device::CPU > &global_buffer, lbann_comm *comm)
Perform allgather for the sizes of the inverse matrices.
typename BackendT::req_type ReqT
Definition: kfac_block.hpp:49
void get_matrix_entrywise_inverse(El::Matrix< DataType, Device > &input, El::Matrix< DataType, Device > &output, const El::SyncInfo< Device > &sync_info)
Compute the entrywise inverse of the input matrix and store it in output.
void make_diagonal(El::Matrix< DataType, Device > &A, El::Matrix< DataType, Device > &B, DataType value, DataType value_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Add the damping value to the diagonal elements of A from B.
void update_kronecker_average(El::Matrix< DataType, Device > &Aave, const El::Matrix< DataType, Device > &A, size_t count, double decay, const El::SyncInfo< Device > &sync_info)
Update a Kronecker factor matrix using decay.
void fill_upper_tri(El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Fill the upper triangle with the lower triangle.
void reduce_scatter_blocks(const std::vector< std::pair< size_t, El::AbstractMatrix< DataType > *>> &blocks, El::Matrix< DataType, Device > &global_buffer, lbann_comm *comm, kfac_reduce_scatter_mode mode)
Perform reduce-scatter on one or more blocks.
std::pair< bool, bool > is_allgather_buffer_required(kfac_allgather_mode mode)
Get whether local and global buffers are needed.
void allgather_blocks(const std::vector< std::pair< size_t, El::AbstractMatrix< DataType > *>> &blocks, El::Matrix< DataType, Device > &send_buffer, El::Matrix< DataType, Device > &recv_buffer, lbann_comm *comm, kfac_allgather_mode mode)
Perform allgather on one or more blocks.
void TranslateBetweenGridsVCAsync(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &subset, std::vector< ReqT > &Requests)
void get_matrix_inverse_eigen(El::AbstractMatrix< DataType > &Ainv, El::AbstractMatrix< DataType > &Linv, const El::AbstractMatrix< DataType > &A, bool report_time, DataType damping, DataType damping_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Gets the inverse matrix of A using eigenvalue decomposition.
void get_matrix_inverse(El::AbstractMatrix< DataType > &Ainv, El::AbstractMatrix< DataType > &Linv, const El::AbstractMatrix< DataType > &A, bool report_time, DataType damping, DataType damping_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Gets the inverse matrix of A.
void TranslateBetweenGridsSTARAsync(const El::DistMatrix< T, El::STAR, El::STAR, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::STAR, El::ELEMENT, Device > &B, std::vector< ReqT > &Requests)
void TranslateBetweenGridsVCAsyncDirect(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, El::Int featureSize, El::Int currentBatchSize, std::vector< ReqT > &Requests)
void pack_lower_tri(El::Matrix< DataType, Device > &L, const El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Pack the lower triangle of a symmetric matrix.