27 #ifndef LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED 28 #define LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED 40 template <El::Device Device>
48 #if defined AL_HAS_NCCL 50 #elif defined AL_HAS_HOST_TRANSFER 51 using BackendT = ::Al::HostTransferBackend;
56 using ReqT =
typename BackendT::req_type;
82 template <El::Device Device>
84 El::AbstractMatrix<DataType>& Linv,
85 const El::AbstractMatrix<DataType>& A,
88 DataType damping_bn_err,
90 const El::SyncInfo<Device>& sync_info);
93 template <El::Device Device>
95 El::AbstractMatrix<DataType>& Linv,
96 const El::AbstractMatrix<DataType>& A,
99 DataType damping_bn_err,
101 const El::SyncInfo<Device>& sync_info);
104 template <El::Device Device>
109 template <El::Device Device>
111 El::AbstractMatrix<DataType>& AL,
113 const El::SyncInfo<Device>& sync_info);
119 template <El::Device Device>
121 const std::vector<std::pair<
size_t, El::AbstractMatrix<DataType>*>>& blocks,
122 El::Matrix<DataType, Device>& global_buffer,
130 template <El::Device Device>
132 const std::vector<std::pair<
size_t, El::AbstractMatrix<DataType>*>>& blocks,
133 El::Matrix<DataType, Device>& send_buffer,
134 El::Matrix<DataType, Device>& recv_buffer,
139 template <El::Device Device>
142 El::Matrix<DataType, Device>& global_buffer,
146 template <El::Device Device>
149 El::Matrix<double, El::Device::CPU>& global_buffer,
153 template <El::Device Device>
156 DataType value_bn_err,
158 const El::SyncInfo<Device>& sync_info);
161 template <El::Device Device>
163 El::Matrix<DataType, Device>& B,
165 DataType value_bn_err,
167 const El::SyncInfo<Device>& sync_info);
170 template <El::Device Device>
172 El::Matrix<DataType, Device>& output,
173 const El::SyncInfo<Device>& sync_info);
176 template <El::Device Device>
178 const El::SyncInfo<Device>& sync_info);
183 template <El::Device Device>
185 const El::Matrix<DataType, Device>& A,
188 const El::SyncInfo<Device>& sync_info);
193 template <El::Device Device>
194 void identity(El::Matrix<DataType, Device>& A,
195 const El::SyncInfo<Device>& sync_info);
198 template <El::Device Device>
200 const El::Matrix<DataType, Device>& A,
201 const El::SyncInfo<Device>& sync_info);
204 template <El::Device Device>
206 const El::Matrix<DataType, Device>& L,
207 const El::SyncInfo<Device>& sync_info);
209 template <
typename T, El::Device Device>
211 const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
212 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
213 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& subset,
214 std::vector<ReqT>& Requests);
216 template <
typename T, El::Device Device>
218 const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
219 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
221 El::Int currentBatchSize,
222 std::vector<ReqT>& Requests);
224 template <
typename T, El::Device Device>
226 const El::DistMatrix<T, El::STAR, El::STAR, El::ELEMENT, Device>& A,
227 El::DistMatrix<T, El::STAR, El::STAR, El::ELEMENT, Device>& B,
228 std::vector<ReqT>& Requests);
230 template <
typename T, El::Device Device>
232 const El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& A,
233 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B,
234 std::vector<ReqT>& Requests);
236 template <
typename T, El::Device Device>
238 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>
const& A,
239 El::DistMatrix<T, El::STAR, El::VC, El::ELEMENT, Device>& B);
244 #endif // LBANN_EXECUTION_ALGORITHMS_KFAC_KFAC_UTIL_HPP_INCLUDED void add_to_diagonal(El::Matrix< DataType, Device > &A, DataType value, DataType value_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Add the damping value to the diagonal elements of A.
void allgather_inverse_matrices(const std::vector< std::shared_ptr< kfac_block< Device >>> &blocks, El::Matrix< DataType, Device > &global_buffer, lbann_comm *comm)
Perform allgather for inverse matrices.
void unpack_lower_tri(El::Matrix< DataType, Device > &A, const El::Matrix< DataType, Device > &L, const El::SyncInfo< Device > &sync_info)
Unpack the lower triangular of a symmetric matrix.
::Al::MPIBackend BackendT
void allreduce_lower_tri(El::AbstractMatrix< DataType > &A, El::AbstractMatrix< DataType > &AL, lbann_comm *comm, const El::SyncInfo< Device > &sync_info)
Perform all-reduce on the lower triangular of a symmetric matrix.
void TranslateBetweenGridsKFACAsync(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, std::vector< ReqT > &Requests)
void TranslateBetweenGridsVC(El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > const &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B)
bool is_reduce_scatter_buffer_required(kfac_reduce_scatter_mode mode)
Get whether a global buffer is needed.
void identity(El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Substitute the identity matrix. TODO: Replace with El::Identity<El::Device::GPU> once it gets support...
std::string get_matrix_stat(const El::Matrix< DataType, Device > &X, const char *name)
Gets statistics of a given matrix.
void allgather_inverse_matrices_sizes(const std::vector< std::shared_ptr< kfac_block< Device >>> &blocks, El::Matrix< double, El::Device::CPU > &global_buffer, lbann_comm *comm)
Perform allgather for the sizes of the inverse matrices.
typename BackendT::req_type ReqT
void get_matrix_entrywise_inverse(El::Matrix< DataType, Device > &input, El::Matrix< DataType, Device > &output, const El::SyncInfo< Device > &sync_info)
Get the entrywise inverse of the input matrix, written to output.
void make_diagonal(El::Matrix< DataType, Device > &A, El::Matrix< DataType, Device > &B, DataType value, DataType value_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Add the damping value to the diagonal elements of A from B.
void update_kronecker_average(El::Matrix< DataType, Device > &Aave, const El::Matrix< DataType, Device > &A, size_t count, double decay, const El::SyncInfo< Device > &sync_info)
Update a Kronecker factor matrix using decay.
void fill_upper_tri(El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Fill the upper triangular part with the lower triangular part.
void reduce_scatter_blocks(const std::vector< std::pair< size_t, El::AbstractMatrix< DataType > *>> &blocks, El::Matrix< DataType, Device > &global_buffer, lbann_comm *comm, kfac_reduce_scatter_mode mode)
Perform reduce-scatter on one or more blocks.
std::pair< bool, bool > is_allgather_buffer_required(kfac_allgather_mode mode)
Get whether local and global buffers are needed.
void allgather_blocks(const std::vector< std::pair< size_t, El::AbstractMatrix< DataType > *>> &blocks, El::Matrix< DataType, Device > &send_buffer, El::Matrix< DataType, Device > &recv_buffer, lbann_comm *comm, kfac_allgather_mode mode)
Perform all-gather on one or more blocks.
void TranslateBetweenGridsVCAsync(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &subset, std::vector< ReqT > &Requests)
void get_matrix_inverse_eigen(El::AbstractMatrix< DataType > &Ainv, El::AbstractMatrix< DataType > &Linv, const El::AbstractMatrix< DataType > &A, bool report_time, DataType damping, DataType damping_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Gets the inverse matrix of A using eigenvalue decomposition.
void get_matrix_inverse(El::AbstractMatrix< DataType > &Ainv, El::AbstractMatrix< DataType > &Linv, const El::AbstractMatrix< DataType > &A, bool report_time, DataType damping, DataType damping_bn_err, bool is_bn, const El::SyncInfo< Device > &sync_info)
Gets the inverse matrix of A.
void TranslateBetweenGridsSTARAsync(const El::DistMatrix< T, El::STAR, El::STAR, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::STAR, El::ELEMENT, Device > &B, std::vector< ReqT > &Requests)
void TranslateBetweenGridsVCAsyncDirect(const El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &A, El::DistMatrix< T, El::STAR, El::VC, El::ELEMENT, Device > &B, El::Int featureSize, El::Int currentBatchSize, std::vector< ReqT > &Requests)
void pack_lower_tri(El::Matrix< DataType, Device > &L, const El::Matrix< DataType, Device > &A, const El::SyncInfo< Device > &sync_info)
Pack the lower triangular of a symmetric matrix.