da/dfc/kfac_8hpp_source.html

 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
 // Produced at the Lawrence Livermore National Laboratory.
 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 //
 // LLNL-CODE-697807.
 // All rights reserved.
 //
 // This file is part of LBANN: Livermore Big Artificial Neural Network
 // Toolkit. For details, see http://software.llnl.gov/LBANN or
 // https://github.com/LLNL/LBANN.
 //
 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
 // may not use this file except in compliance with the License.  You may
 // obtain a copy of the License at:
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.
 #ifndef LBANN_EXECUTION_ALGORITHMS_KFAC_HPP_INCLUDED
 #define LBANN_EXECUTION_ALGORITHMS_KFAC_HPP_INCLUDED

 #include "lbann/data_coordinator/data_coordinator.hpp"
 #include "lbann/execution_algorithms/factory.hpp"
 #include "lbann/execution_algorithms/kfac/execution_context.hpp"
 #include "lbann/execution_algorithms/sgd_execution_context.hpp"
 #include "lbann/execution_algorithms/training_algorithm.hpp"
 #include "lbann/trainers/trainer.hpp"
 #include "lbann/utils/cloneable.hpp"
 #include "lbann/utils/make_abstract.hpp"

 #include <google/protobuf/message.h>
 #include <memory>

 namespace lbann {

 class KFAC final : public TrainingAlgorithm
 {

 public:
   using TermCriteriaType = SGDTerminationCriteria;
   using ExeContextType = kfac::KFACExecutionContext;

 public:

   KFAC(std::string name,
        std::unique_ptr<TermCriteriaType> stop,
        std::vector<double> damping_act_params,
        std::vector<double> damping_err_params,
        std::vector<double> damping_bn_act_params,
        std::vector<double> damping_bn_err_params,
        std::vector<bool> kfac_use_interval,
        size_t damping_warmup_steps,
        double kronecker_decay,
        bool print_time,
        bool print_matrix,
        bool print_matrix_summary,
        bool use_pi,
        std::vector<size_t> update_intervals,
        size_t update_interval_steps,
        kfac::kfac_inverse_strategy inverse_strategy,
        std::vector<std::string> disable_layers,
        double learning_rate_factor,
        double learning_rate_factor_gru,
        size_t compute_interval,
        bool distribute_precondition_compute,
        bool use_eigen_decomposition,
        bool enable_copy_errors,
        bool enable_copy_activations);

   ~KFAC() noexcept = default;
   KFAC(KFAC const& other) = delete;
   KFAC& operator=(const KFAC& other) = delete;
   KFAC(KFAC&& other) = default;
   KFAC& operator=(KFAC&& other) = default;

   std::string get_type() const final;


   void apply(ExecutionContext& context,
              model& m,
              data_coordinator& dc,
              execution_mode mode) final;
   void train(ExeContextType& c,
              model& model,
              data_coordinator& dc,
              TermCriteriaType const& term);

 #ifdef LBANN_HAS_GPU
   constexpr static const El::Device Device = El::Device::GPU;
 #else
   constexpr static const El::Device Device = El::Device::CPU;
 #endif // LBANN_HAS_GPU

   constexpr static const double damping_0_default = 3e-2;
   constexpr static const size_t damping_warmup_steps_default = 100;

   constexpr static const double kronecker_decay_default = 0.99;

   constexpr static const bool prof_sync = true;
   constexpr static const int prof_color = 0;

 protected:
   bool train_mini_batch(ExeContextType& c, model& model, data_coordinator& dc);


   void do_train_begin_cbs(model& model);
   void do_train_end_cbs(model& model);
   void do_epoch_begin_cbs(model& model);
   void do_epoch_end_cbs(model& model);
   void do_batch_begin_cbs(model& model);
   void do_batch_end_cbs(model& model);

   kfac::KFACExecutionContext* do_get_new_execution_context() const final;

   void start_send_recv_inverse_matrices(ExeContextType& context,
                                         lbann_comm* comm);
   void end_send_recv_inverse_matrices(ExeContextType& context,
                                       lbann_comm* comm);

 private:
 #if 1

   void on_forward_prop_end(ExeContextType& context, model& model);
   void on_backward_prop_end(ExeContextType& context, model& model);

 #else

   void compute_kronecker_factors(ExeContextType& context, model& model);

   void invert_kronecker_factors(ExeContextType& context, model& model);

   void precondition_gradients(ExeContextType& context, model& model);
 #endif // 0

   void sync_weights_model(model& model, lbann_comm* comm);
   void start_sync_weights_async(model& model, lbann_comm* comm);
   void end_sync_weights_async(model& model, lbann_comm* comm);

   void start_old_async_weights_model(model& model,
                                      lbann_comm* comm,
                                      ExeContextType& context);
   void end_old_async_weights_model(model& model,
                                    lbann_comm* comm,
                                    ExeContextType& context);
   void allgather_precondition_gradient(lbann_comm& comm,
                                        ExeContextType& context);

   std::unique_ptr<TermCriteriaType> m_stopping_criteria;

   std::vector<double> m_damping_act_params, m_damping_err_params,
     m_damping_bn_act_params, m_damping_bn_err_params;

   size_t m_damping_warmup_steps;

   double m_kronecker_decay;

   bool m_print_time, m_print_matrix, m_print_matrix_summary;

   bool m_use_pi;

   std::vector<size_t> m_update_intervals;

   size_t m_update_interval_steps;

   kfac::kfac_inverse_strategy m_inverse_strategy;

   std::vector<std::string> m_disable_layers;

   double m_learning_rate_factor, m_learning_rate_factor_gru;

   bool m_has_kronecker_inverse = false;

   size_t m_compute_interval;

   bool m_distribute_precondition_compute;

   bool m_enable_copy_errors;

   bool m_enable_copy_activations;

   bool m_use_eigen_decomposition;

   El::Matrix<double, El::Device::CPU> m_inverse_matrices_size;

   int m_global_inverse_buffer_size = 0, m_weight_matrices_buffer_size = 0;

   std::vector<kfac::ReqT> m_inverse_matrix_communication_reqs,
     m_weights_communication_reqs;

   int m_time_span_inverse_comm = 0, m_time_span_inverse_send_recv = 0,
       m_time_span_forward_comm = 0, m_time_span_forward_comm_end = 0,
       m_time_span_backward_comm = 0, m_time_span_backward_comm_end = 0,
       m_time_span_precond_comm = 0, m_time_forward_pass = 0,
       m_time_backward_pass = 0, m_time_kfac = 0;

   std::vector<bool> m_use_KFAC_epoch;

 }; // class KFAC

 } // namespace lbann

 /** @brief Explicit specialization of lbann::make for KFAC.
  *
  *  Declared here only; it takes a protobuf message and returns an owning
  *  pointer to a KFAC object. The definition is not part of this header.
  */
 template <>
 std::unique_ptr<lbann::KFAC> lbann::make<lbann::KFAC>(
   const google::protobuf::Message& msg);

 #endif // LBANN_EXECUTION_ALGORITHMS_KFAC_HPP_INCLUDED
lbann::KFAC::m_time_span_forward_comm_end
int m_time_span_forward_comm_end
Definition: kfac.hpp:274

lbann::KFAC::end_old_async_weights_model
void end_old_async_weights_model(model &model, lbann_comm *comm, ExeContextType &context)

lbann::KFAC::KFAC
KFAC(std::string name, std::unique_ptr< TermCriteriaType > stop, std::vector< double > damping_act_params, std::vector< double > damping_err_params, std::vector< double > damping_bn_act_params, std::vector< double > damping_bn_err_params, std::vector< bool > kfac_use_interval, size_t damping_warmup_steps, double kronecker_decay, bool print_time, bool print_matrix, bool print_matrix_summary, bool use_pi, std::vector< size_t > update_intervals, size_t update_interval_steps, kfac::kfac_inverse_strategy inverse_strategy, std::vector< std::string > disable_layers, double learning_rate_factor, double learning_rate_factor_gru, size_t compute_interval, bool distribute_precondition_compute, bool use_eigen_decomposition, bool enable_copy_errors, bool enable_copy_activations)
Construct KFAC from its component pieces.

lbann::KFAC::allgather_precondition_gradient
void allgather_precondition_gradient(lbann_comm &comm, ExeContextType &context)

lbann::KFAC::m_kronecker_decay
double m_kronecker_decay
The decay factor of kronecker factors.
Definition: kfac.hpp:217

lbann::KFAC::m_global_inverse_buffer_size
int m_global_inverse_buffer_size
Definition: kfac.hpp:266

lbann::KFAC::m_time_span_backward_comm
int m_time_span_backward_comm
Definition: kfac.hpp:275

lbann::KFAC::m_update_intervals
std::vector< size_t > m_update_intervals
Space-separated pairs of the initial and the target update intervals. If only one value is specified...
Definition: kfac.hpp:230

lbann::KFAC::do_epoch_begin_cbs
void do_epoch_begin_cbs(model &model)

lbann::KFAC::do_train_begin_cbs
void do_train_begin_cbs(model &model)

trainer.hpp

lbann::KFAC::m_time_span_inverse_comm
int m_time_span_inverse_comm
Profiling variables.
Definition: kfac.hpp:273

lbann::KFAC::Device
static constexpr const El::Device Device
Definition: kfac.hpp:129

lbann::KFAC::m_distribute_precondition_compute
bool m_distribute_precondition_compute
distribute precondition gradient compute.
Definition: kfac.hpp:251

lbann::KFAC::m_damping_err_params
std::vector< double > m_damping_err_params
Definition: kfac.hpp:210

lbann::KFAC::end_sync_weights_async
void end_sync_weights_async(model &model, lbann_comm *comm)

lbann::KFAC::on_forward_prop_end
void on_forward_prop_end(ExeContextType &context, model &model)

lbann::KFAC::get_type
std::string get_type() const final
Queries.

lbann::KFAC::on_backward_prop_end
void on_backward_prop_end(ExeContextType &context, model &model)

lbann::KFAC::start_send_recv_inverse_matrices
void start_send_recv_inverse_matrices(ExeContextType &context, lbann_comm *comm)

lbann::lbann_comm
Definition: comm.hpp:105

lbann::KFAC::m_damping_warmup_steps
size_t m_damping_warmup_steps
The number of warmup steps of the Tikhonov damping technique.
Definition: kfac.hpp:214

lbann::KFAC::m_print_time
bool m_print_time
Knobs to print information for debugging.
Definition: kfac.hpp:220

lbann::data_coordinator
Definition: data_coordinator.hpp:65

lbann::KFAC::m_time_span_inverse_send_recv
int m_time_span_inverse_send_recv
Definition: kfac.hpp:273

lbann::KFAC::sync_weights_model
void sync_weights_model(model &model, lbann_comm *comm)
Data exchange functions to synchronize model and weights.

lbann::KFAC::m_learning_rate_factor_gru
double m_learning_rate_factor_gru
Definition: kfac.hpp:242

lbann::KFAC::prof_color
static constexpr const int prof_color
Definition: kfac.hpp:141

lbann::Device
constexpr El::Device Device
Definition: OperatorTraits.hpp:62

lbann::KFAC::m_time_kfac
int m_time_kfac
Definition: kfac.hpp:277

lbann::KFAC::m_inverse_strategy
kfac::kfac_inverse_strategy m_inverse_strategy
Assignment strategy for the model-parallel part.
Definition: kfac.hpp:236

lbann::KFAC
An implementation of the KFAC second-order optimization algorithm.
Definition: kfac.hpp:59

lbann::KFAC::do_batch_end_cbs
void do_batch_end_cbs(model &model)

lbann::KFAC::m_time_forward_pass
int m_time_forward_pass
Definition: kfac.hpp:276

lbann::KFAC::train_mini_batch
bool train_mini_batch(ExeContextType &c, model &model, data_coordinator &dc)
Train model on one step / mini-batch of an SGD forward pass.

lbann::KFAC::m_use_KFAC_epoch
std::vector< bool > m_use_KFAC_epoch
Definition: kfac.hpp:279

lbann::KFAC::m_update_interval_steps
size_t m_update_interval_steps
The number of steps for changing the update interval.
Definition: kfac.hpp:233

factory.hpp

lbann::model
Abstract base class for neural network models.
Definition: model.hpp:83

lbann::KFAC::m_learning_rate_factor
double m_learning_rate_factor
Factors to be multiplied to the learning rate.
Definition: kfac.hpp:242

lbann::KFAC::m_time_backward_pass
int m_time_backward_pass
Definition: kfac.hpp:277

lbann::KFAC::m_use_pi
bool m_use_pi
Whether to use the pi constant to adjust the damping constant.
Definition: kfac.hpp:224

lbann::kfac::KFACExecutionContext
Definition: kfac/execution_context.hpp:57

execution_context.hpp

lbann::KFAC::do_train_end_cbs
void do_train_end_cbs(model &model)

lbann::KFAC::m_print_matrix
bool m_print_matrix
Definition: kfac.hpp:220

lbann::execution_mode
execution_mode
Neural network execution mode.
Definition: base.hpp:229

lbann::KFAC::damping_0_default
static constexpr const double damping_0_default
The default parameters of a Tikhonov damping technique.
Definition: kfac.hpp:133

lbann::KFAC::m_use_eigen_decomposition
bool m_use_eigen_decomposition
Use eigenvalue decomposition for inverting the matrix.
Definition: kfac.hpp:262

lbann::KFAC::prof_sync
static constexpr const bool prof_sync
Parameters for prof_region_*.
Definition: kfac.hpp:140

lbann::KFAC::m_compute_interval
size_t m_compute_interval
KFAC Compute interval.
Definition: kfac.hpp:248

cloneable.hpp

lbann::ExecutionContext
Definition: execution_context.hpp:47

lbann::KFAC::m_enable_copy_errors
bool m_enable_copy_errors
copy errors to a temporary matrix to increase overlap of compute and communication.
Definition: kfac.hpp:255

lbann::KFAC::m_print_matrix_summary
bool m_print_matrix_summary
Definition: kfac.hpp:220

sgd_execution_context.hpp

lbann::kfac::kfac_inverse_strategy
kfac_inverse_strategy
Definition: kfac_util.hpp:58

lbann::KFAC::start_old_async_weights_model
void start_old_async_weights_model(model &model, lbann_comm *comm, ExeContextType &context)

lbann::KFAC::m_damping_act_params
std::vector< double > m_damping_act_params
Pairs of the initial and the target damping value. If only one value is specified, it will be used throughout training.
Definition: kfac.hpp:210

lbann::KFAC::m_enable_copy_activations
bool m_enable_copy_activations
copy activations to a temporary matrix to increase overlap of compute and communication.
Definition: kfac.hpp:259

lbann::KFAC::do_epoch_end_cbs
void do_epoch_end_cbs(model &model)

lbann::KFAC::m_time_span_precond_comm
int m_time_span_precond_comm
Definition: kfac.hpp:276

lbann::KFAC::m_inverse_matrices_size
El::Matrix< double, El::Device::CPU > m_inverse_matrices_size
Definition: kfac.hpp:264

make_abstract.hpp

lbann::KFAC::train
void train(ExeContextType &c, model &model, data_coordinator &dc, TermCriteriaType const &term)
Train a model using KFAC.

lbann::KFAC::m_disable_layers
std::vector< std::string > m_disable_layers
List of layers to be ignored by the callback.
Definition: kfac.hpp:239

lbann::KFAC::m_inverse_matrix_communication_reqs
std::vector< kfac::ReqT > m_inverse_matrix_communication_reqs
vector for async communication reqs.
Definition: kfac.hpp:269

training_algorithm.hpp

lbann::KFAC::m_damping_bn_err_params
std::vector< double > m_damping_bn_err_params
Definition: kfac.hpp:210

lbann::KFAC::damping_warmup_steps_default
static constexpr const size_t damping_warmup_steps_default
Definition: kfac.hpp:134

lbann::KFAC::~KFAC
~KFAC() noexcept=default

lbann::KFAC::do_batch_begin_cbs
void do_batch_begin_cbs(model &model)

lbann::KFAC::m_weight_matrices_buffer_size
int m_weight_matrices_buffer_size
Definition: kfac.hpp:266

lbann::KFAC::m_weights_communication_reqs
std::vector< kfac::ReqT > m_weights_communication_reqs
Definition: kfac.hpp:269

lbann::KFAC::m_stopping_criteria
std::unique_ptr< TermCriteriaType > m_stopping_criteria
The KFAC stopping criteria.
Definition: kfac.hpp:205

lbann::KFAC::m_damping_bn_act_params
std::vector< double > m_damping_bn_act_params
Definition: kfac.hpp:210

lbann::KFAC::end_send_recv_inverse_matrices
void end_send_recv_inverse_matrices(ExeContextType &context, lbann_comm *comm)

lbann::KFAC::do_get_new_execution_context
kfac::KFACExecutionContext * do_get_new_execution_context() const final
Covariant return-friendly implementation of get_new_execution_context().

lbann::KFAC::kronecker_decay_default
static constexpr const double kronecker_decay_default
The default parameters of the decay factor.
Definition: kfac.hpp:137

lbann::TrainingAlgorithm
Base class for LBANN training_algorithms.
Definition: training_algorithm.hpp:86

lbann::KFAC::start_sync_weights_async
void start_sync_weights_async(model &model, lbann_comm *comm)

lbann::KFAC::m_time_span_backward_comm_end
int m_time_span_backward_comm_end
Definition: kfac.hpp:275

data_coordinator.hpp

lbann::KFAC::m_has_kronecker_inverse
bool m_has_kronecker_inverse
Whether the inverses of the Kronecker factors are available.
Definition: kfac.hpp:245

lbann::KFAC::m_time_span_forward_comm
int m_time_span_forward_comm
Definition: kfac.hpp:274

lbann
Definition: callback_helpers.hpp:32

lbann::SGDTerminationCriteria
Base class for SGD stopping.
Definition: sgd_execution_context.hpp:135

lbann::KFAC::apply
void apply(ExecutionContext &context, model &m, data_coordinator &dc, execution_mode mode) final
Apply the training algorithm to refine model weights.