|
LBANN
0.103.0
Livermore Big Artificial Neural Network Toolkit
|
Checkpoint at given interval in given directory. More...
#include <checkpoint.hpp>
Classes | |
| struct | header_t |
Public Member Functions | |
| checkpoint (std::string checkpoint_dir, std::string restart_dir, int checkpoint_epochs, int checkpoint_steps, int checkpoint_secs, std::string per_rank_dir, int ckpt_dist_epochs, int ckpt_dist_steps) | |
| Construct the checkpoint callback. More... | |
| checkpoint (const checkpoint &)=default | |
| checkpoint & | operator= (const checkpoint &)=default |
| checkpoint * | copy () const override |
| void | setup (model *m) override |
| Called once to set up the callback on the model (after all layers are set up). More... | |
| void | setup (trainer *t) override |
| Called once to set up the callback on the trainer. More... | |
| void | on_train_begin (model *m) override |
| Called at the beginning of training. More... | |
| void | on_train_end (model *m) override |
| Called at the end of training. More... | |
| void | on_epoch_begin (model *m) override |
| Called at the beginning of each epoch. More... | |
| void | on_batch_begin (model *m) override |
| Called at the beginning of a (mini-)batch. More... | |
| void | on_validation_begin (model *m) override |
| Called at the beginning of validation. More... | |
| void | set_checkpoint_dir (const std::string &dir) |
| const std::string & | get_checkpoint_dir () |
| void | set_restart_dir (const std::string &dir) |
| const std::string & | get_restart_dir () |
| void | set_active_trainer (trainer *t) |
| trainer & | get_active_trainer () |
| void | set_active_training_algorithm (TrainingAlgorithm *t) |
| TrainingAlgorithm & | get_active_training_algorithm () |
| void | set_checkpoint_epochs (int epochs) |
| void | set_checkpoint_steps (int steps) |
| void | set_checkpoint_secs (EvalType secs) |
| void | set_per_rank_dir (std::string dir) |
| const std::string & | get_per_rank_dir () |
| void | set_ckpt_dist_epochs (int ckpt_dist_epochs) |
| void | set_ckpt_dist_steps (int ckpt_dist_steps) |
| std::string | get_shared_checkpoint_rootdir () |
| std::string | get_distributed_checkpoint_rootdir () |
| bool | need_checkpoint (model *m, callback_phase phase) |
| std::string | find_latest_checkpoint (lbann_comm &comm, const std::string &trainer_name, const std::string &alg_name, visitor_hook &hook, execution_mode &mode, size_t &epoch, size_t &step, bool &shared) |
| bool | open_latest_checkpoint (lbann_comm &comm, const std::string &task_label, const std::string &trainer_name, const std::string &alg_name, std::function< bool(persist &)> reload_shared_ckpt, std::function< bool(persist &)> reload_distributed_ckpt) |
| bool | reload_model (model *m) |
| bool | reload_trainer (trainer *t) |
| bool | restart (model *m) |
| std::string | name () const override |
| Return this callback's name. More... | |
Public Member Functions inherited from lbann::callback_base | |
| callback_base (int batch_interval=1) | |
| Initialize a callback with an optional batch interval. More... | |
| callback_base (const callback_base &)=default | |
| virtual | ~callback_base ()=default |
| virtual void | on_setup_end (model *m) |
| Called at the end of setup. More... | |
| virtual void | on_phase_end (model *m) |
| Called at the end of every phase (multiple epochs) in a layer-wise model training. More... | |
| virtual void | on_epoch_end (model *m) |
| Called immediately after the end of each epoch. More... | |
| virtual void | on_batch_end (model *m) |
| Called immediately after the end of a (mini-)batch. More... | |
| virtual void | on_test_begin (model *m) |
| Called at the beginning of testing. More... | |
| virtual void | on_test_end (model *m) |
| Called immediately after the end of testing. More... | |
| virtual void | on_validation_end (model *m) |
| Called immediately after the end of validation. More... | |
| virtual void | on_forward_prop_begin (model *m) |
| Called when a model begins forward propagation. More... | |
| virtual void | on_forward_prop_begin (model *m, Layer *l) |
| Called when a layer begins forward propagation. More... | |
| virtual void | on_forward_prop_end (model *m) |
| Called when a model ends forward propagation. More... | |
| virtual void | on_forward_prop_end (model *m, Layer *l) |
| Called when a layer ends forward propagation. More... | |
| virtual void | on_backward_prop_begin (model *m) |
| Called when a model begins backward propagation. More... | |
| virtual void | on_backward_prop_begin (model *m, Layer *l) |
| Called when a layer begins backward propagation. More... | |
| virtual void | on_backward_prop_end (model *m) |
| Called when a model ends backward propagation. More... | |
| virtual void | on_backward_prop_end (model *m, Layer *l) |
| Called when a layer ends backward propagation. More... | |
| virtual void | on_optimize_begin (model *m) |
| Called when a model begins optimization. More... | |
| virtual void | on_optimize_begin (model *m, weights *w) |
| Called when weights begin optimization. More... | |
| virtual void | on_optimize_end (model *m) |
| Called when a model ends optimization. More... | |
| virtual void | on_optimize_end (model *m, weights *w) |
| Called when weights end optimization. More... | |
| virtual void | on_batch_evaluate_begin (model *m) |
| Called at the beginning of a (mini-)batch evaluation (validation / testing). More... | |
| virtual void | on_batch_evaluate_end (model *m) |
| Called at the end of a (mini-)batch evaluation (validation / testing). More... | |
| virtual void | on_evaluate_forward_prop_begin (model *m) |
| Called when a model begins forward propagation for evaluation (validation / testing). More... | |
| virtual void | on_evaluate_forward_prop_begin (model *m, Layer *l) |
| Called when a layer begins forward propagation for evaluation (validation / testing). More... | |
| virtual void | on_evaluate_forward_prop_end (model *m) |
| Called when a model ends forward propagation for evaluation (validation / testing). More... | |
| virtual void | on_evaluate_forward_prop_end (model *m, Layer *l) |
| Called when a layer ends forward propagation for evaluation (validation / testing). More... | |
| int | get_batch_interval () const |
| Return the batch interval. More... | |
| virtual description | get_description () const |
| Human-readable description. More... | |
| template<class Archive > | |
| void | serialize (Archive &ar) |
| Store state to archive for checkpoint and restart. More... | |
| void | write_proto (lbann_data::Callback &proto) const |
| Write a protobuf description of the callback. More... | |
Private Member Functions | |
| void | write_specific_proto (lbann_data::Callback &proto) const final |
| bool | do_checkpoint (model *m, visitor_hook hook) |
| void | do_distributed_checkpoint (lbann_comm &comm, trainer &t, model &m, visitor_hook hook, execution_mode mode, persist &p, size_t epoch, size_t step) |
| void | do_shared_checkpoint (lbann_comm &comm, trainer &t, model &m, visitor_hook hook, execution_mode mode, persist &p, size_t epoch, size_t step) |
Private Attributes | |
| trainer * | m_active_trainer |
| TrainingAlgorithm * | m_active_training_algorithm |
| std::string | m_checkpoint_dir |
| std::string | m_restart_dir |
| int | m_checkpoint_epochs |
| int | m_checkpoint_steps |
| EvalType | m_checkpoint_secs |
| std::string | m_per_rank_dir |
| int | m_ckpt_dist_epochs |
| int | m_ckpt_dist_steps |
| EvalType | m_checkpoint_last |
| bool | m_checkpoint_dist |
| bool | m_checkpoint_shared |
Additional Inherited Members | |
Protected Member Functions inherited from lbann::callback_base | |
| std::string | get_multi_trainer_path (const model &m, const std::string &root_dir) |
| Build a standard directory hierarchy including trainer ID. More... | |
| std::string | get_multi_trainer_ec_model_path (const model &m, const std::string &root_dir) |
| Build a standard directory hierarchy including trainer, execution context, and model information (in that order). More... | |
| std::string | get_multi_trainer_model_path (const model &m, const std::string &root_dir) |
| Build a standard directory hierarchy including trainer and model information (in that order). More... | |
| callback_base & | operator= (const callback_base &)=default |
| Copy-assignment operator. More... | |
Protected Attributes inherited from lbann::callback_base | |
| int | m_batch_interval |
| Batch methods should run once every this many steps. More... | |
Checkpoint at given interval in given directory.
Definition at line 52 of file checkpoint.hpp.
|
inline |
Construct the checkpoint callback.
It may be beneficial to take the distributed checkpoints at a higher frequency than the shared checkpoints because they are less expensive.
| checkpoint_dir | directory to save checkpoint files |
| restart_dir | directory to find checkpoint files |
| checkpoint_epochs | interval between checkpoints, in epochs |
| checkpoint_steps | interval between checkpoints, in steps |
| checkpoint_secs | interval between checkpoints, in seconds |
| per_rank_dir | The directory into which to dump distributed checkpoints |
| ckpt_dist_epochs | The frequency of distributed checkpoints in epochs |
| ckpt_dist_steps | The frequency of distributed checkpoints in steps |
Definition at line 71 of file checkpoint.hpp.
|
default |
|
inlineoverridevirtual |
Implements lbann::callback_base.
Definition at line 93 of file checkpoint.hpp.
|
private |
|
private |
|
private |
| std::string lbann::callback::checkpoint::find_latest_checkpoint | ( | lbann_comm & | comm, |
| const std::string & | trainer_name, | ||
| const std::string & | alg_name, | ||
| visitor_hook & | hook, | ||
| execution_mode & | mode, | ||
| size_t & | epoch, | ||
| size_t & | step, | ||
| bool & | shared | ||
| ) |
| trainer& lbann::callback::checkpoint::get_active_trainer | ( | ) |
| TrainingAlgorithm& lbann::callback::checkpoint::get_active_training_algorithm | ( | ) |
|
inline |
Definition at line 107 of file checkpoint.hpp.
|
inline |
Definition at line 163 of file checkpoint.hpp.
|
inline |
Definition at line 144 of file checkpoint.hpp.
|
inline |
Definition at line 111 of file checkpoint.hpp.
|
inline |
Definition at line 156 of file checkpoint.hpp.
|
inlineoverridevirtual |
Return this callback's name.
Implements lbann::callback_base.
Definition at line 192 of file checkpoint.hpp.
| bool lbann::callback::checkpoint::need_checkpoint | ( | model * | m, |
| callback_phase | phase | ||
| ) |
|
overridevirtual |
Called at the beginning of a (mini-)batch.
Reimplemented from lbann::callback_base.
|
overridevirtual |
Called at the beginning of each epoch.
Reimplemented from lbann::callback_base.
|
overridevirtual |
Called at the beginning of training.
Reimplemented from lbann::callback_base.
|
overridevirtual |
Called at the end of training.
Reimplemented from lbann::callback_base.
|
overridevirtual |
Called at the beginning of validation.
Reimplemented from lbann::callback_base.
| bool lbann::callback::checkpoint::open_latest_checkpoint | ( | lbann_comm & | comm, |
| const std::string & | task_label, | ||
| const std::string & | trainer_name, | ||
| const std::string & | alg_name, | ||
| std::function< bool(persist &)> | reload_shared_ckpt, | ||
| std::function< bool(persist &)> | reload_distributed_ckpt | ||
| ) |
|
default |
| bool lbann::callback::checkpoint::reload_model | ( | model * | m | ) |
| bool lbann::callback::checkpoint::reload_trainer | ( | trainer * | t | ) |
| bool lbann::callback::checkpoint::restart | ( | model * | m | ) |
|
inline |
Definition at line 122 of file checkpoint.hpp.
|
inline |
Definition at line 126 of file checkpoint.hpp.
|
inline |
Definition at line 102 of file checkpoint.hpp.
|
inline |
Definition at line 133 of file checkpoint.hpp.
|
inline |
Definition at line 140 of file checkpoint.hpp.
|
inline |
Definition at line 138 of file checkpoint.hpp.
|
inline |
Definition at line 146 of file checkpoint.hpp.
|
inline |
Definition at line 151 of file checkpoint.hpp.
|
inline |
Definition at line 142 of file checkpoint.hpp.
|
inline |
Definition at line 109 of file checkpoint.hpp.
|
overridevirtual |
Called once to set up the callback on the model (after all layers are set up).
Reimplemented from lbann::callback_base.
|
overridevirtual |
Called once to set up the callback on the trainer.
Reimplemented from lbann::callback_base.
|
finalprivatevirtual |
Add callback-specific data to prototext.
Implements lbann::callback_base.
|
private |
Definition at line 217 of file checkpoint.hpp.
|
private |
Definition at line 218 of file checkpoint.hpp.
|
private |
Definition at line 219 of file checkpoint.hpp.
|
private |
Definition at line 230 of file checkpoint.hpp.
|
private |
Definition at line 223 of file checkpoint.hpp.
|
private |
Definition at line 229 of file checkpoint.hpp.
|
private |
Definition at line 225 of file checkpoint.hpp.
|
private |
Definition at line 231 of file checkpoint.hpp.
|
private |
Definition at line 224 of file checkpoint.hpp.
|
private |
Definition at line 227 of file checkpoint.hpp.
|
private |
Definition at line 228 of file checkpoint.hpp.
|
private |
Definition at line 226 of file checkpoint.hpp.
|
private |
Definition at line 222 of file checkpoint.hpp.