LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
lbann::callback::checkpoint Class Reference

Checkpoint at given interval in given directory. More...

#include <checkpoint.hpp>

Inheritance diagram for lbann::callback::checkpoint:
[legend]
Collaboration diagram for lbann::callback::checkpoint:
[legend]

Classes

struct  header_t
 

Public Member Functions

 checkpoint (std::string checkpoint_dir, std::string restart_dir, int checkpoint_epochs, int checkpoint_steps, int checkpoint_secs, std::string per_rank_dir, int ckpt_dist_epochs, int ckpt_dist_steps)
 Construct the checkpoint callback. More...
 
 checkpoint (const checkpoint &)=default
 
checkpoint & operator= (const checkpoint &)=default
 
checkpoint * copy () const override
 
void setup (model *m) override
 Called once to set up the callback on the model (after all layers are set up). More...
 
void setup (trainer *t) override
 Called once to set up the callback on the trainer. More...
 
void on_train_begin (model *m) override
 Called at the beginning of training. More...
 
void on_train_end (model *m) override
 Called at the end of training. More...
 
void on_epoch_begin (model *m) override
 Called at the beginning of each epoch. More...
 
void on_batch_begin (model *m) override
 Called at the beginning of a (mini-)batch. More...
 
void on_validation_begin (model *m) override
 Called at the beginning of validation. More...
 
void set_checkpoint_dir (const std::string &dir)
 
const std::string & get_checkpoint_dir ()
 
void set_restart_dir (const std::string &dir)
 
const std::string & get_restart_dir ()
 
void set_active_trainer (trainer *t)
 
trainer & get_active_trainer ()
 
void set_active_training_algorithm (TrainingAlgorithm *t)
 
TrainingAlgorithm & get_active_training_algorithm ()
 
void set_checkpoint_epochs (int epochs)
 
void set_checkpoint_steps (int steps)
 
void set_checkpoint_secs (EvalType secs)
 
void set_per_rank_dir (std::string dir)
 
const std::string & get_per_rank_dir ()
 
void set_ckpt_dist_epochs (int ckpt_dist_epochs)
 
void set_ckpt_dist_steps (int ckpt_dist_steps)
 
std::string get_shared_checkpoint_rootdir ()
 
std::string get_distributed_checkpoint_rootdir ()
 
bool need_checkpoint (model *m, callback_phase phase)
 
std::string find_latest_checkpoint (lbann_comm &comm, const std::string &trainer_name, const std::string &alg_name, visitor_hook &hook, execution_mode &mode, size_t &epoch, size_t &step, bool &shared)
 
bool open_latest_checkpoint (lbann_comm &comm, const std::string &task_label, const std::string &trainer_name, const std::string &alg_name, std::function< bool(persist &)> reload_shared_ckpt, std::function< bool(persist &)> reload_distributed_ckpt)
 
bool reload_model (model *m)
 
bool reload_trainer (trainer *t)
 
bool restart (model *m)
 
std::string name () const override
 Return this callback's name. More...
 
- Public Member Functions inherited from lbann::callback_base
 callback_base (int batch_interval=1)
 Initialize a callback with an optional batch interval. More...
 
 callback_base (const callback_base &)=default
 
virtual ~callback_base ()=default
 
virtual void on_setup_end (model *m)
 Called at the end of setup. More...
 
virtual void on_phase_end (model *m)
 Called at the end of every phase (multiple epochs) in a layer-wise model training. More...
 
virtual void on_epoch_end (model *m)
 Called immediately after the end of each epoch. More...
 
virtual void on_batch_end (model *m)
 Called immediately after the end of a (mini-)batch. More...
 
virtual void on_test_begin (model *m)
 Called at the beginning of testing. More...
 
virtual void on_test_end (model *m)
 Called immediately after the end of testing. More...
 
virtual void on_validation_end (model *m)
 Called immediately after the end of validation. More...
 
virtual void on_forward_prop_begin (model *m)
 Called when a model begins forward propagation. More...
 
virtual void on_forward_prop_begin (model *m, Layer *l)
 Called when a layer begins forward propagation. More...
 
virtual void on_forward_prop_end (model *m)
 Called when a model ends forward propagation. More...
 
virtual void on_forward_prop_end (model *m, Layer *l)
 Called when a layer ends forward propagation. More...
 
virtual void on_backward_prop_begin (model *m)
 Called when a model begins backward propagation. More...
 
virtual void on_backward_prop_begin (model *m, Layer *l)
 Called when a layer begins backward propagation. More...
 
virtual void on_backward_prop_end (model *m)
 Called when a model ends backward propagation. More...
 
virtual void on_backward_prop_end (model *m, Layer *l)
 Called when a layer ends backward propagation. More...
 
virtual void on_optimize_begin (model *m)
 Called when a model begins optimization. More...
 
virtual void on_optimize_begin (model *m, weights *w)
 Called when weights begin optimization. More...
 
virtual void on_optimize_end (model *m)
 Called when a model ends optimization. More...
 
virtual void on_optimize_end (model *m, weights *w)
 Called when weights end optimization. More...
 
virtual void on_batch_evaluate_begin (model *m)
 Called at the beginning of a (mini-)batch evaluation (validation / testing). More...
 
virtual void on_batch_evaluate_end (model *m)
 Called at the end of a (mini-)batch evaluation (validation / testing). More...
 
virtual void on_evaluate_forward_prop_begin (model *m)
 Called when a model begins forward propagation for evaluation (validation / testing). More...
 
virtual void on_evaluate_forward_prop_begin (model *m, Layer *l)
 Called when a layer begins forward propagation for evaluation (validation / testing). More...
 
virtual void on_evaluate_forward_prop_end (model *m)
 Called when a model ends forward propagation for evaluation (validation / testing). More...
 
virtual void on_evaluate_forward_prop_end (model *m, Layer *l)
 Called when a layer ends forward propagation for evaluation (validation / testing). More...
 
int get_batch_interval () const
 Return the batch interval. More...
 
virtual description get_description () const
 Human-readable description. More...
 
template<class Archive >
void serialize (Archive &ar)
 Store state to archive for checkpoint and restart. More...
 
void write_proto (lbann_data::Callback &proto) const
 Write a protobuf description of the callback. More...
 

Private Member Functions

void write_specific_proto (lbann_data::Callback &proto) const final
 
bool do_checkpoint (model *m, visitor_hook hook)
 
void do_distributed_checkpoint (lbann_comm &comm, trainer &t, model &m, visitor_hook hook, execution_mode mode, persist &p, size_t epoch, size_t step)
 
void do_shared_checkpoint (lbann_comm &comm, trainer &t, model &m, visitor_hook hook, execution_mode mode, persist &p, size_t epoch, size_t step)
 

Private Attributes

trainer * m_active_trainer
 
TrainingAlgorithm * m_active_training_algorithm
 
std::string m_checkpoint_dir
 
std::string m_restart_dir
 
int m_checkpoint_epochs
 
int m_checkpoint_steps
 
EvalType m_checkpoint_secs
 
std::string m_per_rank_dir
 
int m_ckpt_dist_epochs
 
int m_ckpt_dist_steps
 
EvalType m_checkpoint_last
 
bool m_checkpoint_dist
 
bool m_checkpoint_shared
 

Additional Inherited Members

- Protected Member Functions inherited from lbann::callback_base
std::string get_multi_trainer_path (const model &m, const std::string &root_dir)
 Build a standard directory hierarchy including trainer ID. More...
 
std::string get_multi_trainer_ec_model_path (const model &m, const std::string &root_dir)
 Build a standard directory hierarchy including trainer, execution context, and model information (in that order). More...
 
std::string get_multi_trainer_model_path (const model &m, const std::string &root_dir)
 Build a standard directory hierarchy including trainer and model information (in that order). More...
 
callback_base & operator= (const callback_base &)=default
 Copy-assignment operator. More...
 
- Protected Attributes inherited from lbann::callback_base
int m_batch_interval
 Batch methods should run once every this many steps. More...
 

Detailed Description

Checkpoint at given interval in given directory.

Definition at line 52 of file checkpoint.hpp.

Constructor & Destructor Documentation

◆ checkpoint() [1/2]

lbann::callback::checkpoint::checkpoint ( std::string  checkpoint_dir,
std::string  restart_dir,
int  checkpoint_epochs,
int  checkpoint_steps,
int  checkpoint_secs,
std::string  per_rank_dir,
int  ckpt_dist_epochs,
int  ckpt_dist_steps 
)
inline

Construct the checkpoint callback.

It may be beneficial to take the distributed checkpoints at a higher tempo than the shared checkpoints because they are less expensive.

Parameters
checkpoint_dir  directory to save checkpoint files
restart_dir  directory to find checkpoint files
checkpoint_epochs  interval to checkpoint
checkpoint_steps  interval to checkpoint
checkpoint_secs  interval to checkpoint
per_rank_dir  The directory into which to dump distributed checkpoints
ckpt_dist_epochs  The frequency of distributed checkpoints in epochs
ckpt_dist_steps  The frequency of distributed checkpoints in steps

Definition at line 71 of file checkpoint.hpp.

◆ checkpoint() [2/2]

lbann::callback::checkpoint::checkpoint ( const checkpoint & )
default

Member Function Documentation

◆ copy()

checkpoint* lbann::callback::checkpoint::copy ( ) const
inline override virtual

Implements lbann::callback_base.

Definition at line 93 of file checkpoint.hpp.

◆ do_checkpoint()

bool lbann::callback::checkpoint::do_checkpoint ( model * m,
visitor_hook  hook 
)
private

◆ do_distributed_checkpoint()

void lbann::callback::checkpoint::do_distributed_checkpoint ( lbann_comm & comm,
trainer & t,
model & m,
visitor_hook  hook,
execution_mode  mode,
persist & p,
size_t  epoch,
size_t  step 
)
private

◆ do_shared_checkpoint()

void lbann::callback::checkpoint::do_shared_checkpoint ( lbann_comm & comm,
trainer & t,
model & m,
visitor_hook  hook,
execution_mode  mode,
persist & p,
size_t  epoch,
size_t  step 
)
private

◆ find_latest_checkpoint()

std::string lbann::callback::checkpoint::find_latest_checkpoint ( lbann_comm & comm,
const std::string &  trainer_name,
const std::string &  alg_name,
visitor_hook & hook,
execution_mode & mode,
size_t &  epoch,
size_t &  step,
bool &  shared 
)

◆ get_active_trainer()

trainer& lbann::callback::checkpoint::get_active_trainer ( )

◆ get_active_training_algorithm()

TrainingAlgorithm& lbann::callback::checkpoint::get_active_training_algorithm ( )

◆ get_checkpoint_dir()

const std::string& lbann::callback::checkpoint::get_checkpoint_dir ( )
inline

Definition at line 107 of file checkpoint.hpp.

◆ get_distributed_checkpoint_rootdir()

std::string lbann::callback::checkpoint::get_distributed_checkpoint_rootdir ( )
inline
Todo:
BVE FIXME: this looks wrong; I think that the order should be reversed.

Definition at line 163 of file checkpoint.hpp.

◆ get_per_rank_dir()

const std::string& lbann::callback::checkpoint::get_per_rank_dir ( )
inline

Definition at line 144 of file checkpoint.hpp.

◆ get_restart_dir()

const std::string& lbann::callback::checkpoint::get_restart_dir ( )
inline

Definition at line 111 of file checkpoint.hpp.

◆ get_shared_checkpoint_rootdir()

std::string lbann::callback::checkpoint::get_shared_checkpoint_rootdir ( )
inline

Definition at line 156 of file checkpoint.hpp.

◆ name()

std::string lbann::callback::checkpoint::name ( ) const
inline override virtual

Return this callback's name.

Implements lbann::callback_base.

Definition at line 192 of file checkpoint.hpp.

◆ need_checkpoint()

bool lbann::callback::checkpoint::need_checkpoint ( model * m,
callback_phase  phase 
)

◆ on_batch_begin()

void lbann::callback::checkpoint::on_batch_begin ( model * m)
override virtual

Called at the beginning of a (mini-)batch.

Reimplemented from lbann::callback_base.

◆ on_epoch_begin()

void lbann::callback::checkpoint::on_epoch_begin ( model * m)
override virtual

Called at the beginning of each epoch.

Reimplemented from lbann::callback_base.

◆ on_train_begin()

void lbann::callback::checkpoint::on_train_begin ( model * m)
override virtual

Called at the beginning of training.

Reimplemented from lbann::callback_base.

◆ on_train_end()

void lbann::callback::checkpoint::on_train_end ( model * m)
override virtual

Called at the end of training.

Reimplemented from lbann::callback_base.

◆ on_validation_begin()

void lbann::callback::checkpoint::on_validation_begin ( model * m)
override virtual

Called at the beginning of validation.

Reimplemented from lbann::callback_base.

◆ open_latest_checkpoint()

bool lbann::callback::checkpoint::open_latest_checkpoint ( lbann_comm & comm,
const std::string &  task_label,
const std::string &  trainer_name,
const std::string &  alg_name,
std::function< bool(persist &)>  reload_shared_ckpt,
std::function< bool(persist &)>  reload_distributed_ckpt 
)

◆ operator=()

checkpoint& lbann::callback::checkpoint::operator= ( const checkpoint & )
default

◆ reload_model()

bool lbann::callback::checkpoint::reload_model ( model * m)

◆ reload_trainer()

bool lbann::callback::checkpoint::reload_trainer ( trainer * t)

◆ restart()

bool lbann::callback::checkpoint::restart ( model * m)

◆ set_active_trainer()

void lbann::callback::checkpoint::set_active_trainer ( trainer * t)
inline

Definition at line 122 of file checkpoint.hpp.

◆ set_active_training_algorithm()

void lbann::callback::checkpoint::set_active_training_algorithm ( TrainingAlgorithm * t)
inline

Definition at line 126 of file checkpoint.hpp.

◆ set_checkpoint_dir()

void lbann::callback::checkpoint::set_checkpoint_dir ( const std::string &  dir)
inline

Definition at line 102 of file checkpoint.hpp.

◆ set_checkpoint_epochs()

void lbann::callback::checkpoint::set_checkpoint_epochs ( int  epochs)
inline

Definition at line 133 of file checkpoint.hpp.

◆ set_checkpoint_secs()

void lbann::callback::checkpoint::set_checkpoint_secs ( EvalType  secs)
inline

Definition at line 140 of file checkpoint.hpp.

◆ set_checkpoint_steps()

void lbann::callback::checkpoint::set_checkpoint_steps ( int  steps)
inline

Definition at line 138 of file checkpoint.hpp.

◆ set_ckpt_dist_epochs()

void lbann::callback::checkpoint::set_ckpt_dist_epochs ( int  ckpt_dist_epochs)
inline

Definition at line 146 of file checkpoint.hpp.

◆ set_ckpt_dist_steps()

void lbann::callback::checkpoint::set_ckpt_dist_steps ( int  ckpt_dist_steps)
inline

Definition at line 151 of file checkpoint.hpp.

◆ set_per_rank_dir()

void lbann::callback::checkpoint::set_per_rank_dir ( std::string  dir)
inline

Definition at line 142 of file checkpoint.hpp.

◆ set_restart_dir()

void lbann::callback::checkpoint::set_restart_dir ( const std::string &  dir)
inline

Definition at line 109 of file checkpoint.hpp.

◆ setup() [1/2]

void lbann::callback::checkpoint::setup ( model * m)
override virtual

Called once to set up the callback on the model (after all layers are set up).

Reimplemented from lbann::callback_base.

◆ setup() [2/2]

void lbann::callback::checkpoint::setup ( trainer * t)
override virtual

Called once to set up the callback on the trainer.

Reimplemented from lbann::callback_base.

◆ write_specific_proto()

void lbann::callback::checkpoint::write_specific_proto ( lbann_data::Callback &  proto) const
final private virtual

Add callback-specific data to prototext.

Implements lbann::callback_base.

Member Data Documentation

◆ m_active_trainer

trainer* lbann::callback::checkpoint::m_active_trainer
private

Definition at line 217 of file checkpoint.hpp.

◆ m_active_training_algorithm

TrainingAlgorithm* lbann::callback::checkpoint::m_active_training_algorithm
private

Definition at line 218 of file checkpoint.hpp.

◆ m_checkpoint_dir

std::string lbann::callback::checkpoint::m_checkpoint_dir
private

Definition at line 219 of file checkpoint.hpp.

◆ m_checkpoint_dist

bool lbann::callback::checkpoint::m_checkpoint_dist
private

Definition at line 230 of file checkpoint.hpp.

◆ m_checkpoint_epochs

int lbann::callback::checkpoint::m_checkpoint_epochs
private

Definition at line 223 of file checkpoint.hpp.

◆ m_checkpoint_last

EvalType lbann::callback::checkpoint::m_checkpoint_last
private

Definition at line 229 of file checkpoint.hpp.

◆ m_checkpoint_secs

EvalType lbann::callback::checkpoint::m_checkpoint_secs
private

Definition at line 225 of file checkpoint.hpp.

◆ m_checkpoint_shared

bool lbann::callback::checkpoint::m_checkpoint_shared
private

Definition at line 231 of file checkpoint.hpp.

◆ m_checkpoint_steps

int lbann::callback::checkpoint::m_checkpoint_steps
private

Definition at line 224 of file checkpoint.hpp.

◆ m_ckpt_dist_epochs

int lbann::callback::checkpoint::m_ckpt_dist_epochs
private

Definition at line 227 of file checkpoint.hpp.

◆ m_ckpt_dist_steps

int lbann::callback::checkpoint::m_ckpt_dist_steps
private

Definition at line 228 of file checkpoint.hpp.

◆ m_per_rank_dir

std::string lbann::callback::checkpoint::m_per_rank_dir
private

Definition at line 226 of file checkpoint.hpp.

◆ m_restart_dir

std::string lbann::callback::checkpoint::m_restart_dir
private

Definition at line 222 of file checkpoint.hpp.


The documentation for this class was generated from the following file: