LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
checkpoint.hpp File Reference
Include dependency graph for checkpoint.hpp:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  lbann::callback::checkpoint
 Checkpoint at a given interval in a given directory. More...
 
struct  lbann::callback::checkpoint::header_t< _max_dir_len >
 

Namespaces

 lbann
 
 lbann::callback
 

Enumerations

enum  lbann::callback::callback_phase {
  lbann::callback::callback_phase::batch, lbann::callback::callback_phase::epoch, lbann::callback::callback_phase::validation, lbann::callback::callback_phase::inference,
  lbann::callback::callback_phase::invalid
}
 

Functions

std::string lbann::callback::get_trainer_checkpoint_dirname (const std::string &trainer_name, const std::string &dir)
 
std::string lbann::callback::get_last_shared_checkpoint_filename (const std::string &alg_name, const std::string &dir)
 
std::string lbann::callback::get_last_shared_checkpoint_filename (const std::string &trainer_name, const std::string &alg_name, const std::string &dir)
 
std::string lbann::callback::get_shared_checkpoint_dirname (const std::string &alg_name, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
 
std::string lbann::callback::get_shared_checkpoint_dirname (const std::string &trainer_name, const std::string &alg_name, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
 
std::string lbann::callback::get_last_distributed_checkpoint_filename (const std::string &alg_name, const std::string &dir)
 
std::string lbann::callback::get_last_distributed_checkpoint_filename (const std::string &trainer_name, const std::string &alg_name, const std::string &dir)
 
std::string lbann::callback::get_distributed_checkpoint_dirname (const std::string &alg_name, const int rank_in_trainer, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
 
std::string lbann::callback::get_distributed_checkpoint_dirname (const std::string &trainer_name, const std::string &alg_name, const int rank_in_trainer, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
 
bool lbann::callback::write_latest (std::string filename, visitor_hook hook, execution_mode mode, size_t epoch, size_t train)
 
bool lbann::callback::read_latest (std::string filename, visitor_hook *hook, execution_mode *mode, size_t *epochLast, size_t *trainLast)
 Reads the "latest" file and returns the epoch number and sample offset for the most recent checkpoint. More...
 
std::unique_ptr< callback_base > lbann::callback::build_checkpoint_callback_from_pbuf (const google::protobuf::Message &)