Abstract base class for neural network models. More...

#include <model.hpp>

Collaboration diagram for lbann::model:

Public Member Functions
	model (lbann_comm *comm, std::unique_ptr< objective_function > obj_fn, std::unique_ptr< lbann_data::Optimizer > default_optimizer_msg=nullptr)

	model (const model &other)

model &	operator= (const model &other)

	~model ()=default

void	copy_trained_weights_from (std::vector< weights *> &w)
	Copy trained weights from input parameter w. More...

template<typename TensorDataType >
std::unique_ptr< optimizer >	create_optimizer () const
	Construct an instance of the default optimizer. More...

void	allow_background_io_activity (bool enable) noexcept
	Set a flag that can be used to enable / disable the background I/O activities. More...

bool	background_io_activity_allowed () const noexcept
	Are background I/O activities enabled by the input layers. More...

void	setup (size_t max_mini_batch_size, const std::vector< El::Grid *> &grids, bool force=false)

std::vector< observer_ptr< callback_base > >	get_callbacks ()
	Get the list of callbacks for the model. More...

std::vector< std::shared_ptr< callback_base > > &	get_callbacks_with_ownership () noexcept

bool	has_valid_execution_context () const noexcept

ExecutionContext const &	get_execution_context () const

ExecutionContext &	get_execution_context ()

void	reset_mode (ExecutionContext &context, execution_mode mode)
	Reset model pointer and execution mode. More...

void	reset_epoch_statistics (execution_mode mode)
	Reset model statistics for an epoch. More...

void	forward_prop (execution_mode mode)
	Forward propagation step. More...

void	backward_prop (bool compute_weight_grads_only=true)
	Backward propagation step. More...

void	evaluate_metrics (execution_mode mode, size_t current_mini_batch_size)

void	clear_gradients ()
	Clear each optimizer's gradient. More...

void	update_weights ()
	Update weights step. More...

bool	update_layers ()
	Update layers step. More...

void	reconcile_weight_values ()
	Reconcile weight values. More...

void	do_setup_end_cbs ()
	Execute callbacks at end of setup. More...

void	do_model_forward_prop_begin_cbs (execution_mode mode)
	Execute callbacks at start of model forward propagation. More...

void	do_model_forward_prop_end_cbs (execution_mode mode)
	Execute callbacks at end of model forward propagation. More...

void	do_layer_forward_prop_begin_cbs (execution_mode mode, Layer *l)
	Execute callbacks at start of layer forward propagation. More...

void	do_layer_forward_prop_end_cbs (execution_mode mode, Layer *l)
	Execute callbacks at end of layer forward propagation. More...

void	do_model_backward_prop_begin_cbs ()
	Execute callbacks at start of model backward propagation. More...

void	do_model_backward_prop_end_cbs ()
	Execute callbacks at end of model backward propagation. More...

void	do_layer_backward_prop_begin_cbs (Layer *l)
	Execute callbacks at start of layer backward propagation. More...

void	do_layer_backward_prop_end_cbs (Layer *l)
	Execute callbacks at end of layer backward propagation. More...

void	do_model_optimize_begin_cbs ()
	Execute callbacks at start of model optimization. More...

void	do_model_optimize_end_cbs ()
	Execute callbacks at end of model optimization. More...

void	do_weight_optimize_begin_cbs (weights *w)
	Execute callbacks at the start of weight optimization. More...

void	do_weight_optimize_end_cbs (weights *w)
	Execute callbacks at the end of weight optimization. More...

El::Int	get_max_mini_batch_size () const noexcept
	Return the maximum mini-batch size. More...

El::Int	get_current_mini_batch_size () const noexcept
	Return the current mini-batch size. More...

void	set_current_mini_batch_size (El::Int) noexcept
	Set the current mini-batch size. More...


void	set_name (std::string name)
	Metadata Accessors. More...

std::string	get_name () const noexcept
	Model instance name. More...

description	get_description () const
	Human-readable description. More...

lbann_comm *	get_comm () const noexcept
	Get the model's comm. More...


El::Int	get_num_layers () const noexcept
	Machine-learning object accessors. More...

Layer &	get_layer (El::Int pos)

Layer const &	get_layer (El::Int pos) const

std::vector< Layer * >	get_layers ()
	Return list of layers in model. More...

std::vector< Layer const * >	get_layers () const
	Return list of layers in model. More...

std::vector< weights * >	get_weights ()

std::vector< weights const * >	get_weights () const

std::vector< ViewingWeightsPtr >	get_weights_pointers () const

observer_ptr< objective_function const >	get_objective_function () const noexcept
	Mathematical function to be minimized during training. More...

observer_ptr< objective_function >	get_objective_function () noexcept

std::vector< metric * >	get_metrics ()
	Return the model's metrics. More...

std::vector< metric const * >	get_metrics () const

Model specification
void	add_layer (OwningLayerPtr &&l)
	Add layer to model. More...

void	add_weights (OwningWeightsPtr &&w)
	Add weights to model. More...

void	remove_weights (std::string const &name)
	Remove weights from model. More...

void	add_callback (std::shared_ptr< callback_base > cb)
	Register a new callback for the model. More...

void	add_metric (std::unique_ptr< metric > m)
	Register a new metric for the model. More...

void	insert_layer (OwningLayerPtr &&l, std::string const &parent_name)
	Insert layer in model. More...

void	remove_layer (std::string const &name)
	Remove layer from model. More...

void	replace_layer (OwningLayerPtr &&l, std::string const &name)
	Replace layer in model. More...

void	swap_layers (model &other)

void	swap_weights (model &other)

void	swap_metrics (model &other)

void	swap_objective_function (model &other)

Summarization
void	summarize_stats (lbann_summary &summarizer)
	Summarize statistics (e.g. timers, counters). More...

void	summarize_matrices (lbann_summary &summarizer)
	Summarize matrices (e.g. means). More...

Checkpointing and serialization.
template<class Archive >
void	serialize (Archive &ar)
	Serialization for checkpoint and restart with Cereal. More...

bool	save_to_checkpoint_shared (persist &p)
	Checkpoint model to given file descriptor, return number of bytes written. More...

bool	load_from_checkpoint_shared (persist &p)
	Restore model by reading checkpoint from given file descriptor, return number of bytes read. More...

bool	save_to_checkpoint_distributed (persist &p)

bool	load_from_checkpoint_distributed (persist &p)

void	write_proto (lbann_data::Model &proto)
	Write model to proto file. More...

void	save_model ()
	Saves the model explicitly if the save_model callback is present. More...


void	set_subgrid_communication_type (int type) noexcept
	Subgraph Parallelism Interface. More...

int	get_subgrid_communication_type () const noexcept

void	set_subgraph_num_parent_resources (int num_resources) noexcept

int	get_subgraph_num_parent_resources () const noexcept

void	set_subgrid_topology (bool type) noexcept

bool	get_subgrid_topology () const noexcept

void	enable_subgraph_parallelism () noexcept

bool	is_subgraph_parallelism_enabled () const noexcept

int	get_num_resources_non_branch_layers () const noexcept

int	get_num_resources_branch_layers () const noexcept

void	set_num_resources_non_branch_layers (int num) noexcept

void	set_num_resources_branch_layers (int num) noexcept

Private Member Functions
	model ()

void	add_evaluation_layers (std::unordered_set< Layer *> &layer_set, std::unordered_set< std::string > &layer_names)
	Insert evaluation layers where needed. More...

void	add_dummy_layers (std::unordered_set< std::string > &layer_names)
	Insert dummy layers after layers with too few children. More...

void	add_split_layers (std::unordered_set< std::string > &layer_names)
	Insert split layers after layers with too many children. More...

void	ensure_input_layers_first ()


void	reorder_layers (const std::vector< El::Int > &gather_indices)
	Setup-related implementation. More...

void	remap_pointers (const std::unordered_map< Layer *, ViewingLayerPtr > &layer_map, const std::unordered_map< weights *, ViewingWeightsPtr > &weights_map)
	Remap pointers. More...

void	setup_layer_topology ()
	Set up topology of layer graph. More...

void	setup_layer_execution_order ()
	Set up layer execution order. More...

void	setup_layer_grid_tags (const std::vector< El::Grid *> &grids)
	Set up grid tags for all layers. More...

void	setup_layers (size_t max_mini_batch_size, const std::vector< El::Grid *> &grids)
	Set up layers. More...

void	setup_weights ()
	Set up weights. More...

Subgraph parallelism implementation
void	setup_subgrids ()
	Setup sub grids for the sub graph parallelism. More...

void	get_subgrids_order (std::vector< int > &ranks_order, int num_branches)

int	get_max_subgraph_branches ()

void	check_subgraph_parallelism ()

void	setup_subgrid_layers_run_condition ()

void	get_parent_subgrid_tags (int layer_index)

void	get_subgraph_subgrids_ranks (std::vector< int > &parent_ranks, std::vector< int > &subgrid_ranks, int layer_index, int number_ranks_in_grid)

void	get_resources_for_spliting_point (std::vector< int > &parent_ranks, std::vector< int > &subgrid_ranks, int layer_index, int number_ranks_in_grid, int num_subgrids)

void	get_resources_for_merge_layers (std::set< int > &pooled_set, int child_index, int num_subgrids)

void	get_resources_for_input_layer (std::vector< int > &masterSubGrid, int num_subgrids)

void	setup_subcommunicators (const std::vector< El::Grid *> &grids)

Private Attributes
std::unordered_map< std::string, std::shared_ptr< El::Grid > >	grids

std::unordered_map< std::string, std::shared_ptr< El::mpi::Comm > >	subCommunicatorsSubgrids

std::unordered_map< std::string, std::unique_ptr< El::mpi::Group > >	grids_mpi_groups

observer_ptr< ExecutionContext >	m_execution_context

lbann_comm *	m_comm
	LBANN communicator. More...

int	vector_communication_subgraph = 0

int	subgraph_num_resources_parent = 0

bool	enable_subgraph_topology = false

bool	apply_subgraph_parallelism = false

int	num_resources_branch_layers

int	num_resources_non_branch_layers

std::string	m_name
	Model instance's name. More...

std::vector< OwningLayerPtr >	m_layers
	Tensor operations. More...

std::vector< OwningWeightsPtr >	m_weights
	Trainable parameters. More...

std::unique_ptr< lbann_data::Optimizer >	m_default_optimizer_msg

std::unique_ptr< objective_function >	m_objective_function
	Mathematical function to be minimized during training. More...

std::vector< std::unique_ptr< metric > >	m_metrics
	Numerical quantities to evaluate model performance. More...

std::vector< std::shared_ptr< callback_base > >	m_callbacks
	Current callbacks to process. More...

bool	m_background_io_allowed = true
	Flag that allows input layers to fetch data in the background. More...

bool	m_model_is_setup = false
	Is the model setup. More...

El::Int	m_max_mini_batch_size
	The maximum mini-batch size. More...

El::Int	m_current_mini_batch_size
	The current mini-batch size. More...

Detailed Description

Abstract base class for neural network models.

Definition at line 83 of file model.hpp.

Constructor & Destructor Documentation

◆ model() [1/3]

lbann::model::model	(	lbann_comm *	comm,
		std::unique_ptr< objective_function >	obj_fn,
		std::unique_ptr< lbann_data::Optimizer >	default_optimizer_msg = `nullptr`
	)

◆ model() [2/3]

lbann::model::model ( const model & other )

◆ ~model()

lbann::model::~model ( )

default

◆ model() [3/3]

lbann::model::model ( )

private

Member Function Documentation

◆ add_callback()

void lbann::model::add_callback ( std::shared_ptr< callback_base > cb )

Register a new callback for the model.

◆ add_dummy_layers()

void lbann::model::add_dummy_layers ( std::unordered_set< std::string > & layer_names )

private

Insert dummy layers after layers with too few children.

If a layer expects more child layers than it has, add dummy layers until it has enough children.

Parameters

layer_names Names of layers in model. Updated with any newly created layers.

◆ add_evaluation_layers()

void lbann::model::add_evaluation_layers	(	std::unordered_set< Layer *> &	layer_set,
		std::unordered_set< std::string > &	layer_names
	)

private

Insert evaluation layers where needed.

If a lbann::layer_term or lbann::layer_metric corresponds to a layer that is not an evaluation_layer, an evaluation layer is created and added to the model.

Parameters

layer_set	Layers in model. Updated with any newly created layers.
layer_names	Names of layers in model. Updated with any newly created layers.

◆ add_layer()

void lbann::model::add_layer ( OwningLayerPtr && l )

Add layer to model.

◆ add_metric()

void lbann::model::add_metric ( std::unique_ptr< metric > m )

Register a new metric for the model.

◆ add_split_layers()

void lbann::model::add_split_layers ( std::unordered_set< std::string > & layer_names )

private

Insert split layers after layers with too many children.

If a layer expects one child layer but has multiple, add a split layer to the model.

Parameters

layer_names Names of layers in model. Updated with any newly created layers.

◆ add_weights()

void lbann::model::add_weights ( OwningWeightsPtr && w )

Add weights to model.

Here is the caller graph for this function:

◆ allow_background_io_activity()

void lbann::model::allow_background_io_activity ( bool enable )

inline noexcept

Set a flag that can be used to enable / disable the background I/O activities.

Definition at line 681 of file model.hpp.

◆ background_io_activity_allowed()

bool lbann::model::background_io_activity_allowed ( ) const

inline noexcept

Are background I/O activities enabled by the input layers.

Definition at line 686 of file model.hpp.

◆ backward_prop()

void lbann::model::backward_prop ( bool compute_weight_grads_only = true )

Backward propagation step.

◆ check_subgraph_parallelism()

void lbann::model::check_subgraph_parallelism ( )

private

◆ clear_gradients()

void lbann::model::clear_gradients ( )

Clear each optimizer's gradient.

This must be called before training forward prop since layers set an optimizer flag during forward prop.

◆ copy_trained_weights_from()

void lbann::model::copy_trained_weights_from ( std::vector< weights *> & w )

Copy trained weights from input parameter w.

Only weight values are copied; pointers and layer structure remain in place. Weights to be copied are those with the same name.

◆ create_optimizer()

template<typename TensorDataType >

std::unique_ptr< optimizer > lbann::model::create_optimizer ( ) const

inline

Construct an instance of the default optimizer.

If there is no default optimizer, a null pointer is returned.

Definition at line 674 of file model.hpp.

◆ do_layer_backward_prop_begin_cbs()

void lbann::model::do_layer_backward_prop_begin_cbs ( Layer * l )

Execute callbacks at start of layer backward propagation.

◆ do_layer_backward_prop_end_cbs()

void lbann::model::do_layer_backward_prop_end_cbs ( Layer * l )

Execute callbacks at end of layer backward propagation.

◆ do_layer_forward_prop_begin_cbs()

void lbann::model::do_layer_forward_prop_begin_cbs	(	execution_mode	mode,
		Layer *	l
	)

Execute callbacks at start of layer forward propagation.

◆ do_layer_forward_prop_end_cbs()

void lbann::model::do_layer_forward_prop_end_cbs	(	execution_mode	mode,
		Layer *	l
	)

Execute callbacks at end of layer forward propagation.

◆ do_model_backward_prop_begin_cbs()

void lbann::model::do_model_backward_prop_begin_cbs ( )

Execute callbacks at start of model backward propagation.

◆ do_model_backward_prop_end_cbs()

void lbann::model::do_model_backward_prop_end_cbs ( )

Execute callbacks at end of model backward propagation.

◆ do_model_forward_prop_begin_cbs()

void lbann::model::do_model_forward_prop_begin_cbs ( execution_mode mode )

Execute callbacks at start of model forward propagation.

◆ do_model_forward_prop_end_cbs()

void lbann::model::do_model_forward_prop_end_cbs ( execution_mode mode )

Execute callbacks at end of model forward propagation.

◆ do_model_optimize_begin_cbs()

void lbann::model::do_model_optimize_begin_cbs ( )

Execute callbacks at start of model optimization.

◆ do_model_optimize_end_cbs()

void lbann::model::do_model_optimize_end_cbs ( )

Execute callbacks at end of model optimization.

◆ do_setup_end_cbs()

void lbann::model::do_setup_end_cbs ( )

Execute callbacks at end of setup.

◆ do_weight_optimize_begin_cbs()

void lbann::model::do_weight_optimize_begin_cbs ( weights * w )

Execute callbacks at the start of weight optimization.

◆ do_weight_optimize_end_cbs()

void lbann::model::do_weight_optimize_end_cbs ( weights * w )

Execute callbacks at the end of weight optimization.

◆ enable_subgraph_parallelism()

void lbann::model::enable_subgraph_parallelism ( )

inline noexcept

Definition at line 721 of file model.hpp.

◆ ensure_input_layers_first()

void lbann::model::ensure_input_layers_first ( )

private

◆ evaluate_metrics()

void lbann::model::evaluate_metrics	(	execution_mode	mode,
		size_t	current_mini_batch_size
	)

Evaluate any metrics in the model.

◆ forward_prop()

void lbann::model::forward_prop ( execution_mode mode )

Forward propagation step.

Here is the caller graph for this function:

◆ get_callbacks()

std::vector< observer_ptr< callback_base > > lbann::model::get_callbacks ( )

inline

Get the list of callbacks for the model.

Definition at line 636 of file model.hpp.

◆ get_callbacks_with_ownership()

std::vector< std::shared_ptr< callback_base > > & lbann::model::get_callbacks_with_ownership ( )

inline noexcept

Definition at line 647 of file model.hpp.

◆ get_comm()

lbann_comm * lbann::model::get_comm ( ) const

inline noexcept

Get the model's comm.

Definition at line 652 of file model.hpp.

Here is the caller graph for this function:

◆ get_current_mini_batch_size()

El::Int lbann::model::get_current_mini_batch_size ( ) const

inline noexcept

Return the current mini-batch size.

Definition at line 756 of file model.hpp.

◆ get_description()

description lbann::model::get_description ( ) const

Human-readable description.

◆ get_execution_context() [1/2]

ExecutionContext const & lbann::model::get_execution_context ( ) const

inline

Grab the training context of the model.

Definition at line 659 of file model.hpp.

Here is the caller graph for this function:

◆ get_execution_context() [2/2]

ExecutionContext & lbann::model::get_execution_context ( )

inline

Grab the training context of the model.

Definition at line 667 of file model.hpp.

Here is the call graph for this function:

◆ get_layer() [1/2]

Layer& lbann::model::get_layer ( El::Int pos )

Parameters

pos	Position in model's list of layers.

Here is the caller graph for this function:

◆ get_layer() [2/2]

Layer const& lbann::model::get_layer ( El::Int pos ) const

Parameters

pos	Position in model's list of layers.

◆ get_layers() [1/2]

std::vector<Layer*> lbann::model::get_layers ( )

Return list of layers in model.

The list is in execution order for forward propagation.

Here is the caller graph for this function:

◆ get_layers() [2/2]

std::vector<Layer const*> lbann::model::get_layers ( ) const

Return list of layers in model.

The list is in execution order for forward propagation.

◆ get_max_mini_batch_size()

El::Int lbann::model::get_max_mini_batch_size ( ) const

inline noexcept

Return the maximum mini-batch size.

Definition at line 751 of file model.hpp.

◆ get_max_subgraph_branches()

int lbann::model::get_max_subgraph_branches ( )

private

◆ get_metrics() [1/2]

std::vector<metric*> lbann::model::get_metrics ( )

Return the model's metrics.

◆ get_metrics() [2/2]

std::vector<metric const*> lbann::model::get_metrics ( ) const

◆ get_name()

std::string lbann::model::get_name ( ) const

inline noexcept

Model instance name.

Each model in a trainer should have a unique, and preferably human-readable, name.

Definition at line 623 of file model.hpp.

◆ get_num_layers()

El::Int lbann::model::get_num_layers ( ) const

noexcept

Machine-learning object accessors.

Size of model's list of layers.

Here is the caller graph for this function:

◆ get_num_resources_branch_layers()

int lbann::model::get_num_resources_branch_layers ( ) const

inline noexcept

Definition at line 736 of file model.hpp.

◆ get_num_resources_non_branch_layers()

int lbann::model::get_num_resources_non_branch_layers ( ) const

inline noexcept

Definition at line 731 of file model.hpp.

◆ get_objective_function() [1/2]

observer_ptr< objective_function const > lbann::model::get_objective_function ( ) const

inline noexcept

Mathematical function to be minimized during training.

Definition at line 631 of file model.hpp.

◆ get_objective_function() [2/2]

observer_ptr< objective_function > lbann::model::get_objective_function ( )

inline noexcept

Definition at line 625 of file model.hpp.

◆ get_parent_subgrid_tags()

void lbann::model::get_parent_subgrid_tags ( int layer_index )

private

◆ get_resources_for_input_layer()

void lbann::model::get_resources_for_input_layer	(	std::vector< int > &	masterSubGrid,
		int	num_subgrids
	)

private

◆ get_resources_for_merge_layers()

void lbann::model::get_resources_for_merge_layers	(	std::set< int > &	pooled_set,
		int	child_index,
		int	num_subgrids
	)

private

◆ get_resources_for_spliting_point()

void lbann::model::get_resources_for_spliting_point	(	std::vector< int > &	parent_ranks,
		std::vector< int > &	subgrid_ranks,
		int	layer_index,
		int	number_ranks_in_grid,
		int	num_subgrids
	)

private

◆ get_subgraph_num_parent_resources()

int lbann::model::get_subgraph_num_parent_resources ( ) const

inline noexcept

Definition at line 706 of file model.hpp.

◆ get_subgraph_subgrids_ranks()

void lbann::model::get_subgraph_subgrids_ranks	(	std::vector< int > &	parent_ranks,
		std::vector< int > &	subgrid_ranks,
		int	layer_index,
		int	number_ranks_in_grid
	)

private

◆ get_subgrid_communication_type()

int lbann::model::get_subgrid_communication_type ( ) const

inline noexcept

Definition at line 696 of file model.hpp.

◆ get_subgrid_topology()

bool lbann::model::get_subgrid_topology ( ) const

inline noexcept

Definition at line 716 of file model.hpp.

◆ get_subgrids_order()

void lbann::model::get_subgrids_order	(	std::vector< int > &	ranks_order,
		int	num_branches
	)

private

◆ get_weights() [1/2]

std::vector<weights*> lbann::model::get_weights ( )

Here is the caller graph for this function:

◆ get_weights() [2/2]

std::vector<weights const*> lbann::model::get_weights ( ) const

◆ get_weights_pointers()

std::vector<ViewingWeightsPtr> lbann::model::get_weights_pointers ( ) const

◆ has_valid_execution_context()

bool lbann::model::has_valid_execution_context ( ) const

inline noexcept

Check to see if there is a valid training context for the model.

Definition at line 654 of file model.hpp.

◆ insert_layer()

void lbann::model::insert_layer	(	OwningLayerPtr &&	l,
		std::string const &	parent_name
	)

Insert layer in model.

◆ is_subgraph_parallelism_enabled()

bool lbann::model::is_subgraph_parallelism_enabled ( ) const

inline noexcept

Definition at line 726 of file model.hpp.

◆ load_from_checkpoint_distributed()

bool lbann::model::load_from_checkpoint_distributed ( persist & p )

◆ load_from_checkpoint_shared()

bool lbann::model::load_from_checkpoint_shared ( persist & p )

Restore model by reading checkpoint from given file descriptor, return number of bytes read.

◆ operator=()

model& lbann::model::operator= ( const model & other )

◆ reconcile_weight_values()

void lbann::model::reconcile_weight_values ( )

Reconcile weight values.

If weight values are duplicated across multiple processes, they are set to the average across the processes.

◆ remap_pointers()

void lbann::model::remap_pointers	(	const std::unordered_map< Layer *, ViewingLayerPtr > &	layer_map,
		const std::unordered_map< weights *, ViewingWeightsPtr > &	weights_map
	)

private

Remap pointers.

Layer and weights pointers are remapped using the provided maps. If a pointer is not a key in the corresponding map, the pointer is not changed.

◆ remove_layer()

void lbann::model::remove_layer ( std::string const & name )

Remove layer from model.

◆ remove_weights()

void lbann::model::remove_weights ( std::string const & name )

Remove weights from model.

◆ reorder_layers()

void lbann::model::reorder_layers ( const std::vector< El::Int > & gather_indices )

private

Setup-related implementation.

Reorder layer list with a gather.

The new layer list is the same length as gather_indices and its entries are given by

$\text{new\_list}[i] = \text{old\_list}[\text{gather\_indices}[i]]$

Since entries in the layer list must be unique, this will fail if gather_indices has any repeated entries.

◆ replace_layer()

void lbann::model::replace_layer	(	OwningLayerPtr &&	l,
		std::string const &	name
	)

Replace layer in model.

◆ reset_epoch_statistics()

void lbann::model::reset_epoch_statistics ( execution_mode mode )

Reset model statistics for an epoch.

◆ reset_mode()

void lbann::model::reset_mode	(	ExecutionContext &	context,
		execution_mode	mode
	)

Reset model pointer and execution mode.

◆ save_model()

void lbann::model::save_model ( )

Saves the model explicitly if the save_model callback is present.

Deprecated: This function both holds on to the notion that models support callbacks (the majority of those in the current iteration of callbacks should be thought of as extensions to training algorithms rather than extensions of models) and is only used by the "cycgan" and "aecycgan" drivers, which themselves are not well-supported.

◆ save_to_checkpoint_distributed()

bool lbann::model::save_to_checkpoint_distributed ( persist & p )

◆ save_to_checkpoint_shared()

bool lbann::model::save_to_checkpoint_shared ( persist & p )

Checkpoint model to given file descriptor, return number of bytes written.

◆ serialize()

template<class Archive >

void lbann::model::serialize ( Archive & ar )

Serialization for checkpoint and restart with Cereal.

◆ set_current_mini_batch_size()

void lbann::model::set_current_mini_batch_size ( El::Int mini_batch_size )

inline noexcept

Set the current mini-batch size.

Definition at line 761 of file model.hpp.

◆ set_name()

void lbann::model::set_name ( std::string name )

Metadata Accessors.

Model instance name.

Each model in a trainer should have a unique, and preferably human-readable, name.

Here is the caller graph for this function:

◆ set_num_resources_branch_layers()

void lbann::model::set_num_resources_branch_layers ( int num )

inline noexcept

Definition at line 746 of file model.hpp.

◆ set_num_resources_non_branch_layers()

void lbann::model::set_num_resources_non_branch_layers ( int num )

inline noexcept

Definition at line 741 of file model.hpp.

◆ set_subgraph_num_parent_resources()

void lbann::model::set_subgraph_num_parent_resources ( int num_resources )

inline noexcept

Definition at line 701 of file model.hpp.

◆ set_subgrid_communication_type()

void lbann::model::set_subgrid_communication_type ( int type )

inline noexcept

Subgraph Parallelism Interface.

Definition at line 691 of file model.hpp.

◆ set_subgrid_topology()

void lbann::model::set_subgrid_topology ( bool type )

inline noexcept

Definition at line 711 of file model.hpp.

◆ setup()

void lbann::model::setup	(	size_t	max_mini_batch_size,
		const std::vector< El::Grid *> &	grids,
		bool	force = `false`
	)

Must be called after model specification and before execution.

◆ setup_layer_execution_order()

void lbann::model::setup_layer_execution_order ( )

private

Set up layer execution order.

Called in setup function. A topological sort is applied to the layer list so that we can traverse the directed acyclic graph without violating dependencies.

◆ setup_layer_grid_tags()

void lbann::model::setup_layer_grid_tags ( const std::vector< El::Grid *> & grids )

private

Set up grid tags for all layers.

Called in setup function.

◆ setup_layer_topology()

void lbann::model::setup_layer_topology ( )

private

Set up topology of layer graph.

Called in setup function. All layers in the connected component of the layer graph are added to the model, and all parent/child relationships between layers are reciprocated.

◆ setup_layers()

void lbann::model::setup_layers	(	size_t	max_mini_batch_size,
		const std::vector< El::Grid *> &	grids
	)

private

Set up layers.

Called in setup function.

◆ setup_subcommunicators()

void lbann::model::setup_subcommunicators ( const std::vector< El::Grid *> & grids )

private

◆ setup_subgrid_layers_run_condition()

void lbann::model::setup_subgrid_layers_run_condition ( )

private

◆ setup_subgrids()

void lbann::model::setup_subgrids ( )

private

Setup sub grids for the sub graph parallelism.

◆ setup_weights()

void lbann::model::setup_weights ( )

private

Set up weights.

Called in setup function. All weights being used by layers or the objective function are added to the model and all unused weights are deleted.

◆ summarize_matrices()

void lbann::model::summarize_matrices ( lbann_summary & summarizer )

Summarize matrices (e.g. means).

These are called less frequently and can be more expensive.

◆ summarize_stats()

void lbann::model::summarize_stats ( lbann_summary & summarizer )

Summarize statistics (e.g. timers, counters).

These should be computable quickly.

◆ swap_layers()

void lbann::model::swap_layers ( model & other )

◆ swap_metrics()

void lbann::model::swap_metrics ( model & other )

◆ swap_objective_function()

void lbann::model::swap_objective_function ( model & other )

◆ swap_weights()

void lbann::model::swap_weights ( model & other )

◆ update_layers()

bool lbann::model::update_layers ( )

Update layers step.

◆ update_weights()

void lbann::model::update_weights ( )

Update weights step.

◆ write_proto()

void lbann::model::write_proto ( lbann_data::Model & proto )

Write model to proto file.

Member Data Documentation

◆ apply_subgraph_parallelism

bool lbann::model::apply_subgraph_parallelism = false

private

Definition at line 511 of file model.hpp.

◆ enable_subgraph_topology

bool lbann::model::enable_subgraph_topology = false

private

Definition at line 508 of file model.hpp.

◆ grids

std::unordered_map<std::string, std::shared_ptr<El::Grid> > lbann::model::grids

private

Definition at line 474 of file model.hpp.

◆ grids_mpi_groups

std::unordered_map<std::string, std::unique_ptr<El::mpi::Group> > lbann::model::grids_mpi_groups

private

Definition at line 481 of file model.hpp.

◆ m_background_io_allowed

bool lbann::model::m_background_io_allowed = true

private

Flag that allows input layers to fetch data in the background.

Definition at line 552 of file model.hpp.

◆ m_callbacks

std::vector<std::shared_ptr<callback_base> > lbann::model::m_callbacks

private

Current callbacks to process.

Definition at line 549 of file model.hpp.

◆ m_comm

lbann_comm* lbann::model::m_comm

private

LBANN communicator.

Definition at line 489 of file model.hpp.

◆ m_current_mini_batch_size

El::Int lbann::model::m_current_mini_batch_size

private

The current mini-batch size.

This should be set on each step by the execution algorithm using the value that the data coordinator gets from the data readers.

Number of samples being processed in the current step (iteration), used for correctly averaging gradients.

Definition at line 613 of file model.hpp.

◆ m_default_optimizer_msg

std::unique_ptr<lbann_data::Optimizer> lbann::model::m_default_optimizer_msg

private

If a layer needs to construct an optimizer during setup, it will make a copy of the default optimizer. This object is just used to create copies and is not actually used for optimization.

Definition at line 538 of file model.hpp.

◆ m_execution_context

observer_ptr<ExecutionContext> lbann::model::m_execution_context

private

Pointer to the execution context object used for training or evaluating this model.

Definition at line 486 of file model.hpp.

◆ m_layers

std::vector<OwningLayerPtr> lbann::model::m_layers

private

Tensor operations.

The list is in execution order for forward propagation.

Definition at line 528 of file model.hpp.

◆ m_max_mini_batch_size

El::Int lbann::model::m_max_mini_batch_size

private

The maximum mini-batch size.

This should be set before setup_distconv() is called.

Definition at line 602 of file model.hpp.

◆ m_metrics

std::vector<std::unique_ptr<metric> > lbann::model::m_metrics

private

Numerical quantities to evaluate model performance.

Does not affect training.

Definition at line 546 of file model.hpp.

◆ m_model_is_setup

bool lbann::model::m_model_is_setup = false

private

Whether the model has been set up.

Flag to indicate whether the setup function has been called.

Definition at line 557 of file model.hpp.

◆ m_name

std::string lbann::model::m_name

private

Model instance's name.

Each model in a trainer should have a unique, preferably human-readable, name.

Definition at line 523 of file model.hpp.

◆ m_objective_function

std::unique_ptr<objective_function> lbann::model::m_objective_function

private

Mathematical function to be minimized during training.

Definition at line 541 of file model.hpp.

◆ m_weights

std::vector<OwningWeightsPtr> lbann::model::m_weights

private

Trainable parameters.

Definition at line 531 of file model.hpp.

◆ num_resources_branch_layers

int lbann::model::num_resources_branch_layers

private

Definition at line 514 of file model.hpp.

◆ num_resources_non_branch_layers

int lbann::model::num_resources_non_branch_layers

private

Definition at line 517 of file model.hpp.

◆ subCommunicatorsSubgrids

std::unordered_map<std::string, std::shared_ptr<El::mpi::Comm> > lbann::model::subCommunicatorsSubgrids

private

Definition at line 477 of file model.hpp.

◆ subgraph_num_resources_parent

int lbann::model::subgraph_num_resources_parent = 0

private

Definition at line 503 of file model.hpp.

◆ vector_communication_subgraph

int lbann::model::vector_communication_subgraph = 0

private

Enable vector communication for subgraph parallelism.

Definition at line 499 of file model.hpp.

The documentation for this class was generated from the following file:

model.hpp

Public Member Functions

Private Member Functions

Private Attributes

Detailed Description

Constructor & Destructor Documentation

◆ model() [1/3]

◆ model() [2/3]

◆ ~model()

◆ model() [3/3]

Member Function Documentation

◆ add_callback()

◆ add_dummy_layers()

◆ add_evaluation_layers()

◆ add_layer()

◆ add_metric()

◆ add_split_layers()

◆ add_weights()

◆ allow_background_io_activity()

◆ background_io_activity_allowed()

◆ backward_prop()

◆ check_subgraph_parallelism()

◆ clear_gradients()

◆ copy_trained_weights_from()

◆ create_optimizer()

◆ do_layer_backward_prop_begin_cbs()

◆ do_layer_backward_prop_end_cbs()

◆ do_layer_forward_prop_begin_cbs()

◆ do_layer_forward_prop_end_cbs()

◆ do_model_backward_prop_begin_cbs()

◆ do_model_backward_prop_end_cbs()

◆ do_model_forward_prop_begin_cbs()

◆ do_model_forward_prop_end_cbs()

◆ do_model_optimize_begin_cbs()

◆ do_model_optimize_end_cbs()

◆ do_setup_end_cbs()

◆ do_weight_optimize_begin_cbs()

◆ do_weight_optimize_end_cbs()

◆ enable_subgraph_parallelism()

◆ ensure_input_layers_first()

◆ evaluate_metrics()

◆ forward_prop()

◆ get_callbacks()

◆ get_callbacks_with_ownership()

◆ get_comm()

◆ get_current_mini_batch_size()

◆ get_description()

◆ get_execution_context() [1/2]

◆ get_execution_context() [2/2]

◆ get_layer() [1/2]

◆ get_layer() [2/2]

◆ get_layers() [1/2]

◆ get_layers() [2/2]

◆ get_max_mini_batch_size()

◆ get_max_subgraph_branches()

◆ get_metrics() [1/2]

◆ get_metrics() [2/2]

◆ get_name()

◆ get_num_layers()

◆ get_num_resources_branch_layers()

◆ get_num_resources_non_branch_layers()

◆ get_objective_function() [1/2]

◆ get_objective_function() [2/2]

◆ get_parent_subgrid_tags()

◆ get_resources_for_input_layer()

◆ get_resources_for_merge_layers()

◆ get_resources_for_spliting_point()

◆ get_subgraph_num_parent_resources()

◆ get_subgraph_subgrids_ranks()

◆ get_subgrid_communication_type()

◆ get_subgrid_topology()

◆ get_subgrids_order()

◆ get_weights() [1/2]

◆ get_weights() [2/2]

◆ get_weights_pointers()

◆ has_valid_execution_context()

◆ insert_layer()

◆ is_subgraph_parallelism_enabled()

◆ load_from_checkpoint_distributed()

◆ load_from_checkpoint_shared()

◆ operator=()