27 #ifndef LBANN_MODELS_MODEL_HPP_INCLUDED 28 #define LBANN_MODELS_MODEL_HPP_INCLUDED 37 #include <onnx/onnx_pb.h> 38 #endif // LBANN_HAS_ONNX 45 #include "lbann/proto/optimizers.pb.h" 48 #include <unordered_map> 69 class TrainingAlgorithm;
74 class objective_function;
91 std::unique_ptr<objective_function> obj_fn,
92 std::unique_ptr<lbann_data::Optimizer> default_optimizer_msg =
nullptr);
// Sets the model's name. The string is taken by value.
104 void set_name(std::string name);
// Returns the model's name by value. The inline definition later in this
// file returns the m_name member.
110 std::string get_name()
const noexcept;
// Returns the layer count as an El::Int; const and noexcept.
123 El::Int get_num_layers()
const noexcept;
// Returns a mutable reference to the layer selected by `pos`.
// NOTE(review): behavior for an out-of-range `pos` is not visible here.
125 Layer& get_layer(El::Int pos);
// Const overload of get_layer: same lookup, read-only reference.
127 Layer const& get_layer(El::Int pos)
const;
// Returns a vector of raw Layer pointers, by value.
131 std::vector<Layer*> get_layers();
// Const overload: the returned pointers are to const Layer.
135 std::vector<Layer const*> get_layers()
const;
// Returns a vector of raw weights pointers, by value.
136 std::vector<weights*> get_weights();
// Const overload: the returned pointers are to const weights.
137 std::vector<weights const*> get_weights()
const;
// Returns ViewingWeightsPtr handles; the reference text at the end of this
// listing gives ViewingWeightsPtr as std::weak_ptr<weights>.
138 std::vector<ViewingWeightsPtr> get_weights_pointers()
const;
// Const accessor for the objective function. The return-type line is not
// present in this listing; the inline definition later in this file returns
// m_objective_function.get().
142 get_objective_function()
const noexcept;
// Returns a vector of raw metric pointers, by value.
147 std::vector<metric*> get_metrics();
// Const overload: the returned pointers are to const metric.
148 std::vector<metric const*> get_metrics()
const;
// serialize_to_onnx is declared only when LBANN_HAS_ONNX is defined and
// takes the onnx::ModelProto by mutable reference.
// remove_weights, declared after the #endif, takes the name of the weights
// object by const reference.
150 #ifdef LBANN_HAS_ONNX 152 void serialize_to_onnx(onnx::ModelProto& mp);
153 #endif // LBANN_HAS_ONNX 169 void remove_weights(std::string
const& name);
// Adds a callback; the shared_ptr is passed by value.
172 void add_callback(std::shared_ptr<callback_base> cb);
// Adds a metric; the unique_ptr is passed by value, so the caller gives up
// ownership of the metric.
175 void add_metric(std::unique_ptr<metric> m);
// Inserts layer `l` (an rvalue OwningLayerPtr) relative to the layer named
// `parent_name`.
// NOTE(review): exact placement relative to the parent is not visible in
// this declaration; confirm against the implementation.
178 void insert_layer(
OwningLayerPtr&& l, std::string
const& parent_name);
// Removes the layer identified by `name`.
181 void remove_layer(std::string
const& name);
// Swap operations between this model and `other`. Each takes `other` by
// mutable reference; none is declared noexcept.
186 void swap_layers(
model& other);
187 void swap_weights(
model& other);
188 void swap_metrics(
model& other);
189 void swap_objective_function(
model& other);
// Copies trained weights from the supplied vector of raw weights pointers.
// The vector is taken by non-const reference.
// NOTE(review): whether `w` itself is modified is not visible here.
198 void copy_trained_weights_from(std::vector<weights*>& w);
// Creates an optimizer for the given tensor data type. The inline
// definition later in this file builds it from m_default_optimizer_msg
// when that message is set.
204 template <
typename TensorDataType>
205 std::unique_ptr<optimizer> create_optimizer()
const;
// Sets the background-I/O flag. The inline definition later in this file
// stores `enable` in m_background_io_allowed.
210 void allow_background_io_activity(
bool enable) noexcept;
// Returns the current value of the background-I/O flag.
213 bool background_io_activity_allowed()
const noexcept;
221 void setup(
size_t max_mini_batch_size,
222 const std::vector<El::Grid*>& grids,
243 template <
class Archive>
// Checkpoint save/load entry points in "shared" and "distributed" variants.
// Each takes a persist object by mutable reference and returns bool.
// NOTE(review): the meaning of the bool result is not visible here;
// presumably success/failure, to be confirmed.
248 bool save_to_checkpoint_shared(
persist& p);
251 bool load_from_checkpoint_shared(
persist& p);
253 bool save_to_checkpoint_distributed(
persist& p);
254 bool load_from_checkpoint_distributed(
persist& p);
// Writes this model into the given lbann_data::Model protobuf message,
// which is passed by mutable reference.
257 void write_proto(lbann_data::Model& proto);
// Subgrid / subgraph-parallelism configuration accessors. All are noexcept;
// the inline definitions later in this file are plain reads and writes of
// the corresponding data members.
// Communication type: stored in vector_communication_subgraph.
275 void set_subgrid_communication_type(
int type) noexcept;
276 int get_subgrid_communication_type()
const noexcept;
// Parent resource count: stored in subgraph_num_resources_parent.
277 void set_subgraph_num_parent_resources(
int num_resources) noexcept;
278 int get_subgraph_num_parent_resources()
const noexcept;
// Topology flag: stored in enable_subgraph_topology.
279 void set_subgrid_topology(
bool type) noexcept;
280 bool get_subgrid_topology()
const noexcept;
// One-way switch: the definition sets apply_subgraph_parallelism to true.
// No matching "disable" function is declared in this listing.
281 void enable_subgraph_parallelism() noexcept;
282 bool is_subgraph_parallelism_enabled()
const noexcept;
// Resource counts for non-branch and branch layers.
283 int get_num_resources_non_branch_layers()
const noexcept;
284 int get_num_resources_branch_layers()
const noexcept;
285 void set_num_resources_non_branch_layers(
int num) noexcept;
286 void set_num_resources_branch_layers(
int num) noexcept;
// Reorders the layers according to `gather_indices`.
// NOTE(review): presumably new position i takes the layer at old index
// gather_indices[i]; confirm against the implementation.
303 void reorder_layers(
const std::vector<El::Int>& gather_indices);
// Tail of a declaration whose first line is not present in this listing.
// It takes two maps from raw Layer / weights pointers to the corresponding
// viewing (weak) pointers.
312 const std::unordered_map<Layer*, ViewingLayerPtr>& layer_map,
313 const std::unordered_map<weights*, ViewingWeightsPtr>& weights_map);
// Setup stages, each declared as a separate step.
// NOTE(review): the required call order is not visible in this listing.
321 void setup_layer_topology();
329 void setup_layer_execution_order();
// Takes the list of grids by const reference.
336 void setup_layer_grid_tags(
const std::vector<El::Grid*>& grids);
// Takes the maximum mini-batch size and the list of grids.
342 void setup_layers(
size_t max_mini_batch_size,
343 const std::vector<El::Grid*>& grids);
351 void setup_weights();
358 void setup_subgrids();
// Subgraph-parallelism helpers. Several parameter lists below are truncated
// in this listing (lines missing), so only the visible parameters are
// described.
// Fills `ranks_order` (output, by reference) given the number of branches.
360 void get_subgrids_order(std::vector<int>& ranks_order,
int num_branches);
361 int get_max_subgraph_branches();
362 void check_subgraph_parallelism();
363 void setup_subgrid_layers_run_condition();
// Operates on the layer identified by `layer_index`. Returns void, so any
// result must be stored elsewhere.
// NOTE(review): where the tags are stored is not visible here.
364 void get_parent_subgrid_tags(
int layer_index);
// Takes parent and subgrid rank vectors by mutable reference.
365 void get_subgraph_subgrids_ranks(std::vector<int>& parent_ranks,
366 std::vector<int>& subgrid_ranks,
368 int number_ranks_in_grid);
// Parameter list is truncated in this listing after number_ranks_in_grid.
369 void get_resources_for_spliting_point(std::vector<int>& parent_ranks,
370 std::vector<int>& subgrid_ranks,
372 int number_ranks_in_grid,
// Parameter list is truncated in this listing after pooled_set.
374 void get_resources_for_merge_layers(std::set<int>& pooled_set,
// Parameter list is truncated in this listing after masterSubGrid.
377 void get_resources_for_input_layer(std::vector<int>& masterSubGrid,
// Takes the list of grids by const reference.
379 void setup_subcommunicators(
const std::vector<El::Grid*>& grids);
// Returns observer pointers to the callbacks. The inline definition later
// in this file builds a new vector from m_callbacks on every call.
389 std::vector<observer_ptr<callback_base>> get_callbacks();
// Returns a mutable reference to a vector of shared_ptr callbacks.
391 std::vector<std::shared_ptr<callback_base>>&
392 get_callbacks_with_ownership() noexcept;
// True when an execution context is attached. The inline definition checks
// m_execution_context != nullptr.
395 bool has_valid_execution_context()
const noexcept;
// Runs backward propagation. `compute_weight_grads_only` defaults to true.
// NOTE(review): what is skipped when the flag is true is not visible here.
411 void backward_prop(
bool compute_weight_grads_only =
true);
// Evaluates metrics for the given execution mode and current mini-batch
// size.
413 void evaluate_metrics(
execution_mode mode,
size_t current_mini_batch_size);
419 void clear_gradients();
421 void update_weights();
// Returns bool.
// NOTE(review): the meaning of the result is not visible in this listing.
423 bool update_layers();
429 void reconcile_weight_values();
// Callback-dispatch hooks, named after the event they correspond to.
// NOTE(review): presumably each one invokes the matching hook on every
// registered callback; the implementations are not in this listing.
436 void do_setup_end_cbs();
446 void do_model_backward_prop_begin_cbs();
448 void do_model_backward_prop_end_cbs();
// Layer-level hooks take the affected layer as a raw pointer.
450 void do_layer_backward_prop_begin_cbs(
Layer* l);
452 void do_layer_backward_prop_end_cbs(
Layer* l);
454 void do_model_optimize_begin_cbs();
456 void do_model_optimize_end_cbs();
// Weight-level hooks take the affected weights object as a raw pointer.
458 void do_weight_optimize_begin_cbs(
weights* w);
460 void do_weight_optimize_end_cbs(
weights* w);
// Returns the maximum mini-batch size (the m_max_mini_batch_size member).
462 El::Int get_max_mini_batch_size()
const noexcept;
// Returns the current mini-batch size (the m_current_mini_batch_size
// member).
464 El::Int get_current_mini_batch_size()
const noexcept;
// Sets the current mini-batch size. The inline definition later in this
// file compares the new value against m_max_mini_batch_size before storing
// it.
466 void set_current_mini_batch_size(El::Int) noexcept;
// Befriends cereal::access.
469 friend cereal::access;
// String-keyed map of shared El::Grid objects.
474 std::unordered_map<std::string, std::shared_ptr<El::Grid>>
grids;
// The member names for the next two maps are not on these lines in this
// listing. The reference text at the end of the listing pairs these types
// with `subCommunicatorsSubgrids` and `grids_mpi_groups` respectively.
476 std::unordered_map<std::string, std::shared_ptr<El::mpi::Comm>>
480 std::unordered_map<std::string, std::unique_ptr<El::mpi::Group>>
// Read and written by get/set_subgrid_communication_type.
499 int vector_communication_subgraph = 0;
// Read and written by get/set_subgraph_num_parent_resources.
503 int subgraph_num_resources_parent = 0;
// Read and written by get/set_subgrid_topology.
508 bool enable_subgraph_topology =
false;
// Set to true by enable_subgraph_parallelism.
511 bool apply_subgraph_parallelism =
false;
// Background I/O is allowed by default.
552 bool m_background_io_allowed =
true;
// Initialized to false.
// NOTE(review): presumably set once setup completes; confirm.
557 bool m_model_is_setup =
false;
// Layer-graph fix-up helpers. Each takes the set of layer names by mutable
// reference; add_evaluation_layers also takes the set of layer pointers.
// NOTE(review): presumably the sets are updated as layers are added;
// confirm against the implementation.
575 void add_evaluation_layers(std::unordered_set<Layer*>& layer_set,
576 std::unordered_set<std::string>& layer_names);
586 void add_dummy_layers(std::unordered_set<std::string>& layer_names);
595 void add_split_layers(std::unordered_set<std::string>& layer_names);
597 void ensure_input_layers_first();
// These declarations exist only when LBANN_HAS_DISTCONV is defined.
615 #ifdef LBANN_HAS_DISTCONV 617 void setup_distconv();
618 void setup_distributions();
// Const member; does not modify the model.
619 void print_distributions()
const;
// Closes the LBANN_HAS_DISTCONV region, then defines model::get_name, which
// returns a copy of m_name.
620 #endif // LBANN_HAS_DISTCONV 623 inline std::string model::get_name() const noexcept {
return m_name; }
// Body fragment of an accessor whose signature line is missing from this
// listing; it returns the raw pointer held by m_objective_function.
627 return m_objective_function.get();
// Const overload of get_objective_function; same body as above.
631 model::get_objective_function() const noexcept
633 return m_objective_function.get();
// Builds a fresh vector of observer pointers to the callbacks held in
// m_callbacks.
636 inline std::vector<observer_ptr<callback_base>> model::get_callbacks()
638 std::vector<observer_ptr<callback_base>> callback_list;
// Reserve once so the push_back loop below does not reallocate.
639 callback_list.reserve(m_callbacks.size());
640 for (
const auto& ptr : m_callbacks) {
// ptr.get() yields the raw pointer; m_callbacks keeps ownership.
641 callback_list.push_back(ptr.get());
643 return callback_list;
// Returns the callback vector by mutable reference. The body is not present
// in this listing.
// NOTE(review): presumably returns m_callbacks; confirm.
646 inline std::vector<std::shared_ptr<callback_base>>&
647 model::get_callbacks_with_ownership() noexcept
// Returns the m_comm pointer unchanged.
652 inline lbann_comm* model::get_comm() const noexcept {
return m_comm; }
// True exactly when m_execution_context is non-null.
654 inline bool model::has_valid_execution_context() const noexcept
656 return (m_execution_context !=
nullptr);
// Fragment of an accessor whose signature line is missing from this
// listing. It checks for a null context before dereferencing; the handling
// inside the null branch is not shown.
661 if (m_execution_context ==
nullptr) {
664 return *m_execution_context;
// When a default optimizer message is set, constructs an optimizer from it
// through proto::construct_optimizer<TensorDataType>.
// NOTE(review): the fall-through return for the unset case is not present
// in this listing; presumably a null unique_ptr, to be confirmed.
673 template <
typename TensorDataType>
674 inline std::unique_ptr<optimizer> model::create_optimizer()
const 676 if (m_default_optimizer_msg)
677 return proto::construct_optimizer<TensorDataType>(*m_default_optimizer_msg);
// Stores `enable` in m_background_io_allowed.
681 inline void model::allow_background_io_activity(
bool enable) noexcept
683 m_background_io_allowed = enable;
// Returns m_background_io_allowed.
686 inline bool model::background_io_activity_allowed() const noexcept
688 return m_background_io_allowed;
// Trivial accessors for the subgraph-parallelism settings. Each setter is a
// plain assignment and each getter a plain read; no validation is done on
// the values passed in.
// Communication type <-> vector_communication_subgraph.
691 inline void model::set_subgrid_communication_type(
int type) noexcept
693 vector_communication_subgraph = type;
696 inline int model::get_subgrid_communication_type() const noexcept
698 return vector_communication_subgraph;
// Parent resource count <-> subgraph_num_resources_parent.
701 inline void model::set_subgraph_num_parent_resources(
int num_resources) noexcept
703 subgraph_num_resources_parent = num_resources;
706 inline int model::get_subgraph_num_parent_resources() const noexcept
708 return subgraph_num_resources_parent;
// Topology flag <-> enable_subgraph_topology.
711 inline void model::set_subgrid_topology(
bool type) noexcept
713 enable_subgraph_topology = type;
716 inline bool model::get_subgrid_topology() const noexcept
718 return enable_subgraph_topology;
// Only ever sets the flag to true.
721 inline void model::enable_subgraph_parallelism() noexcept
723 apply_subgraph_parallelism =
true;
726 inline bool model::is_subgraph_parallelism_enabled() const noexcept
728 return apply_subgraph_parallelism;
// Resource counts for non-branch and branch layers.
731 inline int model::get_num_resources_non_branch_layers() const noexcept
733 return num_resources_non_branch_layers;
736 inline int model::get_num_resources_branch_layers() const noexcept
738 return num_resources_branch_layers;
741 inline void model::set_num_resources_non_branch_layers(
int num) noexcept
743 num_resources_non_branch_layers = num;
746 inline void model::set_num_resources_branch_layers(
int num) noexcept
748 num_resources_branch_layers = num;
// Returns m_max_mini_batch_size.
751 inline El::Int model::get_max_mini_batch_size() const noexcept
753 return m_max_mini_batch_size;
// Returns m_current_mini_batch_size.
756 inline El::Int model::get_current_mini_batch_size() const noexcept
758 return m_current_mini_batch_size;
// Sets the current mini-batch size. A value above m_max_mini_batch_size
// produces the diagnostic text below. The line that emits it is missing
// from this listing; presumably an LBANN_WARNING call.
// NOTE(review): the function is noexcept; confirm that building and
// emitting the message cannot throw.
761 inline void model::set_current_mini_batch_size(El::Int mini_batch_size) noexcept
763 if (mini_batch_size > m_max_mini_batch_size) {
765 "LOGICAL ERROR: the current mini-batch size ",
767 " is being set to larger than the established maximum mini-batch size ",
768 m_max_mini_batch_size,
769 ". Note that this should work properly as all matrices will be resized, " 770 "but this is a logical error as the maximum mini-batch size should be " 771 "established at setup time to avoid dynamic allocation.");
// Stores the value. The oversize check above does not clamp it.
773 m_current_mini_batch_size = mini_batch_size;
779 #endif // LBANN_MODELS_MODEL_HPP_INCLUDED std::unordered_map< std::string, std::unique_ptr< El::mpi::Group > > grids_mpi_groups
El::Int m_current_mini_batch_size
The current mini-batch size.
std::weak_ptr< Layer > ViewingLayerPtr
Smart pointer to reference a layer object.
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.
Neural network tensor operation.
Generates nicely formatted description messages.
lbann_comm * m_comm
LBANN communicator.
std::vector< std::unique_ptr< metric > > m_metrics
Numerical quantities to evaluate model performance.
std::unordered_map< std::string, std::shared_ptr< El::mpi::Comm > > subCommunicatorsSubgrids
std::unique_ptr< lbann_data::Optimizer > m_default_optimizer_msg
std::unique_ptr< objective_function > m_objective_function
Mathematical function to be minimized during training.
Abstract base class for neural network models.
std::vector< std::shared_ptr< callback_base > > m_callbacks
Current callbacks to process.
The execution context for a KFAC algorithm.
std::shared_ptr< weights > OwningWeightsPtr
Smart pointer to manage ownership of a weights object.
std::weak_ptr< weights > ViewingWeightsPtr
Smart pointer to reference a weights object.
typename std::add_pointer< T >::type observer_ptr
Creating an observer_ptr to complement the unique_ptr and shared_ptr.
execution_mode
Neural network execution mode.
std::vector< OwningLayerPtr > m_layers
Tensor operations.
std::string m_name
Model instance's name.
ExecutionContext const & get_execution_context() const
std::unordered_map< std::string, std::shared_ptr< El::Grid > > grids
auto force(El::MultiSync< D, Ds... > const &x) -> El::SyncInfo< D > const &
Force the MultiSync to the master SyncInfo.
std::vector< OwningWeightsPtr > m_weights
Trainable parameters.
int num_resources_non_branch_layers
#define LBANN_WARNING(...)
observer_ptr< ExecutionContext > m_execution_context
int num_resources_branch_layers
std::shared_ptr< Layer > OwningLayerPtr
Smart pointer to manage ownership of a layer object.
El::Int m_max_mini_batch_size
The maximum mini-batch size.