LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
model.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_MODELS_MODEL_HPP_INCLUDED
28 #define LBANN_MODELS_MODEL_HPP_INCLUDED
29 
30 #include "lbann/base.hpp"
31 #include "lbann/io/file_io.hpp"
33 #include "lbann/utils/summary.hpp"
35 
36 #ifdef LBANN_HAS_ONNX
37 #include <onnx/onnx_pb.h>
38 #endif // LBANN_HAS_ONNX
39 
40 // Note (trb): There's what is, IMO, an STL error in GCC in which the
41 // dtor for unique_ptr is checking sizeof(T), so this must be a
42 // complete type. Sigh. (The greater implication of this is that you
43 // cannot have `unique_ptr<IncompleteType>` as a drop-in for
44 // `IncompleteType*`, which is annoying.)
45 #include "lbann/proto/optimizers.pb.h"
46 
47 #include <string>
48 #include <unordered_map>
49 #include <vector>
50 
51 // Forward-declare protobuf class
52 namespace lbann_data {
53 class Model;
54 }
55 
56 // Forward declaration
57 namespace cereal {
58 class access;
59 }
60 
61 namespace lbann {
62 
// Forward declarations.
//
// Only pointers and references to these types are used in this header, so
// incomplete types are sufficient here.
class lbann_comm;
class description;
class Layer;
// NOTE(review): lbann_callback is not referenced anywhere else in this
// header (callback_base is used instead) -- possibly stale; confirm before
// removing.
class lbann_callback;
class TrainingAlgorithm;
class callback_base;
class metric;
class weights;
class optimizer;
class objective_function;
class ExecutionContext;
class persist;

/** @brief Smart pointer to manage ownership of a weights object. */
using OwningWeightsPtr = std::shared_ptr<weights>;
/** @brief Smart pointer to reference a weights object. */
using ViewingWeightsPtr = std::weak_ptr<weights>;
/** @brief Smart pointer to manage ownership of a layer object. */
using OwningLayerPtr = std::shared_ptr<Layer>;
/** @brief Smart pointer to reference a layer object. */
using ViewingLayerPtr = std::weak_ptr<Layer>;
81 
83 class model
84 {
85 public:
86  // ===========================================
87  // Life cycle functions
88  // ===========================================
89 
90  model(lbann_comm* comm,
91  std::unique_ptr<objective_function> obj_fn,
92  std::unique_ptr<lbann_data::Optimizer> default_optimizer_msg = nullptr);
93  model(const model& other);
94  model& operator=(const model& other);
95  ~model() = default;
96 
98 
104  void set_name(std::string name);
105 
110  std::string get_name() const noexcept;
111 
113  description get_description() const;
114 
116  lbann_comm* get_comm() const noexcept;
117 
119 
120 
123  El::Int get_num_layers() const noexcept;
125  Layer& get_layer(El::Int pos);
127  Layer const& get_layer(El::Int pos) const;
131  std::vector<Layer*> get_layers();
135  std::vector<Layer const*> get_layers() const;
136  std::vector<weights*> get_weights();
137  std::vector<weights const*> get_weights() const;
138  std::vector<ViewingWeightsPtr> get_weights_pointers() const;
139 
142  get_objective_function() const noexcept;
143 
144  observer_ptr<objective_function> get_objective_function() noexcept;
145 
147  std::vector<metric*> get_metrics();
148  std::vector<metric const*> get_metrics() const;
149 
150 #ifdef LBANN_HAS_ONNX
151 
152  void serialize_to_onnx(onnx::ModelProto& mp);
153 #endif // LBANN_HAS_ONNX
154 
155  // ===========================================
156  // Model specification
157  // ===========================================
159 
160 
163  void add_layer(OwningLayerPtr&& l);
164 
166  void add_weights(OwningWeightsPtr&& w);
167 
169  void remove_weights(std::string const& name);
170 
172  void add_callback(std::shared_ptr<callback_base> cb);
173 
175  void add_metric(std::unique_ptr<metric> m);
176 
178  void insert_layer(OwningLayerPtr&& l, std::string const& parent_name);
179 
181  void remove_layer(std::string const& name);
182 
184  void replace_layer(OwningLayerPtr&& l, std::string const& name);
185 
186  void swap_layers(model& other);
187  void swap_weights(model& other);
188  void swap_metrics(model& other);
189  void swap_objective_function(model& other);
190 
192 
198  void copy_trained_weights_from(std::vector<weights*>& w);
199 
204  template <typename TensorDataType>
205  std::unique_ptr<optimizer> create_optimizer() const;
206 
210  void allow_background_io_activity(bool enable) noexcept;
211 
213  bool background_io_activity_allowed() const noexcept;
214 
215  // ===========================================
216  // Setup
217  // ===========================================
218 
221  void setup(size_t max_mini_batch_size,
222  const std::vector<El::Grid*>& grids,
223  bool force = false);
224 
226 
231  void summarize_stats(lbann_summary& summarizer);
232 
236  void summarize_matrices(lbann_summary& summarizer);
237 
239 
240 
243  template <class Archive>
244  void serialize(Archive& ar);
245 
248  bool save_to_checkpoint_shared(persist& p);
251  bool load_from_checkpoint_shared(persist& p);
252 
253  bool save_to_checkpoint_distributed(persist& p);
254  bool load_from_checkpoint_distributed(persist& p);
255 
257  void write_proto(lbann_data::Model& proto);
258 
269  void save_model();
270 
272 
273 
275  void set_subgrid_communication_type(int type) noexcept;
276  int get_subgrid_communication_type() const noexcept;
277  void set_subgraph_num_parent_resources(int num_resources) noexcept;
278  int get_subgraph_num_parent_resources() const noexcept;
279  void set_subgrid_topology(bool type) noexcept;
280  bool get_subgrid_topology() const noexcept;
281  void enable_subgraph_parallelism() noexcept;
282  bool is_subgraph_parallelism_enabled() const noexcept;
283  int get_num_resources_non_branch_layers() const noexcept;
284  int get_num_resources_branch_layers() const noexcept;
285  void set_num_resources_non_branch_layers(int num) noexcept;
286  void set_num_resources_branch_layers(int num) noexcept;
287 
289 
290 private:
292 
303  void reorder_layers(const std::vector<El::Int>& gather_indices);
304 
311  void remap_pointers(
312  const std::unordered_map<Layer*, ViewingLayerPtr>& layer_map,
313  const std::unordered_map<weights*, ViewingWeightsPtr>& weights_map);
314 
321  void setup_layer_topology();
322 
329  void setup_layer_execution_order();
330 
336  void setup_layer_grid_tags(const std::vector<El::Grid*>& grids);
337 
342  void setup_layers(size_t max_mini_batch_size,
343  const std::vector<El::Grid*>& grids);
344 
351  void setup_weights();
352 
354 
355 
358  void setup_subgrids();
359 
360  void get_subgrids_order(std::vector<int>& ranks_order, int num_branches);
361  int get_max_subgraph_branches();
362  void check_subgraph_parallelism();
363  void setup_subgrid_layers_run_condition();
364  void get_parent_subgrid_tags(int layer_index);
365  void get_subgraph_subgrids_ranks(std::vector<int>& parent_ranks,
366  std::vector<int>& subgrid_ranks,
367  int layer_index,
368  int number_ranks_in_grid);
369  void get_resources_for_spliting_point(std::vector<int>& parent_ranks,
370  std::vector<int>& subgrid_ranks,
371  int layer_index,
372  int number_ranks_in_grid,
373  int num_subgrids);
374  void get_resources_for_merge_layers(std::set<int>& pooled_set,
375  int child_index,
376  int num_subgrids);
377  void get_resources_for_input_layer(std::vector<int>& masterSubGrid,
378  int num_subgrids);
379  void setup_subcommunicators(const std::vector<El::Grid*>& grids);
380 
382 
383 public:
384  // ===========================================
385  // Execution
386  // ===========================================
387 
389  std::vector<observer_ptr<callback_base>> get_callbacks();
390 
391  std::vector<std::shared_ptr<callback_base>>&
392  get_callbacks_with_ownership() noexcept;
393 
395  bool has_valid_execution_context() const noexcept;
396 
398  ExecutionContext const& get_execution_context() const;
399 
401  ExecutionContext& get_execution_context();
402 
404  void reset_mode(ExecutionContext& context, execution_mode mode);
406  void reset_epoch_statistics(execution_mode mode);
407 
409  void forward_prop(execution_mode mode);
411  void backward_prop(bool compute_weight_grads_only = true);
413  void evaluate_metrics(execution_mode mode, size_t current_mini_batch_size);
419  void clear_gradients();
421  void update_weights();
423  bool update_layers();
429  void reconcile_weight_values();
430 
431  // ===========================================
432  // Callbacks
433  // ===========================================
434 
436  void do_setup_end_cbs();
438  void do_model_forward_prop_begin_cbs(execution_mode mode);
440  void do_model_forward_prop_end_cbs(execution_mode mode);
442  void do_layer_forward_prop_begin_cbs(execution_mode mode, Layer* l);
444  void do_layer_forward_prop_end_cbs(execution_mode mode, Layer* l);
446  void do_model_backward_prop_begin_cbs();
448  void do_model_backward_prop_end_cbs();
450  void do_layer_backward_prop_begin_cbs(Layer* l);
452  void do_layer_backward_prop_end_cbs(Layer* l);
454  void do_model_optimize_begin_cbs();
456  void do_model_optimize_end_cbs();
458  void do_weight_optimize_begin_cbs(weights* w);
460  void do_weight_optimize_end_cbs(weights* w);
462  El::Int get_max_mini_batch_size() const noexcept;
464  El::Int get_current_mini_batch_size() const noexcept;
466  void set_current_mini_batch_size(El::Int) noexcept;
467 
468 private:
469  friend cereal::access;
470  model();
471 
472 private:
473  // map to store all distinct grids in the model
474  std::unordered_map<std::string, std::shared_ptr<El::Grid>> grids;
475 
476  std::unordered_map<std::string, std::shared_ptr<El::mpi::Comm>>
478  // map to store all distinct mpi groups in the model (one to one mapping with
479  // grids)
480  std::unordered_map<std::string, std::unique_ptr<El::mpi::Group>>
482 
483 private:
487 
490 
491  /*experimental code for Sub graph*/
493  // 0: send-recv based subgrid communication
494  // 1: collective based subgrid communication without optimization that
495  // requires specific assumptions like subgrids should have same size and
496  // creates sub-communicators everytime 2: collective based subgrid
497  // communication with optimization
498 
499  int vector_communication_subgraph = 0;
500 
501  // Number of resources for parent (common) grid
502  // 0: use all resources (default)
503  int subgraph_num_resources_parent = 0;
504 
505  // 0: no topology aware design
506  // 1: master grid in round robin manner of nodes (GPUs per node 4) 1 3 5 7, 2
507  // 4 6 8
508  bool enable_subgraph_topology = false;
509 
510  // whether subgraph parallelism is enabled or not for the model
511  bool apply_subgraph_parallelism = false;
512 
513  // total number of resources / ranks for branch (subgrid) layers
515 
516  // total number of resources / ranks for common/seq layers
518 
523  std::string m_name;
524 
528  std::vector<OwningLayerPtr> m_layers;
529 
531  std::vector<OwningWeightsPtr> m_weights;
532 
538  std::unique_ptr<lbann_data::Optimizer> m_default_optimizer_msg;
539 
541  std::unique_ptr<objective_function> m_objective_function;
542 
546  std::vector<std::unique_ptr<metric>> m_metrics;
547 
549  std::vector<std::shared_ptr<callback_base>> m_callbacks;
550 
552  bool m_background_io_allowed = true;
553 
557  bool m_model_is_setup = false;
558 
559 private:
560  // ===========================================
561  // Functions to add utility layers
562  // ===========================================
563 
575  void add_evaluation_layers(std::unordered_set<Layer*>& layer_set,
576  std::unordered_set<std::string>& layer_names);
577 
586  void add_dummy_layers(std::unordered_set<std::string>& layer_names);
595  void add_split_layers(std::unordered_set<std::string>& layer_names);
596 
597  void ensure_input_layers_first();
598 
603 
614 
615 #ifdef LBANN_HAS_DISTCONV
616 private:
617  void setup_distconv();
618  void setup_distributions();
619  void print_distributions() const;
620 #endif // LBANN_HAS_DISTCONV
621 }; // class model
622 
623 inline std::string model::get_name() const noexcept { return m_name; }
624 
625 inline observer_ptr<objective_function> model::get_objective_function() noexcept
626 {
627  return m_objective_function.get();
628 }
629 
631 model::get_objective_function() const noexcept
632 {
633  return m_objective_function.get();
634 }
635 
636 inline std::vector<observer_ptr<callback_base>> model::get_callbacks()
637 {
638  std::vector<observer_ptr<callback_base>> callback_list;
639  callback_list.reserve(m_callbacks.size());
640  for (const auto& ptr : m_callbacks) {
641  callback_list.push_back(ptr.get());
642  }
643  return callback_list;
644 }
645 
646 inline std::vector<std::shared_ptr<callback_base>>&
647 model::get_callbacks_with_ownership() noexcept
648 {
649  return m_callbacks;
650 }
651 
652 inline lbann_comm* model::get_comm() const noexcept { return m_comm; }
653 
654 inline bool model::has_valid_execution_context() const noexcept
655 {
656  return (m_execution_context != nullptr);
657 }
658 
659 inline ExecutionContext const& model::get_execution_context() const
660 {
661  if (m_execution_context == nullptr) {
662  LBANN_ERROR("execution context is not set");
663  }
664  return *m_execution_context;
665 }
666 
667 inline ExecutionContext& model::get_execution_context()
668 {
669  return const_cast<ExecutionContext&>(
670  static_cast<const model&>(*this).get_execution_context());
671 }
672 
673 template <typename TensorDataType>
674 inline std::unique_ptr<optimizer> model::create_optimizer() const
675 {
676  if (m_default_optimizer_msg)
677  return proto::construct_optimizer<TensorDataType>(*m_default_optimizer_msg);
678  return nullptr;
679 }
680 
681 inline void model::allow_background_io_activity(bool enable) noexcept
682 {
683  m_background_io_allowed = enable;
684 }
685 
686 inline bool model::background_io_activity_allowed() const noexcept
687 {
688  return m_background_io_allowed;
689 }
690 
691 inline void model::set_subgrid_communication_type(int type) noexcept
692 {
693  vector_communication_subgraph = type;
694 }
695 
696 inline int model::get_subgrid_communication_type() const noexcept
697 {
698  return vector_communication_subgraph;
699 }
700 
701 inline void model::set_subgraph_num_parent_resources(int num_resources) noexcept
702 {
703  subgraph_num_resources_parent = num_resources;
704 }
705 
706 inline int model::get_subgraph_num_parent_resources() const noexcept
707 {
708  return subgraph_num_resources_parent;
709 }
710 
711 inline void model::set_subgrid_topology(bool type) noexcept
712 {
713  enable_subgraph_topology = type;
714 }
715 
716 inline bool model::get_subgrid_topology() const noexcept
717 {
718  return enable_subgraph_topology;
719 }
720 
721 inline void model::enable_subgraph_parallelism() noexcept
722 {
723  apply_subgraph_parallelism = true;
724 }
725 
726 inline bool model::is_subgraph_parallelism_enabled() const noexcept
727 {
728  return apply_subgraph_parallelism;
729 }
730 
731 inline int model::get_num_resources_non_branch_layers() const noexcept
732 {
733  return num_resources_non_branch_layers;
734 }
735 
736 inline int model::get_num_resources_branch_layers() const noexcept
737 {
738  return num_resources_branch_layers;
739 }
740 
741 inline void model::set_num_resources_non_branch_layers(int num) noexcept
742 {
743  num_resources_non_branch_layers = num;
744 }
745 
746 inline void model::set_num_resources_branch_layers(int num) noexcept
747 {
748  num_resources_branch_layers = num;
749 }
750 
751 inline El::Int model::get_max_mini_batch_size() const noexcept
752 {
753  return m_max_mini_batch_size;
754 }
755 
756 inline El::Int model::get_current_mini_batch_size() const noexcept
757 {
758  return m_current_mini_batch_size;
759 }
760 
761 inline void model::set_current_mini_batch_size(El::Int mini_batch_size) noexcept
762 {
763  if (mini_batch_size > m_max_mini_batch_size) {
765  "LOGICAL ERROR: the current mini-batch size ",
766  mini_batch_size,
767  " is being set to larger than the established maximum mini-batch size ",
768  m_max_mini_batch_size,
769  ". Note that this should work properly as all matrices will be resized, "
770  "but this is a logical error as the maximum mini-batch size should be "
771  "established at setup time to avoid dynamic allocation.");
772  }
773  m_current_mini_batch_size = mini_batch_size;
774  return;
775 }
776 
777 } // namespace lbann
778 
779 #endif // LBANN_MODELS_MODEL_HPP_INCLUDED
std::unordered_map< std::string, std::unique_ptr< El::mpi::Group > > grids_mpi_groups
Definition: model.hpp:481
El::Int m_current_mini_batch_size
The current mini-batch size.
Definition: model.hpp:613
#define LBANN_ERROR(...)
Definition: exception.hpp:37
std::weak_ptr< Layer > ViewingLayerPtr
Smart pointer to reference a layer object.
Definition: layer.hpp:133
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.
Neural network tensor operation.
Definition: layer.hpp:285
Generates nicely formatted description messages.
Definition: description.hpp:49
lbann_comm * m_comm
LBANN communicator.
Definition: model.hpp:489
std::vector< std::unique_ptr< metric > > m_metrics
Numerical quantities to evaluate model performance.
Definition: model.hpp:546
std::unordered_map< std::string, std::shared_ptr< El::mpi::Comm > > subCommunicatorsSubgrids
Definition: model.hpp:477
std::unique_ptr< lbann_data::Optimizer > m_default_optimizer_msg
Definition: model.hpp:538
std::unique_ptr< objective_function > m_objective_function
Mathematical function to be minimized during training.
Definition: model.hpp:541
Abstract base class for neural network models.
Definition: model.hpp:83
std::vector< std::shared_ptr< callback_base > > m_callbacks
Current callbacks to process.
Definition: model.hpp:549
The execution context for a KFAC algorithm.
std::shared_ptr< weights > OwningWeightsPtr
Smart pointer to manage ownership of a weights object.
Definition: model.hpp:77
std::weak_ptr< weights > ViewingWeightsPtr
Smart pointer to reference a weights object.
Definition: layer.hpp:89
typename std::add_pointer< T >::type observer_ptr
Creating an observer_ptr to complement the unique_ptr and shared_ptr.
Definition: base.hpp:54
execution_mode
Neural network execution mode.
Definition: base.hpp:229
std::vector< OwningLayerPtr > m_layers
Tensor operations.
Definition: model.hpp:528
std::string m_name
Model instance's name.
Definition: model.hpp:523
ExecutionContext const & get_execution_context() const
Definition: model.hpp:659
std::unordered_map< std::string, std::shared_ptr< El::Grid > > grids
Definition: model.hpp:474
auto force(El::MultiSync< D, Ds... > const &x) -> El::SyncInfo< D > const &
Force the MultiSync to the master SyncInfo.
std::vector< OwningWeightsPtr > m_weights
Trainable parameters.
Definition: model.hpp:531
int num_resources_non_branch_layers
Definition: model.hpp:517
#define LBANN_WARNING(...)
Definition: exception.hpp:53
observer_ptr< ExecutionContext > m_execution_context
Definition: model.hpp:486
int num_resources_branch_layers
Definition: model.hpp:514
std::shared_ptr< Layer > OwningLayerPtr
Smart pointer to manage ownership of a layer object.
Definition: layer.hpp:125
El::Int m_max_mini_batch_size
The maximum mini-batch size.
Definition: model.hpp:602