dc/ddc/model_8hpp_source.html

 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
 // Produced at the Lawrence Livermore National Laboratory.
 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 //
 // LLNL-CODE-697807.
 // All rights reserved.
 //
 // This file is part of LBANN: Livermore Big Artificial Neural Network
 // Toolkit. For details, see http://software.llnl.gov/LBANN or
 // https://github.com/LLNL/LBANN.
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you
 // may not use this file except in compliance with the License.  You may
 // obtain a copy of the License at:
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing
 // permissions and limitations under the License.

 #ifndef LBANN_MODELS_MODEL_HPP_INCLUDED
 #define LBANN_MODELS_MODEL_HPP_INCLUDED

 #include "lbann/base.hpp"
 #include "lbann/io/file_io.hpp"
 #include "lbann/proto/factories.hpp"
 #include "lbann/utils/summary.hpp"
 #include "lbann/utils/threads/thread_pool.hpp"

 #ifdef LBANN_HAS_ONNX
 #include <onnx/onnx_pb.h>
 #endif // LBANN_HAS_ONNX

 // Note (trb): There's what is, IMO, an STL error in GCC in which the
 // dtor for unique_ptr is checking sizeof(T), so this must be a
 // complete type. Sigh. (The greater implication of this is that you
 // cannot have `unique_ptr<IncompleteType>` as a drop-in for
 // `IncompleteType*`, which is annoying.)
 #include "lbann/proto/optimizers.pb.h"

 #include <string>
 #include <unordered_map>
 #include <vector>

 // Incomplete-type declaration of the protobuf message class; this header
 // only names it as a reference parameter (see model::write_proto).
 namespace lbann_data {
 class Model;
 } // namespace lbann_data

 // Incomplete-type declaration of cereal's access class, which is named in
 // the friend declaration inside lbann::model.
 namespace cereal {
 class access;
 } // namespace cereal

 namespace lbann {

 // Forward declarations (each class is declared exactly once).
 class lbann_comm;
 class description;
 class Layer;
 class lbann_callback;
 class TrainingAlgorithm;
 class callback_base;
 class metric;
 class weights;
 class optimizer;
 class objective_function;
 class ExecutionContext;
 class persist;

 // Pointer aliases: "Owning" aliases are shared_ptr (manage ownership of the
 // object), "Viewing" aliases are weak_ptr (reference it without owning).
 using OwningWeightsPtr = std::shared_ptr<weights>;
 using ViewingWeightsPtr = std::weak_ptr<weights>;
 using OwningLayerPtr = std::shared_ptr<Layer>;
 using ViewingLayerPtr = std::weak_ptr<Layer>;

 class model
 {
 public:
   // ===========================================
   // Life cycle functions
   // ===========================================

   model(lbann_comm* comm,
         std::unique_ptr<objective_function> obj_fn,
         std::unique_ptr<lbann_data::Optimizer> default_optimizer_msg = nullptr);
   model(const model& other);
   model& operator=(const model& other);
   ~model() = default;


   void set_name(std::string name);

   std::string get_name() const noexcept;

   description get_description() const;

   lbann_comm* get_comm() const noexcept;


   El::Int get_num_layers() const noexcept;
   Layer& get_layer(El::Int pos);
   Layer const& get_layer(El::Int pos) const;
   std::vector<Layer*> get_layers();
   std::vector<Layer const*> get_layers() const;
   std::vector<weights*> get_weights();
   std::vector<weights const*> get_weights() const;
   std::vector<ViewingWeightsPtr> get_weights_pointers() const;

   observer_ptr<objective_function const>
   get_objective_function() const noexcept;

   observer_ptr<objective_function> get_objective_function() noexcept;

   std::vector<metric*> get_metrics();
   std::vector<metric const*> get_metrics() const;

 #ifdef LBANN_HAS_ONNX

   void serialize_to_onnx(onnx::ModelProto& mp);
 #endif // LBANN_HAS_ONNX

   // ===========================================
   // Model specification
   // ===========================================


   void add_layer(OwningLayerPtr&& l);

   void add_weights(OwningWeightsPtr&& w);

   void remove_weights(std::string const& name);

   void add_callback(std::shared_ptr<callback_base> cb);

   void add_metric(std::unique_ptr<metric> m);

   void insert_layer(OwningLayerPtr&& l, std::string const& parent_name);

   void remove_layer(std::string const& name);

   void replace_layer(OwningLayerPtr&& l, std::string const& name);

   void swap_layers(model& other);
   void swap_weights(model& other);
   void swap_metrics(model& other);
   void swap_objective_function(model& other);


   void copy_trained_weights_from(std::vector<weights*>& w);

   template <typename TensorDataType>
   std::unique_ptr<optimizer> create_optimizer() const;

   void allow_background_io_activity(bool enable) noexcept;

   bool background_io_activity_allowed() const noexcept;

   // ===========================================
   // Setup
   // ===========================================

   void setup(size_t max_mini_batch_size,
              const std::vector<El::Grid*>& grids,
              bool force = false);


   void summarize_stats(lbann_summary& summarizer);

   void summarize_matrices(lbann_summary& summarizer);


   template <class Archive>
   void serialize(Archive& ar);

   bool save_to_checkpoint_shared(persist& p);
   bool load_from_checkpoint_shared(persist& p);

   bool save_to_checkpoint_distributed(persist& p);
   bool load_from_checkpoint_distributed(persist& p);

   void write_proto(lbann_data::Model& proto);

   void save_model();


   void set_subgrid_communication_type(int type) noexcept;
   int get_subgrid_communication_type() const noexcept;
   void set_subgraph_num_parent_resources(int num_resources) noexcept;
   int get_subgraph_num_parent_resources() const noexcept;
   void set_subgrid_topology(bool type) noexcept;
   bool get_subgrid_topology() const noexcept;
   void enable_subgraph_parallelism() noexcept;
   bool is_subgraph_parallelism_enabled() const noexcept;
   int get_num_resources_non_branch_layers() const noexcept;
   int get_num_resources_branch_layers() const noexcept;
   void set_num_resources_non_branch_layers(int num) noexcept;
   void set_num_resources_branch_layers(int num) noexcept;


 private:

   void reorder_layers(const std::vector<El::Int>& gather_indices);

   void remap_pointers(
     const std::unordered_map<Layer*, ViewingLayerPtr>& layer_map,
     const std::unordered_map<weights*, ViewingWeightsPtr>& weights_map);

   void setup_layer_topology();

   void setup_layer_execution_order();

   void setup_layer_grid_tags(const std::vector<El::Grid*>& grids);

   void setup_layers(size_t max_mini_batch_size,
                     const std::vector<El::Grid*>& grids);

   void setup_weights();


   void setup_subgrids();

   void get_subgrids_order(std::vector<int>& ranks_order, int num_branches);
   int get_max_subgraph_branches();
   void check_subgraph_parallelism();
   void setup_subgrid_layers_run_condition();
   void get_parent_subgrid_tags(int layer_index);
   void get_subgraph_subgrids_ranks(std::vector<int>& parent_ranks,
                                    std::vector<int>& subgrid_ranks,
                                    int layer_index,
                                    int number_ranks_in_grid);
   void get_resources_for_spliting_point(std::vector<int>& parent_ranks,
                                         std::vector<int>& subgrid_ranks,
                                         int layer_index,
                                         int number_ranks_in_grid,
                                         int num_subgrids);
   void get_resources_for_merge_layers(std::set<int>& pooled_set,
                                       int child_index,
                                       int num_subgrids);
   void get_resources_for_input_layer(std::vector<int>& masterSubGrid,
                                      int num_subgrids);
   void setup_subcommunicators(const std::vector<El::Grid*>& grids);


 public:
   // ===========================================
   // Execution
   // ===========================================

   std::vector<observer_ptr<callback_base>> get_callbacks();

   std::vector<std::shared_ptr<callback_base>>&
   get_callbacks_with_ownership() noexcept;

   bool has_valid_execution_context() const noexcept;

   ExecutionContext const& get_execution_context() const;

   ExecutionContext& get_execution_context();

   void reset_mode(ExecutionContext& context, execution_mode mode);
   void reset_epoch_statistics(execution_mode mode);

   void forward_prop(execution_mode mode);
   void backward_prop(bool compute_weight_grads_only = true);
   void evaluate_metrics(execution_mode mode, size_t current_mini_batch_size);
   void clear_gradients();
   void update_weights();
   bool update_layers();
   void reconcile_weight_values();

   // ===========================================
   // Callbacks
   // ===========================================

   void do_setup_end_cbs();
   void do_model_forward_prop_begin_cbs(execution_mode mode);
   void do_model_forward_prop_end_cbs(execution_mode mode);
   void do_layer_forward_prop_begin_cbs(execution_mode mode, Layer* l);
   void do_layer_forward_prop_end_cbs(execution_mode mode, Layer* l);
   void do_model_backward_prop_begin_cbs();
   void do_model_backward_prop_end_cbs();
   void do_layer_backward_prop_begin_cbs(Layer* l);
   void do_layer_backward_prop_end_cbs(Layer* l);
   void do_model_optimize_begin_cbs();
   void do_model_optimize_end_cbs();
   void do_weight_optimize_begin_cbs(weights* w);
   void do_weight_optimize_end_cbs(weights* w);
   El::Int get_max_mini_batch_size() const noexcept;
   El::Int get_current_mini_batch_size() const noexcept;
   void set_current_mini_batch_size(El::Int) noexcept;

 private:
   friend cereal::access;
   model();

 private:
   // map to store all distinct grids in the model
   std::unordered_map<std::string, std::shared_ptr<El::Grid>> grids;

   std::unordered_map<std::string, std::shared_ptr<El::mpi::Comm>>
     subCommunicatorsSubgrids;
   // map to store all distinct mpi groups in the model (one to one mapping with
   // grids)
   std::unordered_map<std::string, std::unique_ptr<El::mpi::Group>>
     grids_mpi_groups;

 private:
   observer_ptr<ExecutionContext> m_execution_context;

   lbann_comm* m_comm;

   /*experimental code for Sub graph*/
   // 0: send-recv based subgrid communication
   // 1: collective based subgrid communication without optimization that
   // requires specific assumptions like subgrids should have same size and
   // creates sub-communicators everytime 2: collective based subgrid
   // communication with optimization

   int vector_communication_subgraph = 0;

   // Number of resources for parent (common) grid
   // 0: use all resources (default)
   int subgraph_num_resources_parent = 0;

   // 0: no topology aware design
   // 1: master grid in round robin manner of nodes (GPUs per node 4)  1 3 5 7, 2
   // 4 6 8
   bool enable_subgraph_topology = false;

   // whether subgraph parallelism is enabled or not for the model
   bool apply_subgraph_parallelism = false;

   // total number of resources / ranks for branch (subgrid) layers
   int num_resources_branch_layers;

   // total number of resources / ranks for common/seq layers
   int num_resources_non_branch_layers;

   std::string m_name;

   std::vector<OwningLayerPtr> m_layers;

   std::vector<OwningWeightsPtr> m_weights;

   std::unique_ptr<lbann_data::Optimizer> m_default_optimizer_msg;

   std::unique_ptr<objective_function> m_objective_function;

   std::vector<std::unique_ptr<metric>> m_metrics;

   std::vector<std::shared_ptr<callback_base>> m_callbacks;

   bool m_background_io_allowed = true;

   bool m_model_is_setup = false;

 private:
   // ===========================================
   // Functions to add utility layers
   // ===========================================

   void add_evaluation_layers(std::unordered_set<Layer*>& layer_set,
                              std::unordered_set<std::string>& layer_names);

   void add_dummy_layers(std::unordered_set<std::string>& layer_names);
   void add_split_layers(std::unordered_set<std::string>& layer_names);

   void ensure_input_layers_first();

   El::Int m_max_mini_batch_size;

   El::Int m_current_mini_batch_size;

 #ifdef LBANN_HAS_DISTCONV
 private:
   void setup_distconv();
   void setup_distributions();
   void print_distributions() const;
 #endif // LBANN_HAS_DISTCONV
 };     // class model

 /// Return the model instance's name by value (a copy of m_name).
 inline std::string model::get_name() const noexcept
 {
   return m_name;
 }

 /// Non-owning pointer to the objective function held in
 /// m_objective_function; null when that unique_ptr is empty.
 inline observer_ptr<objective_function> model::get_objective_function() noexcept
 {
   observer_ptr<objective_function> obj_fn = m_objective_function.get();
   return obj_fn;
 }

 /// Const overload: non-owning, read-only pointer to the objective function
 /// held in m_objective_function; null when that unique_ptr is empty.
 inline observer_ptr<objective_function const>
 model::get_objective_function() const noexcept
 {
   observer_ptr<objective_function const> obj_fn = m_objective_function.get();
   return obj_fn;
 }

 /// Build a fresh vector of non-owning pointers, one per entry of
 /// m_callbacks and in the same order. Ownership stays with the model.
 inline std::vector<observer_ptr<callback_base>> model::get_callbacks()
 {
   std::vector<observer_ptr<callback_base>> observers;
   // Single allocation: the final size is known up front.
   observers.reserve(m_callbacks.size());
   for (auto const& owned_cb : m_callbacks) {
     observers.push_back(owned_cb.get());
   }
   return observers;
 }

 inline std::vector<std::shared_ptr<callback_base>>&
 model::get_callbacks_with_ownership() noexcept
 {
   return m_callbacks;
 }

 /// Return the stored LBANN communicator pointer (non-owning).
 inline lbann_comm* model::get_comm() const noexcept
 {
   return m_comm;
 }

 /// True exactly when the execution-context pointer is non-null.
 inline bool model::has_valid_execution_context() const noexcept
 {
   const bool context_is_set = (m_execution_context != nullptr);
   return context_is_set;
 }

 /// Read-only access to the current execution context.
 /// Reports "execution context is not set" through LBANN_ERROR when the
 /// pointer is null (LBANN_ERROR presumably throws -- see exception.hpp).
 inline ExecutionContext const& model::get_execution_context() const
 {
   if (!has_valid_execution_context()) {
     LBANN_ERROR("execution context is not set");
   }
   return *m_execution_context;
 }

 /// Mutable access to the current execution context. Delegates to the const
 /// overload so the null check lives in one place, then strips the const
 /// (safe because *this is non-const here).
 inline ExecutionContext& model::get_execution_context()
 {
   const model& self = *this;
   ExecutionContext const& context = self.get_execution_context();
   return const_cast<ExecutionContext&>(context);
 }

 /// Construct an optimizer for TensorDataType from the default optimizer
 /// message via proto::construct_optimizer.
 /// @return The new optimizer, or nullptr when no default message is stored.
 template <typename TensorDataType>
 inline std::unique_ptr<optimizer> model::create_optimizer() const
 {
   // Guard: without a default message there is nothing to construct from.
   if (!m_default_optimizer_msg) {
     return nullptr;
   }
   return proto::construct_optimizer<TensorDataType>(*m_default_optimizer_msg);
 }

 inline void model::allow_background_io_activity(bool enable) noexcept
 {
   m_background_io_allowed = enable;
 }

 inline bool model::background_io_activity_allowed() const noexcept
 {
   return m_background_io_allowed;
 }

 inline void model::set_subgrid_communication_type(int type) noexcept
 {
   vector_communication_subgraph = type;
 }

 inline int model::get_subgrid_communication_type() const noexcept
 {
   return vector_communication_subgraph;
 }

 inline void model::set_subgraph_num_parent_resources(int num_resources) noexcept
 {
   subgraph_num_resources_parent = num_resources;
 }

 inline int model::get_subgraph_num_parent_resources() const noexcept
 {
   return subgraph_num_resources_parent;
 }

 inline void model::set_subgrid_topology(bool type) noexcept
 {
   enable_subgraph_topology = type;
 }

 inline bool model::get_subgrid_topology() const noexcept
 {
   return enable_subgraph_topology;
 }

 inline void model::enable_subgraph_parallelism() noexcept
 {
   apply_subgraph_parallelism = true;
 }

 inline bool model::is_subgraph_parallelism_enabled() const noexcept
 {
   return apply_subgraph_parallelism;
 }

 inline int model::get_num_resources_non_branch_layers() const noexcept
 {
   return num_resources_non_branch_layers;
 }

 inline int model::get_num_resources_branch_layers() const noexcept
 {
   return num_resources_branch_layers;
 }

 inline void model::set_num_resources_non_branch_layers(int num) noexcept
 {
   num_resources_non_branch_layers = num;
 }

 inline void model::set_num_resources_branch_layers(int num) noexcept
 {
   num_resources_branch_layers = num;
 }

 /// Return the maximum mini-batch size stored in m_max_mini_batch_size.
 inline El::Int model::get_max_mini_batch_size() const noexcept
 {
   return this->m_max_mini_batch_size;
 }

 /// Return the current mini-batch size stored in m_current_mini_batch_size.
 inline El::Int model::get_current_mini_batch_size() const noexcept
 {
   return this->m_current_mini_batch_size;
 }

 /// Record the current mini-batch size.
 ///
 /// A size above m_max_mini_batch_size is still stored, but a warning is
 /// issued first through LBANN_WARNING.
 inline void model::set_current_mini_batch_size(El::Int mini_batch_size) noexcept
 {
   const bool exceeds_maximum = mini_batch_size > m_max_mini_batch_size;
   if (exceeds_maximum) {
     LBANN_WARNING(
       "LOGICAL ERROR: the current mini-batch size ",
       mini_batch_size,
       " is being set to larger than the established maximum mini-batch size ",
       m_max_mini_batch_size,
       ".  Note that this should work properly as all matrices will be resized, "
       "but this is a logical error as the maximum mini-batch size should be "
       "established at setup time to avoid dynamic allocation.");
   }
   // The requested size is stored whether or not the warning fired.
   m_current_mini_batch_size = mini_batch_size;
 }

 } // namespace lbann

 #endif // LBANN_MODELS_MODEL_HPP_INCLUDED
lbann::model::grids_mpi_groups
std::unordered_map< std::string, std::unique_ptr< El::mpi::Group > > grids_mpi_groups
Definition: model.hpp:481

lbann::model::m_current_mini_batch_size
El::Int m_current_mini_batch_size
The current mini-batch size.
Definition: model.hpp:613

LBANN_ERROR
#define LBANN_ERROR(...)
Definition: exception.hpp:37

lbann::ViewingLayerPtr
std::weak_ptr< Layer > ViewingLayerPtr
Smart pointer to reference a layer object.
Definition: layer.hpp:133

file_io.hpp

lbann::lbann_comm
Definition: comm.hpp:105

lbann::protobuf::serialize
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.

lbann::Layer
Neural network tensor operation.
Definition: layer.hpp:285

lbann::description
Generates nicely formatted description messages.
Definition: description.hpp:49

lbann::model::m_comm
lbann_comm * m_comm
LBANN communicator.
Definition: model.hpp:489

lbann::weights
Definition: weights/weights.hpp:100

lbann::model::m_metrics
std::vector< std::unique_ptr< metric > > m_metrics
Numerical quantities to evaluate model performance.
Definition: model.hpp:546

lbann::model::subCommunicatorsSubgrids
std::unordered_map< std::string, std::shared_ptr< El::mpi::Comm > > subCommunicatorsSubgrids
Definition: model.hpp:477

cereal
Definition: callback.hpp:56

lbann::model::m_default_optimizer_msg
std::unique_ptr< lbann_data::Optimizer > m_default_optimizer_msg
Definition: model.hpp:538

lbann::model::m_objective_function
std::unique_ptr< objective_function > m_objective_function
Mathematical function to be minimized during training.
Definition: model.hpp:541

lbann::model
Abstract base class for neural network models.
Definition: model.hpp:83

lbann::model::m_callbacks
std::vector< std::shared_ptr< callback_base > > m_callbacks
Current callbacks to process.
Definition: model.hpp:549

base.hpp

ExecutionContext
The execution context for a KFAC algorithm.

lbann::OwningWeightsPtr
std::shared_ptr< weights > OwningWeightsPtr
Smart pointer to manage ownership of a weights object.
Definition: model.hpp:77

lbann::ViewingWeightsPtr
std::weak_ptr< weights > ViewingWeightsPtr
Smart pointer to reference a weights object.
Definition: layer.hpp:89

thread_pool.hpp

lbann::observer_ptr
typename std::add_pointer< T >::type observer_ptr
Creating an observer_ptr to complement the unique_ptr and shared_ptr.
Definition: base.hpp:54

lbann::execution_mode
execution_mode
Neural network execution mode.
Definition: base.hpp:229

lbann::model::m_layers
std::vector< OwningLayerPtr > m_layers
Tensor operations.
Definition: model.hpp:528

lbann_data
Definition: callback.hpp:42

lbann::model::m_name
std::string m_name
Model instance's name.
Definition: model.hpp:523

lbann::ExecutionContext
Definition: execution_context.hpp:47

lbann::model::get_execution_context
ExecutionContext const  & get_execution_context() const
Definition: model.hpp:659

lbann::model::grids
std::unordered_map< std::string, std::shared_ptr< El::Grid > > grids
Definition: model.hpp:474

lbann::force
auto force(El::MultiSync< D, Ds... > const &x) -> El::SyncInfo< D > const &
Force the MultiSync to the master SyncInfo.
Definition: sync_info_helpers.hpp:67

lbann::model::m_weights
std::vector< OwningWeightsPtr > m_weights
Trainable parameters.
Definition: model.hpp:531

lbann::model::num_resources_non_branch_layers
int num_resources_non_branch_layers
Definition: model.hpp:517

LBANN_WARNING
#define LBANN_WARNING(...)
Definition: exception.hpp:53

factories.hpp

lbann::model::m_execution_context
observer_ptr< ExecutionContext > m_execution_context
Definition: model.hpp:486

lbann::model::num_resources_branch_layers
int num_resources_branch_layers
Definition: model.hpp:514

lbann::lbann_summary
Definition: utils/summary.hpp:262

summary.hpp

lbann::OwningLayerPtr
std::shared_ptr< Layer > OwningLayerPtr
Smart pointer to manage ownership of a layer object.
Definition: layer.hpp:125

lbann
Definition: callback_helpers.hpp:32

lbann::persist
Definition: persist.hpp:72

lbann::model::m_max_mini_batch_size
El::Int m_max_mini_batch_size
The maximum mini-batch size.
Definition: model.hpp:602