27 #ifndef LBANN_LAYERS_LAYER_HPP_INCLUDED 28 #define LBANN_LAYERS_LAYER_HPP_INCLUDED 36 #include <onnx/onnx_pb.h> 41 #define LBANN_DEFINE_LAYER_BUILDER(LAYER_NAME) \ 42 template <typename TensorDataType, data_layout Layout, El::Device Device> \ 43 std::unique_ptr<Layer> build_##LAYER_NAME##_layer_from_pbuf( \ 45 lbann_data::Layer const&) 50 #define LBANN_LAYER_DEFAULT_BUILDER(LAYER_NAME) \ 51 template <typename TensorDataType, data_layout Layout, El::Device Device> \ 52 std::unique_ptr<Layer> build_##LAYER_NAME##_layer_from_pbuf( \ 54 lbann_data::Layer const&) \ 56 using LayerType = LAYER_NAME##_layer<TensorDataType, Layout, Device>; \ 57 return std::make_unique<LayerType>(comm); \ 63 #define LBANN_LAYER_BUILDER_ETI(LAYER_NAME, T, Device) \ 64 template std::unique_ptr<Layer> \ 65 build_##LAYER_NAME##_layer_from_pbuf<T, \ 66 ::lbann::data_layout::DATA_PARALLEL, \ 67 Device>(lbann_comm*, \ 68 lbann_data::Layer const&); \ 69 template std::unique_ptr<Layer> \ 70 build_##LAYER_NAME##_layer_from_pbuf<T, \ 71 ::lbann::data_layout::MODEL_PARALLEL, \ 72 Device>(lbann_comm*, \ 73 lbann_data::Layer const&) 90 #ifdef LBANN_HAS_DISTCONV 92 #endif // LBANN_HAS_DISTCONV 97 template <hydrogen::Device Device>
99 template <hydrogen::Device Device>
101 template <hydrogen::Device Device>
103 template <hydrogen::Device Device>
139 int sample_groups = 0;
141 int sample_splits = 0;
143 int depth_groups = 0;
145 int depth_splits = 0;
147 int height_groups = 0;
149 int height_splits = 0;
151 int width_groups = 0;
153 int width_splits = 0;
155 int channel_groups = 0;
157 int channel_splits = 0;
159 int filter_groups = 0;
161 int filter_splits = 0;
163 int replications = 0;
165 bool enable_subgraph =
false;
167 int sub_branch_tag = 0;
169 int sub_branch_resource_percentage = 0;
184 sub_branch_resource_percentage ==
209 os <<
"Axis over which DistConv can parallelize:\n" 210 <<
"\tSamples in the mini-batch (N)\n" 211 <<
"\tDepth, Height, and Width (D x H x W)\n" 214 <<
"\tReplications (R): Number of times the layer is replicated (for FC " 215 "layers right now)\n" 216 <<
"\tBranch number in the subgraph (T)\n" 217 <<
"\tPercentage of parent resources to be allocated to this branch (%)\n" 218 <<
"\tEnable subgraph for the layer (e)\n" 219 <<
"\nFor each of the above dimensions there are two fields:\n" 220 <<
"\t# Groups (G): refers to how many reduced-order tensors exist with " 221 "respect to that dimension" 223 <<
"\t e.g. For a kD tensor you would have a stack of G " 226 <<
"\t\t[N, C, D, H, W]" << std::endl
227 <<
"\t\t[2, 1, 4, 1, 1] ---" << std::endl
228 <<
"\t\t |" << std::endl
229 <<
"\t\t V" << std::endl
230 <<
"\t\t 4 Depth groups: [N, C, H, W]" << std::endl
231 <<
"\t\t [2, 1, 1, 1]" << std::endl
232 <<
"\t\t [2, 1, 1, 1]" << std::endl
233 <<
"\t\t [2, 1, 1, 1]" << std::endl
234 <<
"\t\t [2, 1, 1, 1]" << std::endl
235 <<
"\t\t[1, 1, 4, 1, 2] ---" << std::endl
236 <<
"\t\t |" << std::endl
237 <<
"\t\t V" << std::endl
238 <<
"\t\t 2 Sample groups: [C, D, H, W]" << std::endl
239 <<
"\t\t [1, 4, 1, 1]" << std::endl
240 <<
"\t\t [1, 4, 1, 1]" << std::endl
241 <<
"\n\tSplit per Dimension (S): Number of groups the dimension is split " 242 "over (i.e. split K times) (aka H2 split shape) (must divide groups " 246 os <<
"Reporting order for the parallel strategy" << std::endl;
289 template <hydrogen::Device Device>
291 template <hydrogen::Device Device>
293 template <hydrogen::Device Device>
295 template <hydrogen::Device Device>
302 virtual ~
Layer() =
default;
309 virtual Layer* copy()
const = 0;
319 void set_name(
const std::string name) { m_name = name; }
347 virtual std::string get_type()
const = 0;
350 virtual std::string get_datatype_name()
const = 0;
368 virtual El::Device get_device_allocation()
const = 0;
375 return m_expected_num_parent_layers;
383 return m_expected_num_child_layers;
397 return m_parallel_strategy;
402 return m_parallel_strategy;
// Reports whether this layer's device allocation is the GPU: compares the
// result of get_device_allocation() against El::Device::GPU.
// NOTE(review): the next line of the file closes an LBANN_HAS_GPU
// preprocessor conditional, so part of this definition may be conditionally
// compiled -- confirm against the complete header.
415 bool using_gpus()
const {
return get_device_allocation() == El::Device::GPU; }
418 #endif // LBANN_HAS_GPU 428 virtual void forward_prop() = 0;
450 virtual void setup(
size_t max_mini_batch_size,
451 const std::vector<El::Grid*>& grids);
454 virtual void check_setup();
459 void write_proto(lbann_data::Layer& proto)
const;
471 virtual void summarize_matrices(
lbann_summary& summarizer,
int step) = 0;
474 void reset_counters();
483 subgraph_communication_method = type;
489 return subgraph_communication_method;
495 m_num_spliting_groups = spliting_groups;
511 m_interSubGridVCComm = std::move(mpi_comm);
517 m_subgraph_parallelism_execution =
true;
524 return m_subgraph_parallelism_execution;
533 return run_layer_in_subgraph;
539 virtual void write_specific_proto(lbann_data::Layer& proto)
const = 0;
542 #ifdef LBANN_HAS_ONNX 548 virtual void fill_onnx_node(onnx::GraphProto& graph)
const;
557 virtual std::string get_onnx_op_type()
const;
558 #endif // LBANN_HAS_ONNX 564 const Layer& get_parent_layer(
size_t index = 0)
const;
565 const Layer& get_child_layer(
size_t index = 0)
const;
567 std::vector<const Layer*> get_parent_layers()
const;
568 std::vector<const Layer*> get_child_layers()
const;
570 size_t find_parent_layer_index(
const Layer& l)
const;
571 size_t find_child_layer_index(
const Layer& l)
const;
607 virtual std::vector<ViewingLayerPtr> get_layer_pointers();
612 virtual void set_layer_pointers(std::vector<ViewingLayerPtr> layers);
619 std::vector<ViewingWeightsPtr> get_weights_pointers()
const;
621 void set_weights_pointers(std::vector<ViewingWeightsPtr> ptrs);
624 void replace_weights(
Layer const& other_layer);
633 virtual const BaseDistMat& get_error_signals(
const Layer& parent)
const = 0;
640 std::vector<int> get_input_dims(
size_t input_index = 0)
const;
642 int get_input_size(
size_t input_index = 0)
const;
644 std::vector<int> get_output_dims(
size_t output_index = 0)
const;
646 int get_output_size(
size_t output_index = 0)
const;
649 void set_output_dims(std::vector<int> dims,
size_t output_index = 0);
651 El::Int infer_mini_batch_size_from_parents()
const;
652 virtual El::Int current_output_mini_batch_size()
const = 0;
654 infer_mini_batch_size_from_parents_or_default_to_current()
const = 0;
662 int get_grid_tag()
const noexcept;
664 void set_grid_tag(
int tag);
678 const Layer* get_hint_layer()
const;
686 bool is_frozen()
const;
695 virtual void set_keep_error_signals(
bool) = 0;
707 template <
typename ArchiveT>
725 m_weights.emplace_back(std::move(w));
731 return ((idx < m_weights.size()) && (!m_weights[idx].expired()));
736 m_weights.at(idx) = std::move(w);
738 weights const& get_weights(
size_t idx)
const;
740 weights& get_weights(
size_t idx);
742 void add_as_gradient_source();
744 void remove_as_gradient_source();
758 virtual void setup_pointers();
764 virtual void setup_dims();
773 virtual void setup_matrices(
const std::vector<El::Grid*>& grids) = 0;
793 virtual void fp_setup_inputs() = 0;
798 virtual void fp_setup_outputs() = 0;
803 virtual void fp_compute() = 0;
813 virtual void bp_setup_gradient_wrt_inputs() = 0;
838 int m_expected_num_parent_layers = 1;
842 int m_expected_num_child_layers = 1;
872 bool m_runs_inplace =
false;
900 bool m_subgraph_parallelism_execution =
false;
902 bool run_layer_in_subgraph =
false;
907 El::Int m_num_spliting_groups = 1;
926 friend void attempt_move_error_signal(
Layer& parent,
928 std::unique_ptr<BaseDistMat> signal);
929 friend void attempt_view_error_signal(
Layer& parent,
932 friend void deep_copy_error_signal(
Layer& parent,
937 virtual void back_prop_impl_() = 0;
945 virtual void allocate_new_gradients_() = 0;
954 virtual void propagate_error_signals_to_parents_() = 0;
966 virtual void clear_prev_error_signals_() = 0;
977 virtual void move_or_copy_prev_error_signal_(
979 std::unique_ptr<El::BaseDistMatrix> signal) = 0;
991 view_or_copy_prev_error_signal_(
const Layer& child,
992 const El::BaseDistMatrix& signal) = 0;
1001 deep_copy_prev_error_signal_(
const Layer& child,
1002 const El::BaseDistMatrix& signal) = 0;
1038 #ifdef LBANN_HAS_DISTCONV 1044 bool distconv_enabled()
const;
1046 virtual bool keep_original_inputs(
int index)
const;
1048 virtual bool keep_original_outputs(
int index)
const;
1051 virtual bool keep_original_gradient_wrt_inputs(
int index)
const;
1054 virtual bool keep_original_gradient_wrt_outputs(
int index)
const;
1062 virtual bool is_distconv_supported()
const {
return false; }
1064 void prepare_distconv();
1065 virtual void setup_distconv_adapter() = 0;
1066 std::unique_ptr<distconv_adapter>& get_distconv_adapter_ptr()
1070 const std::unique_ptr<distconv_adapter>& get_distconv_adapter_ptr()
const 1076 mutable bool m_distconv_enabled =
false;
1077 mutable bool m_distconv_enabled_set =
false;
1078 std::unique_ptr<distconv_adapter> m_dc;
1084 #endif // LBANN_HAS_DISTCONV 1089 #endif // LBANN_LAYERS_LAYER_HPP_INCLUDED
bool distconv_enabled() const
Indicate whether distconv is enabled.
int sub_branch_resource_percentage
model * get_model() const noexcept
Get a reference to the model that manages this layer.
ParallelStrategy & get_parallel_strategy() noexcept
Get the parallel strategy for the layer.
EvalType m_fp_time
Time spent in forward propagation.
virtual int get_backprop_requirements() const
Returns the necessary tensors for computing backpropagation.
virtual void setup_data(size_t max_mini_batch_size)
Setup layer data. Called by the 'setup' function. Memory is allocated for distributed matrices...
EvalType m_update_time
Time spent in updates.
int get_expected_num_child_layers() const noexcept
Get expected number of child layers. A negative value indicates no limit.
std::unique_ptr< std::set< int > > m_subgrid_ranks
std::weak_ptr< Layer > ViewingLayerPtr
Smart pointer to reference a layer object.
bool operator==(const ParallelStrategy &ps) const
int get_num_parents() const noexcept
Get number of parent layers.
bool runs_inplace() const
If true, the layer will run in-place (the input and output activations point to the same tensor)...
void clear_parent_layers()
Remove pointers to parent layers.
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.
Neural network tensor operation.
Generates nicely formatted description messages.
int get_expected_num_parent_layers() const noexcept
Get expected number of parent layers. A negative value indicates no limit.
void set_model(model *m)
Set the model that manages this layer.
void set_num_spliting_groups(El::Int spliting_groups)
constexpr El::Device Device
bool m_frozen
Avoid back prop if frozen.
An implementation of the KFAC second-order optimization algorithm.
std::ostream & operator<<(std::ostream &os, lbann::utils::argument_parser< ErrorHandler > const &parser)
Write the parser's help string to the given ostream.
std::shared_ptr< El::mpi::Comm > m_interSubGridVCComm
bool get_run_layer_in_subgraph() const noexcept
void set_run_layer_in_subgraph()
std::shared_ptr< El::Grid > get_mygrid() const
EvalType m_fp_compute_time
Time spent in the forward propagation computation.
size_t num_weights() const noexcept
Abstract base class for neural network models.
virtual void setup_gpu()
Setup GPU objects. Called by the 'setup' function if the layer is on GPUs.
std::vector< ViewingLayerPtr > m_parent_layers
References to parent layers.
bool has_weights() const noexcept
int get_num_children() const noexcept
Get number of child layers.
std::weak_ptr< weights > ViewingWeightsPtr
Smart pointer to reference a weights object.
void reset_inter_subgrid_vc_comm(std::shared_ptr< El::mpi::Comm > mpi_comm)
ViewingLayerPtr m_hint_layer
Hint layer. During setup, the output tensor dimensions are set to match the first output tensor of th...
void set_subgraph_parallelism_execution()
El::Int get_num_spliting_groups() const
ParallelStrategy const & get_parallel_strategy() const noexcept
Get the parallel strategy for the layer.
std::string get_name() const
Get the layer instance's name.
void set_num_weights(size_t n)
std::ostream & print_parallel_strategy_header(std::ostream &os)
El::BaseDistMatrix BaseDistMat
bool operator!=(const ParallelStrategy &ps) const
data_layout
Data layout that is optimized for different modes of parallelism.
virtual bool update_compute()
Perform the computation for the update step. Returns false if the layer must reset for a new training...
ParallelStrategy m_parallel_strategy
Parallel strategy for the layer.
virtual void bp_compute()
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
void clear_child_layers()
Remove pointers to child layers.
EvalType m_bp_compute_time
Time spent in the backward propagation computation.
void set_weights(size_t idx, ViewingWeightsPtr w)
SubGraphCommunication get_communication_flag()
bool using_gpus() const noexcept
Whether the layer is using a GPU implementation.
EvalType m_bp_time
Time spent in backward propagation.
std::vector< ViewingWeightsPtr > m_weights
References to layer weights.
void set_communication_flag(SubGraphCommunication type)
std::shared_ptr< Layer > OwningLayerPtr
Smart pointer to manage ownership of a layer object.
bool has_weights(size_t idx) const noexcept
void set_name(const std::string name)
Set the layer instance's name. Each layer in a model should have a unique, preferably human-readable...
bool subgraph_parallelism_execution() const noexcept
std::string m_name
Layer instance's name. Each layer in a model should have a unique, preferably human-readable, name.
std::vector< std::vector< int > > m_output_dims_list
Dimensions of output tensors.
virtual bool can_run_inplace() const
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
std::vector< ViewingLayerPtr > m_child_layers
References to child layers.