LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
sum.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_LAYER_SUM_HPP_INCLUDED
28 #define LBANN_LAYER_SUM_HPP_INCLUDED
29 
32 #include "lbann/proto/lbann.pb.h"
33 #include "lbann/utils/distconv.hpp"
35 
36 namespace lbann {
37 
38 #ifdef LBANN_HAS_DISTCONV
// Distconv adapter used by sum_layer. It derives from
// data_type_distconv_adapter<TensorDataType> and sits inside the
// LBANN_HAS_DISTCONV guard, so it only exists in distconv-enabled builds.
39 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
40 class sum_distconv_adapter : public data_type_distconv_adapter<TensorDataType>
41 {
42 public:
// NOTE(review): the right-hand side of this alias (listing line 44) is absent
// from this listing; confirm it against the original header.
43  using TensorDevType =
// Forwards the owning layer to the base adapter; no additional state is set up.
45  sum_distconv_adapter(Layer& layer)
46  : data_type_distconv_adapter<TensorDataType>(layer)
47  {}
48  virtual ~sum_distconv_adapter() = default;
// Creates the error-signal tensor for the parent at `index`; the definition
// appears further down in this file.
49  std::unique_ptr<TensorDevType>
50  setup_error_signals_i(int index) const override;
// Forward-prop entry point; sum_layer::fp_compute calls it when distconv is
// enabled. Its definition is not visible in this listing.
51  void fp_compute();
52 };
53 #endif // LBANN_HAS_DISTCONV
54 
56 template <typename TensorDataType,
58  El::Device Dev = El::Device::CPU>
59 class sum_layer : public data_type_layer<TensorDataType>
60 {
61 public:
// Constructs the layer on the communicator `comm`, which is forwarded to
// data_type_layer. The expected parent count is set to -1 so that any number
// of parent layers is accepted.
62  sum_layer(lbann_comm* comm) : data_type_layer<TensorDataType>(comm)
63  {
64  this->m_expected_num_parent_layers = -1; // No limit on parents
65  }
66 
// Returns a heap-allocated duplicate of this layer made with the copy
// constructor. The raw pointer comes straight from `new`; presumably the
// caller takes ownership -- confirm against the base-class contract.
67  sum_layer* copy() const override { return new sum_layer(*this); }
68 
70 
// Archive-based serialization hook. Only the declaration is present here; the
// definition is not visible in this listing.
72  template <typename ArchiveT>
73  void serialize(ArchiveT& ar);
74 
76 
// Layer type name, also used as the prefix of this layer's error messages.
77  std::string get_type() const override { return "sum"; }
// Data layout and device are fixed by the template parameters.
78  data_layout get_data_layout() const override { return T_layout; }
79  El::Device get_device_allocation() const override { return Dev; }
// The layer reports that it may run in place.
80  bool can_run_inplace() const override { return true; }
// Backprop only requires the error signals.
81  int get_backprop_requirements() const override { return ERROR_SIGNALS; }
82 
83 protected:
// Writes the layer-specific protobuf fields; defined after the class body.
85  void write_specific_proto(lbann_data::Layer& proto) const final;
86 
// Default-constructed sync info passed to the COLL_OPT subgrid
// allreduce/broadcast calls in forward and backward prop.
87  El::SyncInfo<Dev> syncSubGridCommunication = El::SyncInfo<Dev>();
88 
// cereal is granted access so it can use the protected default constructor,
// which delegates to the communicator constructor with a null communicator.
89  friend class cereal::access;
90  sum_layer() : sum_layer(nullptr) {}
91 
// Validates the layer's connectivity: a sum layer must have at least one
// parent, otherwise LBANN_ERROR is raised with the layer type and name.
// NOTE(review): listing line 94 (the first statement of the body) is absent
// from this listing; confirm whether the base-class setup_pointers() is
// invoked there.
92  void setup_pointers() override
93  {
95  if (this->get_num_parents() < 1) {
96  std::stringstream err;
97  err << get_type() << " layer \"" << this->get_name() << "\" "
98  << "has no parent layers";
99  LBANN_ERROR(err.str());
100  }
101  }
102 
// Sets the output dimensions to the dimensions of the first input and then
// verifies that every parent provides a tensor with exactly those dimensions.
// On a mismatch, LBANN_ERROR is raised with a message listing each parent's
// name and its output dimensions.
// NOTE(review): listing line 105 (the first statement of the body) is absent
// from this listing; confirm whether the base-class setup_dims() is invoked
// there.
103  void setup_dims() override
104  {
106  this->set_output_dims(this->get_input_dims());
107 
108  // Check that input dimensions match
109  const auto& output_dims = this->get_output_dims();
110  for (int i = 0; i < this->get_num_parents(); ++i) {
111  if (this->get_input_dims(i) != output_dims) {
// Build the full diagnostic only once a mismatch has been found.
112  const auto& parents = this->get_parent_layers();
113  std::stringstream err;
114  err << get_type() << " layer \"" << this->get_name() << "\" "
115  << "has input tensors with incompatible dimensions (";
116  for (int j = 0; j < this->get_num_parents(); ++j) {
117  const auto& dims = this->get_input_dims(j);
118  err << (j > 0 ? ", " : "") << "layer \"" << parents[j]->get_name()
119  << "\" outputs ";
// Dimensions are printed as "d0 x d1 x ...".
120  for (size_t k = 0; k < dims.size(); ++k) {
121  err << (k > 0 ? " x " : "") << dims[k];
122  }
123  }
124  err << ")";
125  LBANN_ERROR(err.str());
126  }
127  }
128  }
129 
130  void fp_compute() override
131  {
132 #ifdef LBANN_HAS_DISTCONV
133  if (this->distconv_enabled()) {
134  get_distconv_adapter().fp_compute();
135  return;
136  }
137 #endif // LBANN_HAS_DISTCONV
138  auto& output = this->get_activations();
139  const auto& parents = this->get_parent_layers();
140 
141  if (this->subgraph_parallelism_execution()) {
142  int tag = 0;
143 
144  std::vector<bool> is_initialized_tensor(this->m_num_spliting_groups,
145  false);
146 
147  // Copy data internally with same branch tag
148  for (int i = 0; i < this->get_num_parents(); ++i) {
149  tag = parents[i]->get_grid_tag() - 1;
150 
151  if (is_initialized_tensor[tag]) {
152 
153  if (this->get_prev_activations(i).Participating()) {
154  El::Axpy(DataType(1),
155  this->get_prev_activations(i),
156  this->get_branch_tag_input(tag));
157  }
158  }
159  else {
160  if (this->get_prev_activations(i).Participating()) {
161  El::Copy(this->get_prev_activations(i),
162  this->get_branch_tag_input(tag));
163  is_initialized_tensor[tag] = true;
164  }
165  }
166  }
167 
168  // copy and add data from reduced gradients from same branch
169 
170  if (this->get_communication_flag() == COLL_OPT)
171  // If vector is enabled copy data using allreduce operation from
172  // aggregated subgrids to the output
173  {
174  auto* ptr_output = dynamic_cast<
175  El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>*>(
176  &output);
177 
178  El::copy::TranslateBetweenGridsAllreduce<TensorDataType, Dev, Dev>(
179  *ptr_output,
180  this->get_branch_tag_input_vector(),
181  this->get_subgrid_comm(),
182  syncSubGridCommunication,
183  1);
184  }
185  else if (this->get_communication_flag() == COLL) {
186  auto* ptr_output = dynamic_cast<
187  El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>*>(
188  &output);
189 
190  El::copy::TranslateBetweenGridsAllreduce<TensorDataType, Dev, Dev>(
191  *ptr_output,
192  this->get_branch_tag_input_vector());
193  }
194  else {
195  if (this->get_num_parents() > 0) {
196  El::Copy(this->get_branch_tag_input(0), output);
197  }
198  else {
199  El::Zero(output);
200  }
201 
202  for (int i = 1; i < this->m_num_spliting_groups; i++) {
203 
204  El::Copy(this->get_branch_tag_input(i), this->get_temp_grad());
205  El::Axpy(DataType(1), this->get_temp_grad(), output);
206  }
207  }
208  } // if subgraph parallelism is enabled
209  else {
210  El::Copy(this->get_prev_activations(0), output);
211  for (int i = 1; i < this->get_num_parents(); ++i) {
212  El::Axpy(DataType(1), this->get_prev_activations(i), output);
213  }
214  }
215  }
216 
217  void fp_setup_outputs() override
218  {
219 
220  if (this->get_num_children() < 1) {
221  return;
222  }
223  auto mini_batch_size =
224  this->infer_mini_batch_size_from_parents_or_default_to_current();
225  // Determine distributed matrix alignment
226  const bool align_outputs = this->get_num_parents() > 0;
227  const auto& alignment_dist =
228  (align_outputs ? this->get_prev_activations().DistData()
229  : this->get_activations().DistData());
230 
231  // Initialize output tensors
232  for (int i = 0; i < this->get_num_children(); ++i) {
233 #ifdef LBANN_HAS_DISTCONV
234  if (!this->keep_original_outputs(i))
235  continue;
236 #endif // LBANN_HAS_DISTCONV
237 
238  auto& output = this->get_activations(i);
239  output.Empty(false);
240  if (align_outputs && this->subgraph_parallelism_execution() == false) {
241  output.AlignWith(alignment_dist);
242  }
243  output.Resize(this->get_output_size(i), mini_batch_size);
244  }
245  }
246 
248  {
249  int tag = 0;
250  const auto& parents = this->get_parent_layers();
251  const auto& gradient_wrt_output = this->get_prev_error_signals();
252 
253  if (this->subgraph_parallelism_execution()) {
254 
255  if (this->get_communication_flag() == COLL_OPT)
256  // If vector copy is enable, broadcast the gradients from parent grid to
257  // multiple subgrids
258  {
259  auto const* ptr_gradient =
260  dynamic_cast<El::DistMatrix<TensorDataType,
261  El::STAR,
262  El::VC,
263  El::ELEMENT,
264  Dev> const*>(&gradient_wrt_output);
265  El::copy::TranslateBetweenGridsBroadcast<TensorDataType, Dev, Dev>(
266  *ptr_gradient,
267  this->get_branch_tag_input_vector(),
268  this->get_subgrid_comm(),
269  syncSubGridCommunication);
270  }
271  else if (this->get_communication_flag() == COLL) {
272  auto const* ptr_gradient =
273  dynamic_cast<El::DistMatrix<TensorDataType,
274  El::STAR,
275  El::VC,
276  El::ELEMENT,
277  Dev> const*>(&gradient_wrt_output);
278  El::copy::TranslateBetweenGridsBroadcast<TensorDataType, Dev, Dev>(
279  *ptr_gradient,
280  this->get_branch_tag_input_vector());
281  }
282  else {
283  for (int i = 0; i < this->m_num_spliting_groups; i++) {
284 
285  El::Copy(gradient_wrt_output, this->get_branch_tag_input(i));
286  }
287 
288  } // end vector copy condition
289 
290  for (int i = 0; i < this->get_num_parents(); ++i) {
291  tag = parents[i]->get_grid_tag() - 1;
292 
293  El::LockedView(this->get_error_signals(i),
294  this->get_branch_tag_input(tag));
295  }
296  }
297  else {
298  for (int i = 0; i < this->get_num_parents(); ++i) {
299 
300  El::LockedView(this->get_error_signals(i), gradient_wrt_output);
301  }
302  }
303  }
304 
// Intentionally empty: this layer performs no computation in bp_compute.
// The error signals are set up as locked views elsewhere in this class.
305  void bp_compute() override {}
306 
307 #ifdef LBANN_HAS_DISTCONV
// The adapter is a friend so that it can reach this layer's non-public members.
308  friend class sum_distconv_adapter<TensorDataType, T_layout, Dev>;
309 
310 protected:
// Distconv is only supported for the GPU, data-parallel instantiation.
311  bool is_distconv_supported() const override
312  {
313  return Dev == El::Device::GPU && T_layout == data_layout::DATA_PARALLEL;
314  }
// Installs a freshly constructed sum_distconv_adapter bound to this layer.
315  void setup_distconv_adapter() override
316  {
317  this->get_distconv_adapter_ptr() =
318  std::make_unique<sum_distconv_adapter<TensorDataType, T_layout, Dev>>(
319  *this);
320  }
// Typed accessors for the adapter, with a covariant return type; the
// definitions follow the class body.
321  sum_distconv_adapter<TensorDataType, T_layout, Dev>&
322  get_distconv_adapter() override;
323  const sum_distconv_adapter<TensorDataType, T_layout, Dev>&
324  get_distconv_adapter() const override;
325 #endif // LBANN_HAS_DISTCONV
326 };
327 
// Fills the layer-specific part of the protobuf message: records the tensor
// data type and touches the `sum` field via its mutable accessor, which
// presumably marks it as set -- no further options are written.
328 template <typename T, data_layout L, El::Device D>
329 void sum_layer<T, L, D>::write_specific_proto(lbann_data::Layer& proto) const
330 {
331  proto.set_datatype(proto::ProtoDataType<T>);
332  proto.mutable_sum();
333 }
334 
335 #ifdef LBANN_HAS_DISTCONV
// Non-const accessor: calls the const overload on `*this` and casts away the
// constness of the result, so the lookup logic lives in one place.
// NOTE(review): the declarator line (listing line 338) is absent from this
// listing; confirm against the original header.
336 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
337 sum_distconv_adapter<TensorDataType, T_layout, Dev>&
339 {
340  return const_cast<sum_distconv_adapter<TensorDataType, T_layout, Dev>&>(
341  static_cast<const sum_layer<TensorDataType, T_layout, Dev>&>(*this)
342  .get_distconv_adapter());
343 }
344 
// Const accessor: downcasts an adapter reference to the sum-specific adapter
// type. dynamic_cast on a reference throws std::bad_cast on a type mismatch.
// NOTE(review): the declarator line (listing line 347) and the cast operand
// (listing line 351) are absent from this listing; confirm against the
// original header.
345 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
346 const sum_distconv_adapter<TensorDataType, T_layout, Dev>&
348 {
349  return dynamic_cast<
350  const sum_distconv_adapter<TensorDataType, T_layout, Dev>&>(
352 }
353 
354 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
355 std::unique_ptr<
356  typename sum_distconv_adapter<TensorDataType, T_layout, Dev>::TensorDevType>
357 sum_distconv_adapter<TensorDataType, T_layout, Dev>::setup_error_signals_i(
358  int index) const
359 {
360  return std::make_unique<TensorDevType>(this->get_prev_error_signals(0));
361 }
362 #endif // LBANN_HAS_DISTCONV
363 
// Unless LBANN_SUM_LAYER_INSTANTIATE is defined, the PROTO_DEVICE macros below
// spell out `extern template` declarations for sum_layer (and, in distconv
// builds, sum_distconv_adapter) in both data layouts.
364 #ifndef LBANN_SUM_LAYER_INSTANTIATE
365 #define PROTO_DEVICE(T, Device) \
366  extern template class sum_layer<T, data_layout::DATA_PARALLEL, Device>; \
367  extern template class sum_layer<T, data_layout::MODEL_PARALLEL, Device>
368 
// NOTE(review): listing line 369, which presumably expands PROTO_DEVICE for
// the supported types and devices, is absent from this listing.
370 #undef PROTO_DEVICE
371 #ifdef LBANN_HAS_DISTCONV
372 #define PROTO_DEVICE(T, Device) \
373  extern template class sum_distconv_adapter<T, \
374  data_layout::DATA_PARALLEL, \
375  Device>; \
376  extern template class sum_distconv_adapter<T, \
377  data_layout::MODEL_PARALLEL, \
378  Device>
379 
// NOTE(review): listing line 380, presumably the matching expansion for the
// adapter, is absent from this listing.
381 #undef PROTO_DEVICE
382 #endif // LBANN_HAS_DISTCONV
383 #endif // LBANN_SUM_LAYER_INSTANTIATE
384 
385 } // namespace lbann
386 
387 #endif // LBANN_LAYER_SUM_HPP_INCLUDED
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
Definition: sum.hpp:81
#define LBANN_ERROR(...)
Definition: exception.hpp:37
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.
void setup_pointers() override
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
Definition: sum.hpp:92
constexpr El::Device Device
Add multiple tensors.
Definition: sum.hpp:59
void fp_setup_outputs() override
Setup output tensors. Called by the 'forward_prop' function. Each output tensor is resized to match t...
Definition: sum.hpp:217
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
Definition: sum.hpp:80
void write_specific_proto(lbann_data::Layer &proto) const final
Definition: sum.hpp:329
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
Definition: sum.hpp:78
data_layout
Data layout that is optimized for different modes of parallelism.
Definition: base.hpp:218
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
Definition: sum.hpp:305
sum_layer(lbann_comm *comm)
Definition: sum.hpp:62
void bp_setup_gradient_wrt_inputs() override
Setup gradient w.r.t. input tensors. Called by the 'back_prop' function. Each gradient w...
Definition: sum.hpp:247
sum_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
Definition: sum.hpp:67
virtual void setup_pointers()
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
std::string get_type() const override
Get the layer type's name.
Definition: sum.hpp:77
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
Definition: sum.hpp:103
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
Definition: sum.hpp:130
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
Definition: sum.hpp:79
dc::TensorDev< OutputTensorDataType > TensorDevType