LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
cross_grid_sum_slice.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED
28 #define LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED
29 
32 
33 namespace lbann {
34 
35 template <typename TensorDataType, El::Device Dev>
36 class cross_grid_sum_slice_layer : public data_type_layer<TensorDataType>
37 {
38 public:
40  : data_type_layer<TensorDataType>(comm)
41  {
42  this->m_expected_num_parent_layers = -1; // No limit on parents
43  this->m_expected_num_child_layers = -1; // No limit on children
44  }
45 
46  cross_grid_sum_slice_layer* copy() const override
47  {
48  return new cross_grid_sum_slice_layer(*this);
49  }
50  std::string get_type() const override { return "cross_grid_sum_slice"; }
51  data_layout get_data_layout() const override
52  {
54  }
55  El::Device get_device_allocation() const override { return Dev; }
56  bool can_run_inplace() const override { return false; }
57  int get_backprop_requirements() const override { return ERROR_SIGNALS; }
58 
59 protected:
61  void write_specific_proto(lbann_data::Layer& proto) const final;
62 
63  El::SyncInfo<Dev> syncSubGridCommunication = El::SyncInfo<Dev>();
64 
65  void setup_pointers() override
66  {
68  if (this->get_num_parents() < 1) {
70  " layer \"",
71  this->get_name(),
72  "\" has no parent layers");
73  }
74  }
75 
76  void setup_dims() override
77  {
79 
80  // Slice along last dimension
81  int subgridCommSize = El::mpi::Size(this->get_subgrid_comm());
82  const auto input_dims = this->get_input_dims();
83  std::vector<int> output_dims_slice(input_dims);
84  output_dims_slice.back() = int(output_dims_slice.back() / subgridCommSize);
85 
86  for (int i = 0; i < this->get_num_children(); ++i)
87  this->set_output_dims(output_dims_slice, i);
88  }
89 
90  void fp_compute() override
91  {
92  auto const subgrid_comm_rank = El::mpi::Rank(this->get_subgrid_comm());
93  auto const subgrid_comm_size = El::mpi::Size(this->get_subgrid_comm());
94 
95  auto& output = this->get_activations(subgrid_comm_rank);
96  auto& input = this->get_prev_activations(subgrid_comm_rank);
97  // El::Copy(input, output);
98 
99  auto& output_cast = dynamic_cast<
100  El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>&>(
101  output);
102 
103  auto const sync_info_output =
104  El::SyncInfoFromMatrix(output_cast.LockedMatrix());
105 
106  const El::Int mloc = input.LocalHeight();
107  const El::Int nloc = input.LocalWidth();
108 
109  El::Matrix<TensorDataType, Dev> prev_allreduce(mloc, nloc),
110  after_allreduce(mloc, nloc);
111 
112  El::Copy(input.LockedMatrix(), prev_allreduce);
113 
114  El::mpi::AllReduce(prev_allreduce.Buffer(),
115  after_allreduce.Buffer(),
116  mloc * nloc,
117  El::mpi::SUM,
118  this->get_subgrid_comm(),
119  sync_info_output);
120 
121  const auto input_dims = this->get_input_dims();
122  int last_dim = input_dims.back();
123 
124  int last_dim_start_point = 1;
125  for (int i = 0; i < int(input_dims.size()) - 1; ++i) {
126  last_dim_start_point = last_dim_start_point * input_dims[i];
127  }
128 
129  if (last_dim % subgrid_comm_size != 0)
130  LBANN_ERROR("cross_grid_sum_slice layer: last dimension should be "
131  "divided by the number of branches in subgraph");
132 
133  int const last_dim_index =
134  int(last_dim / subgrid_comm_size) * subgrid_comm_rank;
135 
136  El::copy::util::InterleaveMatrix(
137  (last_dim / subgrid_comm_size),
138  input.LocalWidth() * last_dim_start_point,
139  after_allreduce.LockedBuffer(last_dim_index, 0),
140  1,
141  last_dim,
142  output_cast.Buffer(),
143  1,
144  (last_dim / subgrid_comm_size),
145  sync_info_output);
146  }
147 
148  void fp_setup_outputs() override
149  {
150 
151  if (this->get_num_children() < 1) {
152  return;
153  }
154  // Determine distributed matrix alignment
155 
156  auto mini_batch_size =
158 
159  // Initialize output tensors
160  for (int i = 0; i < this->get_num_children(); ++i) {
161  auto& output = this->get_activations(i);
162  output.Empty(false);
163  output.Resize(this->get_output_size(i), mini_batch_size);
164  }
165  }
166 
168  {
169  auto children = this->get_child_layers();
170  auto const subgrid_comm_rank = El::mpi::Rank(this->get_subgrid_comm());
171  auto const subgrid_comm_size = El::mpi::Size(this->get_subgrid_comm());
172  const auto input_dims = this->get_input_dims();
173  int last_dim = input_dims.back();
174 
175  auto& input_grad = this->get_error_signals(subgrid_comm_rank);
176  const auto& gradient_wrt_output =
177  this->get_prev_error_signals(subgrid_comm_rank);
178 
179  using MatrixType =
180  El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>;
181  auto& gradient_wrt_output_cast =
182  dynamic_cast<const MatrixType&>(gradient_wrt_output);
183 
184  auto& gradient_wrt_input_cast = dynamic_cast<MatrixType&>(input_grad);
185 
186  int mloc = gradient_wrt_output_cast.LocalHeight();
187  int nloc = gradient_wrt_output_cast.LocalWidth();
188  int per_grid_last_dim = last_dim / subgrid_comm_size;
189 
190  El::Matrix<TensorDataType, Dev> temp_input(nloc, mloc),
191  temp_output(nloc * (mloc / per_grid_last_dim), last_dim),
192  transposed(nloc * (mloc / per_grid_last_dim), per_grid_last_dim),
193  transposed_output(last_dim, nloc * (mloc / per_grid_last_dim));
194 
195  El::Copy(gradient_wrt_output_cast.LockedMatrix(), temp_input);
196  temp_input.Resize(per_grid_last_dim, nloc * (mloc / per_grid_last_dim));
197 
198  El::Transpose(temp_input, transposed);
199 
200  El::mpi::AllGather(transposed.Buffer(),
201  mloc * nloc,
202  temp_output.Buffer(),
203  mloc * nloc,
204  this->get_subgrid_comm(),
206 
207  El::Transpose(temp_output, transposed_output);
208  transposed_output.Resize(mloc * subgrid_comm_size, nloc);
209 
210  auto mini_batch_size =
212 
213  for (int i = 0; i < El::To<int>(children.size()); i++) {
214  auto& gradient_wrt_input_cast_layer =
215  dynamic_cast<MatrixType&>(this->get_error_signals(i));
216  gradient_wrt_input_cast_layer.Resize(this->get_input_size(),
217  mini_batch_size);
218  }
219  El::Copy(transposed_output, gradient_wrt_input_cast.Matrix());
220  }
221 
222  void bp_compute() final {}
223 };
224 
225 #ifndef LBANN_CROSS_GRID_SUM_SLICE_LAYER_INSTANTIATE
226 #define PROTO_DEVICE(T, Device) \
227  extern template class cross_grid_sum_slice_layer<T, Device>
228 
230 #undef PROTO_DEVICE
231 
232 #endif // LBANN_CROSS_GRID_SUM_SLICE_LAYER_INSTANTIATE
233 
234 } // namespace lbann
235 
236 #endif // LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
#define LBANN_ERROR(...)
Definition: exception.hpp:37
int get_output_size(size_t output_index=0) const
Get output tensor size.
int get_num_parents() const noexcept
Get number of parent layers.
Definition: layer.hpp:574
void bp_compute() final
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.
constexpr El::Device Device
OutputAbsDistMatrixType & get_prev_error_signals(int child_index=0)
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)
El::Int infer_mini_batch_size_from_parents_or_default_to_current() const override
const OutputAbsDistMatrixType & get_activations(const Layer &child) const override
int m_expected_num_child_layers
Expected number of child layers. A negative value indicates no limit.
Definition: layer.hpp:842
int get_input_size(size_t input_index=0) const
Get input tensor size.
void set_output_dims(std::vector< int > dims, size_t output_index=0)
Set output tensor dimensions.
std::string get_type() const override
Get the layer type's name.
int get_num_children() const noexcept
Get number of child layers.
Definition: layer.hpp:576
void write_specific_proto(lbann_data::Layer &proto) const final
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
void setup_pointers() override
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
std::string get_name() const
Get the layer instance's name.
Definition: layer.hpp:332
std::vector< const Layer * > get_child_layers() const
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
data_layout
Data layout that is optimized for different modes of parallelism.
Definition: base.hpp:218
void bp_setup_gradient_wrt_inputs() override
Setup gradient w.r.t. input tensors. Called by the 'back_prop' function. Each gradient w...
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
cross_grid_sum_slice_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
void fp_setup_outputs() override
Setup output tensors. Called by the 'forward_prop' function. Each output tensor is resized to match t...
virtual void setup_pointers()
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
int m_expected_num_parent_layers
Definition: layer.hpp:838
const InputAbsDistMatrixType & get_error_signals(const Layer &parent) const override
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.