LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
cross_entropy_impl.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "Licensee"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_LAYERS_LOSS_CROSS_ENTROPY_IMPL_HPP_INCLUDED
28 #define LBANN_LAYERS_LOSS_CROSS_ENTROPY_IMPL_HPP_INCLUDED
29 
32 
33 #ifdef LBANN_HAS_DISTCONV
35 #endif // LBANN_HAS_DISTCONV
36 
37 namespace lbann {
38 
39 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
41 {
43  this->set_output_dims({1});
44 
45 #ifdef LBANN_HAS_DISTCONV
46  // In the current implementation of cross entropy in Distconv, we
47  // do not use the reshape layer and just assumes both inputs have
48  // the matching shape. Therefore, the following check on the input
49  // dimensions would fail. We could address this by either 1)
50  // implementing the reshape layer, or 2) giving a proper shape to
51  // the ground-truth data.
52  //
53  if (this->distconv_enabled()) {
54  return;
55  }
56 #endif
57 
58  if (m_use_labels) {
59  const auto& parents = this->get_parent_layers();
60 
61  if (T_layout == data_layout::MODEL_PARALLEL) {
62  std::stringstream err;
63  err
64  << get_type() << " layer \"" << this->get_name() << "\" "
65  << "only supports use_labels is not supported in model parallel layout"
66  << " (for now)";
67  LBANN_ERROR(err.str());
68  }
69 
70  const auto& predictions_dims = this->get_input_dims(0);
71  const auto& labels_dims = this->get_input_dims(1);
72  // Check if the first dimension is 1 for the labels tensor
73  if (labels_dims[0] != 1) {
74  std::stringstream err;
75  err << get_type() << " layer \"" << this->get_name() << "\" "
76  << "expects the 0-th dimension of the tensor to be 1 when use labels "
77  << "is enabled. Found tensor with shape (";
78 
79  // TODO: Put this loop in util as it's used frequently to
80  // print layer dimensions
81  for (size_t j = 0; j < labels_dims.size(); ++j) {
82  err << (j > 0 ? " x " : "") << labels_dims[j];
83  }
84  err << ")";
85  LBANN_ERROR(err.str());
86  }
87 
88  // Check if the number of dimensions match for predictions and labels
89  // tensors
90 
91  if (predictions_dims.size() != labels_dims.size() ||
92  predictions_dims.size() < 2) {
93  std::stringstream err;
94  err << get_type() << " layer \"" << this->get_name() << "\" "
95  << "expects both input tensors to have the same number of dimensions "
96  << "and have >2 dimensions when use_labels is enabled. "
97  << "Found tensors with shape (";
98 
99  // TODO: Put this loop in util as it's used frequently to
100  // print layer dimensions
101  for (int i = 0; i < this->get_num_parents(); ++i) {
102  const auto& dims = this->get_input_dims(i);
103  err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name()
104  << "\" outputs ";
105  for (size_t j = 0; j < dims.size(); ++j) {
106  err << (j > 0 ? " x " : "") << dims[j];
107  }
108  }
109  err << ")";
110  LBANN_ERROR(err.str());
111  }
112  // Check if all spatial dimensions match for predictions and labels
113  // tensors
114  if (!std::equal(predictions_dims.begin() + 1,
115  predictions_dims.end(),
116  labels_dims.begin() + 1)) {
117  std::stringstream err;
118  err << get_type() << " layer \"" << this->get_name() << "\" "
119  << "expects both input tensors to have the same shape after the 0-th "
120  << "dimesion when use_labels is enabled. Found tensors with shape (";
121 
122  // TODO: Put this loop in util as it's used frequently to
123  // print layer dimensions
124  for (int i = 0; i < this->get_num_parents(); ++i) {
125  const auto& dims = this->get_input_dims(i);
126  err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name()
127  << "\" outputs ";
128  for (size_t j = 0; j < dims.size(); ++j) {
129  err << (j > 0 ? " x " : "") << dims[j];
130  }
131  }
132  err << ")";
133  LBANN_ERROR(err.str());
134  }
135  }
136  else {
137  // Check that input dimensions match
138  if (this->get_input_dims(0) != this->get_input_dims(1)) {
139  const auto& parents = this->get_parent_layers();
140  std::stringstream err;
141  err << get_type() << " layer \"" << this->get_name() << "\" "
142  << "has input tensors with different dimensions (";
143  for (int i = 0; i < this->get_num_parents(); ++i) {
144  const auto& dims = this->get_input_dims(i);
145  err << (i > 0 ? ", " : "") << "layer \"" << parents[i]->get_name()
146  << "\" outputs ";
147  for (size_t j = 0; j < dims.size(); ++j) {
148  err << (j > 0 ? " x " : "") << dims[j];
149  }
150  }
151  err << ")";
152  LBANN_ERROR(err.str());
153  }
154  }
155 }
156 
// Runs the base-class data setup and then allocates m_workspace as a
// distributed matrix on the same grid and root as the first input
// (the prediction tensor).
//
// NOTE(review): this listing skips source lines 158, 166 and 170 (the
// signature head and the two `case` labels of the switch below). Which data
// layout selects StarVCMatDT and which selects StarMRMatDT is therefore not
// visible here -- confirm against the original header.
157 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
159  size_t max_mini_batch_size)
160 {
161  data_type_layer<TensorDataType>::setup_data(max_mini_batch_size);
162 
163  // Initialize workspace
164  const auto& prediction = this->get_prev_activations(0);
165  switch (this->get_data_layout()) {
167  m_workspace.reset(new StarVCMatDT<TensorDataType, Dev>(prediction.Grid(),
168  prediction.Root()));
169  break;
171  m_workspace.reset(new StarMRMatDT<TensorDataType, Dev>(prediction.Grid(),
172  prediction.Root()));
173  break;
174  default:
  // Any layout other than the two handled above is rejected.
175  LBANN_ERROR("invalid data layout");
176  }
177 #ifdef HYDROGEN_HAVE_CUB
  // Only a GPU-resident workspace is switched to memory mode 1.
178  if (m_workspace->GetLocalDevice() == El::Device::GPU) {
179  m_workspace->Matrix().SetMemoryMode(1); // CUB memory pool
180  }
181 #endif // HYDROGEN_HAVE_CUB
182 }
183 
// Forward computation. With distconv enabled the work is delegated to
// fp_compute_distconv(). Otherwise the workspace is aligned with the
// prediction tensor and resized to one row by prediction.Width() columns,
// local_fp_compute() is called, the workspace is all-reduced over its
// redundant communicator, and the result is copied into the activations.
//
// NOTE(review): this listing skips source lines 185 (signature) and 202.
184 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
186 {
187 
188 #ifdef LBANN_HAS_DISTCONV
189  if (this->distconv_enabled()) {
190  fp_compute_distconv();
191  return;
192  }
193 
194 #endif // LBANN_HAS_DISTCONV
195 
196  // Initialize workspace
197  const auto& prediction = this->get_prev_activations(0);
198  m_workspace->AlignWith(prediction.DistData());
199  m_workspace->Resize(1, prediction.Width());
200 
201  // Compute local contributions and accumulate
203  local_fp_compute();
204  this->get_comm()->allreduce(*m_workspace, m_workspace->RedundantComm());
205  El::Copy(*m_workspace, this->get_activations());
206 }
207 
// Backward computation. With distconv enabled the work is delegated to
// bp_compute_distconv(). Otherwise the workspace is aligned with the
// prediction tensor, the incoming error signals are copied into it, and
// local_bp_compute() is called.
//
// NOTE(review): this listing skips source line 209 (the signature).
208 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
210 {
211 
212 #ifdef LBANN_HAS_DISTCONV
213  if (this->distconv_enabled()) {
214  bp_compute_distconv();
215  return;
216  }
217 #endif // LBANN_HAS_DISTCONV
218 
219  // Initialize workspace
220  const auto& prediction = this->get_prev_activations(0);
221  m_workspace->AlignWith(prediction.DistData());
222  El::Copy(this->get_prev_error_signals(), *m_workspace);
223 
224  // Compute local gradients
225  local_bp_compute();
226 }
227 
// Writes this layer's settings into `proto`: the tensor data type and, in
// the cross_entropy sub-message, the use_labels flag.
//
// NOTE(review): this listing skips source line 229 (the signature head); the
// index at the end of this page lists it as
// `void write_specific_proto(lbann_data::Layer &proto) const final`.
228 template <typename T, data_layout L, El::Device D>
230  lbann_data::Layer& proto) const
231 {
232  proto.set_datatype(proto::ProtoDataType<T>);
233  auto* msg = proto.mutable_cross_entropy();
234  msg->set_use_labels(m_use_labels);
235 }
236 
237 #ifdef LBANN_HAS_ONNX
238 template <typename T, data_layout L, El::Device D>
239 void cross_entropy_layer<T, L, D>::fill_onnx_node(onnx::GraphProto& graph) const
240 {
241  auto const parents = this->get_parent_layers();
242  // z = Log(input=x)
243  auto* log = graph.add_node();
244  size_t idx = parents[0]->find_child_layer_index(*this);
245  log->add_input(parents[0]->get_name() + "_" + std::to_string(idx));
246  log->add_output(this->get_name() + "_log");
247  log->set_name(this->get_name() + "_log");
248  log->set_op_type("Log");
249  log->set_domain("");
250  log->set_doc_string("Log node for Cross Entropy Layer");
251 
252  // z = Mul(A=y, B=z)
253  auto* mul = graph.add_node();
254  idx = parents[1]->find_child_layer_index(*this);
255  mul->add_input(parents[1]->get_name() + "_" + std::to_string(idx));
256  mul->add_input(log->output(0));
257  mul->add_output(this->get_name() + "_mul");
258  mul->set_name(this->get_name() + "_mul");
259  mul->set_op_type("Mul");
260  mul->set_domain("");
261  mul->set_doc_string("Multiply node for Cross Entropy Layer");
262 
263  // z = Reshape(data=z, shape=[0,-1])
264  auto* shape = graph.add_initializer();
265  shape->set_name(this->get_name() + "_mul_shape");
266  shape->set_data_type(onnx::TensorProto::INT64);
267  shape->add_dims(2);
268  shape->add_int64_data(0);
269  shape->add_int64_data(-1);
270  shape->set_doc_string(this->get_name() + " shape to reshape multiply");
271 
272  auto* reshape = graph.add_node();
273  reshape->add_input(mul->output(0));
274  reshape->add_input(shape->name());
275  reshape->add_output(this->get_name() + "_mul_reshape");
276  reshape->set_name(this->get_name() + "_mul_reshape");
277  reshape->set_op_type("Reshape");
278  reshape->set_domain("");
279  reshape->set_doc_string("Reshape muultiply result for Cross Entropy Layer");
280 
281  // z = ReduceSum(data=z, axes=-1)
282 
283  auto* axes = graph.add_initializer();
284  axes->set_name(this->get_name() + "_reducesum_axes");
285  axes->set_data_type(onnx::TensorProto::INT64);
286  axes->add_dims(1);
287  axes->add_int64_data(-1);
288  axes->set_doc_string(this->get_name() + "ReduceSum axes");
289 
290  auto* reduce_sum = graph.add_node();
291  reduce_sum->add_input(reshape->output(0));
292  reduce_sum->add_input(axes->name());
293  for (auto const* child : this->get_child_layers()) {
294  idx = this->find_child_layer_index(*child);
295  reduce_sum->add_output(this->get_name() + "_" + std::to_string(idx));
296  }
297  reduce_sum->set_name(this->get_name() + "_reducesum");
298  reduce_sum->set_op_type("ReduceSum");
299  reduce_sum->set_domain("");
300  reduce_sum->set_doc_string("ReduceSum node for Cross Entropy Layer");
301 }
302 #endif // LBANN_HAS_ONNX
303 
304 #ifdef LBANN_HAS_DISTCONV
// Const accessor returning the result of a dynamic_cast to
// `const cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>&`.
//
// NOTE(review): this listing skips source lines 307 and 311 (numbering jumps
// 306 -> 308 and 310 -> 312), so neither the function name nor the operand
// of the cast is visible here. Presumably this is get_distconv_adapter()
// const, matching the call in the non-const overload -- confirm against the
// original header.
305 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
306 const cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>&
308 {
309  return dynamic_cast<
310  const cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>&>(
312 }
313 
// Non-const accessor: calls get_distconv_adapter() on an expression built
// from *this and strips constness from the returned reference with
// const_cast.
//
// NOTE(review): this listing skips source lines 316 and 320 (numbering jumps
// 315 -> 317 and 319 -> 321), so the function name and the inner cast
// applied to *this are not visible here. Presumably a cast to const that
// selects the const overload -- confirm against the original header.
314 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
315 cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>&
317 {
318  return const_cast<
319  cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>&>(
321  *this)
322  .get_distconv_adapter());
323 }
324 
// Returns the distconv shape used for an input tensor of this layer.
//
// NOTE(review): this listing skips source line 330 (numbering jumps
// 329 -> 331), which holds the head of the return statement; only its
// trailing argument `0` is visible. `index` is not referenced in the visible
// lines, which is consistent with the comment below that both inputs are
// assumed to share one shape -- confirm against the original header.
325 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
326 dc::Shape cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>::
327  get_prev_activations_shape(int index) const
328 {
329  // Assumes both of the two input tensors have the equal shape.
331  0);
332 }
333 
// Output shape for distconv: the shape of input 0 with every dimension
// except the last one set to 1. `output_index` is not used.
334 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
335 dc::Shape cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>::
336  get_activations_shape(int output_index) const
337 {
338  // Distconv keeps the original channel and spatial dimensions while an
339  // LBANN matrix is 2-D, hence get_output_tensor_shape() is unusable here.
340  auto out_shape = this->get_prev_activations_shape(0);
341  const auto last_dim = out_shape.num_dims() - 1;
342  for (int d = 0; d < last_dim; ++d) {
343  out_shape[d] = 1;
344  }
345  return out_shape;
346 }
347 
// Local output shape for distconv: the local shape of the input tensor with
// every dimension except the last one set to 1. Only index 0 is accepted.
348 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
349 dc::Shape cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>::
350  get_activations_local_shape(int index) const
351 {
352  assert_eq(index, 0);
353  auto local_shape = this->get_prev_activations().get_local_shape();
354  for (int d = 0; d + 1 < local_shape.length(); ++d) {
355  local_shape[d] = 1;
356  }
357  return local_shape;
358 }
359 
// Configures the tensor distributions of this layer for distconv.
//
// The split shapes of the activations and of the previous error signals are
// set to 1 in every dimension except the last. After that, every
// distribution in the four lists (previous activations, activations,
// previous error signals, error signals) has its overlap cleared and is
// marked both updated and invariant in `constraints`.
//
// NOTE(review): this listing skips source line 364 (numbering jumps
// 363 -> 365), so the first statement of the body is not visible here.
360 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
361 void cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>::
362  setup_distributions(tensor_overlap_constraints& constraints)
363 {
365  // Output tensors share all dimensions except for the sample dimension
366  auto activations_split = this->get_activations_dist().get_split_shape();
367  auto prev_error_signals_split =
368  this->get_prev_error_signals_dist().get_split_shape();
369  for (int i = 0; i < activations_split.length() - 1; ++i) {
370  activations_split[i] = 1;
371  prev_error_signals_split[i] = 1;
372  }
373  this->get_activations_dist().set_split_shape(activations_split);
374  this->get_prev_error_signals_dist().set_split_shape(prev_error_signals_split);
375 
  // No overlap is kept on any of this layer's tensors.
376  for (auto& d : this->m_prev_activations_dists) {
377  d.clear_overlap();
378  constraints.mark_updated(d);
379  constraints.mark_invariant(d);
380  }
381  for (auto& d : this->m_activations_dists) {
382  d.clear_overlap();
383  constraints.mark_updated(d);
384  constraints.mark_invariant(d);
385  }
386  for (auto& d : this->m_prev_error_signals_dists) {
387  d.clear_overlap();
388  constraints.mark_updated(d);
389  constraints.mark_invariant(d);
390  }
391  for (auto& d : this->m_error_signals_dists) {
392  d.clear_overlap();
393  constraints.mark_updated(d);
394  constraints.mark_invariant(d);
395  }
396 }
397 
// Creates the distconv cross entropy operator for the current backend and
// sets it up with input 0, input 1 and output 0 of this layer.
// `workspace_capacity` is not used.
398 template <typename TensorDataType, data_layout T_layout, El::Device Dev>
399 void cross_entropy_distconv_adapter<TensorDataType, T_layout, Dev>::setup_layer(
400  size_t workspace_capacity)
401 {
402  m_cross_entropy =
403  std::make_unique<dc::CrossEntropy>(dc::get_backend(), m_use_labels);
404  auto&& predictions = this->get_prev_activations(0);
405  auto&& ground_truth = this->get_prev_activations(1);
406  m_cross_entropy->setup(predictions, ground_truth, this->get_activations(0));
407 }
408 #endif // LBANN_HAS_DISTCONV
409 
410 } // namespace lbann
411 
412 #endif // LBANN_LAYERS_LOSS_CROSS_ENTROPY_IMPL_HPP_INCLUDED
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
El::DistMatrix< TensorDataType, El::STAR, El::VC, El::ELEMENT, D > StarVCMatDT
Definition: base.hpp:148
#define LBANN_ERROR(...)
Definition: exception.hpp:37
void mark_updated(const dc::Dist &d)
El::DistMatrix< TensorDataType, El::STAR, El::MR, El::ELEMENT, D > StarMRMatDT
ColSumMat.
Definition: base.hpp:163
Cross entropy between probability vectors.
virtual void setup_distributions(tensor_overlap_constraints &constraints)
std::string to_string(El::Device const &d)
void setup_data(size_t max_mini_batch_size) override
Setup layer data. Called by the 'setup' function. Memory is allocated for distributed matrices...
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
::distconv::tensor::Shape Shape
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
void write_specific_proto(lbann_data::Layer &proto) const final
virtual dc::Shape get_prev_activations_shape(int input_index=0) const
void setup_data(size_t max_mini_batch_size) override
void mark_invariant(const dc::Dist &d)