LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
layer_norm.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the License.
26 
27 #ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED
28 #define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED
29 
31 #include "lbann/layers/layer.hpp"
32 #include "lbann/models/model.hpp"
34 
35 #include "lbann/proto/layers.pb.h"
36 #include <memory>
37 
38 namespace lbann {
39 
// Normalize over data samples.
//
// Layer class templated on the tensor element type (TensorDataType), the data
// layout (Layout) and the device (Device). The latter two are the values
// reported by get_data_layout() and get_device_allocation().
//
// NOTE(review): this is a rendered listing and some listing lines are absent
// from the extract (e.g. 74 and 84); see the notes at the affected spots.
53 template <typename TensorDataType, data_layout Layout, El::Device Device>
54 class layer_norm_layer : public data_type_layer<TensorDataType>
55 {
56 public:
58 
    // The tensor type expected in this object.
61  using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;
62 
64 
65 public:
    // Construct the layer.
    //   epsilon: stored in m_epsilon; defaults to 1e-5.
    //   scale:   enable the learned elementwise scale; defaults to false.
    //   bias:    enable the learned elementwise bias; defaults to false.
69  layer_norm_layer(TensorDataType epsilon = El::To<TensorDataType>(1e-5),
70  bool scale = false,
71  bool bias = false);
72 
    // Copy constructor; the out-of-class definition clones the owned matrices.
73  layer_norm_layer(const layer_norm_layer& other);
    // NOTE(review): listing line 74 is missing here. The symbol index lists
    // `layer_norm_layer& operator=(const layer_norm_layer& other)` (defined at
    // listing line 180), so its declaration presumably belongs here - confirm
    // against the real header.
    // Copy function: returns a newly allocated copy of this layer.
75  layer_norm_layer* copy() const override;
76 
    // Get the layer type's name ("layer norm").
77  std::string get_type() const override;
    // Get data layout of the data tensors (the Layout template argument).
78  data_layout get_data_layout() const override;
    // Get the device allocation for the data tensors (the Device template
    // argument).
79  El::Device get_device_allocation() const override;
    // Human-readable description.
80  description get_description() const override;
    // This layer reports that its computation can run in-place.
81  bool can_run_inplace() const override { return true; }
    // Returns the necessary tensors for computing backpropagation.
    // NOTE(review): the return statement (listing line 84) is missing from this
    // extract; the body below is incomplete as shown.
82  int get_backprop_requirements() const override
83  {
85  }
86 
88 
    // Serialization hook, templated on the archive type; defined elsewhere.
90  template <typename ArchiveT>
91  void serialize(ArchiveT& ar);
92 
94 
95 protected:
    // Write this layer's settings (datatype, epsilon, scale, bias) to `proto`.
97  void write_specific_proto(lbann_data::Layer& proto) const final;
98 
    // Setup tensor dimensions. Called by the 'setup' function.
99  void setup_dims() override;
    // Setup layer data. Called by the 'setup' function.
100  void setup_data(size_t max_mini_batch_size) override;
101 
    // Apply layer operation. Called by the 'forward_prop' function.
102  void fp_compute() override;
    // Compute objective function gradients. Called by the 'back_prop' function.
103  void bp_compute() override;
104 
105 private:
    // Same underlying type as the public AbsDistMatrixType alias.
106  using AbsDistMatType = El::AbstractDistMatrix<TensorDataType>;
107 
    // Epsilon value passed to the constructor (reported as "Epsilon").
109  TensorDataType m_epsilon;
110 
    // Apply elementwise scale after normalization (learned weights).
112  bool m_scale;
113 
    // Apply elementwise bias after normalization (learned weights).
115  bool m_bias;
116 
    // Per-sample statistics.
121  std::unique_ptr<AbsDistMatType> m_statistics;
    // Gradients w.r.t. per-sample statistics.
126  std::unique_ptr<AbsDistMatType> m_statistics_gradient;
127 
    // Gradient w.r.t. scale.
129  std::unique_ptr<AbsDistMatType> m_scale_gradient;
130 
    // Gradient w.r.t. bias.
132  std::unique_ptr<AbsDistMatType> m_bias_gradient;
133 };
134 
135 // =========================================================
136 // Implementation
137 // =========================================================
138 
// write_specific_proto: record this layer's configuration in `proto`.
// NOTE(review): the signature line (listing line 140) is missing from this
// extract; the symbol index gives it as
// `void write_specific_proto(lbann_data::Layer& proto) const final`.
139 template <typename T, data_layout L, El::Device D>
141  lbann_data::Layer& proto) const
142 {
    // Tag the message with the tensor data type of this instantiation.
143  proto.set_datatype(proto::ProtoDataType<T>);
    // Fill the layer_norm sub-message with epsilon and the two affine flags.
144  auto* msg = proto.mutable_layer_norm();
145  msg->mutable_epsilon()->set_value(m_epsilon);
146  msg->set_scale(m_scale);
147  msg->set_bias(m_bias);
148 }
149 
// Constructor: stores epsilon and the scale/bias flags.
// NOTE(review): the signature line (listing line 151) is missing from this
// extract. The matrix members are not initialized here; they are created in
// setup_data().
150 template <typename TensorDataType, data_layout Layout, El::Device Device>
152  TensorDataType epsilon,
153  bool scale,
154  bool bias)
    // The base class is constructed with nullptr (presumably "no communicator
    // yet" - TODO confirm against data_type_layer).
155  : data_type_layer<TensorDataType>(nullptr),
156  m_epsilon(epsilon),
157  m_scale(scale),
158  m_bias(bias)
159 {}
160 
// Copy constructor: copies the scalar settings and clones each owned matrix
// via Copy() when the source pointer is non-null (otherwise stores nullptr).
// NOTE(review): listing lines 162-163 (signature), 169 and 172 (the opening of
// the m_statistics_gradient and m_scale_gradient initializers) are missing
// from this extract, so the initializer list below is incomplete as shown.
161 template <typename TensorDataType, data_layout Layout, El::Device Device>
164  : data_type_layer<TensorDataType>(other),
165  m_epsilon(other.m_epsilon),
166  m_scale(other.m_scale),
167  m_bias(other.m_bias),
168  m_statistics(other.m_statistics ? other.m_statistics->Copy() : nullptr),
170  ? other.m_statistics_gradient->Copy()
171  : nullptr),
173  : nullptr),
174  m_bias_gradient(other.m_bias_gradient ? other.m_bias_gradient->Copy()
175  : nullptr)
176 {}
177 
// Copy assignment: copies the scalar settings and replaces each owned matrix
// with a clone of the source's matrix (or nullptr when the source has none).
// Returns *this.
// NOTE(review): listing lines 179-181 (signature), 183 and 188 are missing
// from this extract; 188 is presumably the opening of the
// m_statistics_gradient.reset(...) call - confirm against the real header.
178 template <typename TensorDataType, data_layout Layout, El::Device Device>
182 {
184  m_epsilon = other.m_epsilon;
185  m_scale = other.m_scale;
186  m_bias = other.m_bias;
187  m_statistics.reset(other.m_statistics ? other.m_statistics->Copy() : nullptr);
189  ? other.m_statistics_gradient->Copy()
190  : nullptr);
191  m_scale_gradient.reset(other.m_scale_gradient ? other.m_scale_gradient->Copy()
192  : nullptr);
193  m_bias_gradient.reset(other.m_bias_gradient ? other.m_bias_gradient->Copy()
194  : nullptr);
195  return *this;
196 }
197 
// copy(): returns a heap-allocated copy of this layer built with the copy
// constructor; the raw pointer is handed to the caller.
// NOTE(review): the signature lines (listing lines 199-200) are missing from
// this extract.
198 template <typename TensorDataType, data_layout Layout, El::Device Device>
201 {
202  return new layer_norm_layer(*this);
203 }
204 
// get_type(): returns the layer type's name, the fixed string "layer norm".
// NOTE(review): the signature line (listing line 206) is missing from this
// extract.
205 template <typename TensorDataType, data_layout Layout, El::Device Device>
207 {
208  return "layer norm";
209 }
210 
// get_data_layout(): returns the Layout template argument of this
// instantiation.
// NOTE(review): the signature lines (listing lines 212-213) are missing from
// this extract.
211 template <typename TensorDataType, data_layout Layout, El::Device Device>
214 {
215  return Layout;
216 }
217 
// get_device_allocation(): returns the Device template argument of this
// instantiation.
// NOTE(review): the signature lines (listing lines 219-220) are missing from
// this extract.
218 template <typename TensorDataType, data_layout Layout, El::Device Device>
221 {
222  return Device;
223 }
224 
// get_description(): human-readable description. Adds the "Epsilon",
// "Affine Scale" and "Affine Bias" entries to `desc` and returns it.
// NOTE(review): listing lines 226-227 (signature) and 229 (where `desc` is
// declared) are missing from this extract.
225 template <typename TensorDataType, data_layout Layout, El::Device Device>
228 {
230  desc.add("Epsilon", m_epsilon);
231  desc.add("Affine Scale", m_scale);
232  desc.add("Affine Bias", m_bias);
233  return desc;
234 }
235 
// setup_dims(): setup tensor dimensions. The output tensor gets the same
// dimensions as the input tensor.
// NOTE(review): listing lines 237 (signature) and 239 are missing from this
// extract; 239 is presumably the call to the base-class setup_dims() - confirm.
236 template <typename TensorDataType, data_layout Layout, El::Device Device>
238 {
240  this->set_output_dims(this->get_input_dims());
241 }
242 
// setup_data(): setup layer data. Creates the statistics matrices, validates
// the number of attached weights, and creates default scale/bias weights when
// they are enabled but not supplied.
//
// Reports LBANN_ERROR when more weights are attached than the enabled
// scale/bias flags allow.
//
// NOTE(review): listing lines 244 (signature), 297, 300-301, 318 and 321-322
// are missing from this extract, so the scale and bias branches are incomplete
// as shown.
243 template <typename TensorDataType, data_layout Layout, El::Device Device>
245  size_t max_mini_batch_size)
246 {
247  data_type_layer<TensorDataType>::setup_data(max_mini_batch_size);
    // Output dims converted to size_t, the element type used for set_dims().
248  const auto& output_dims = this->get_output_dims();
249  std::vector<size_t> out_dims{output_dims.begin(), output_dims.end()};
    // Statistics matrices use the input's distribution with the column
    // distribution switched to STAR.
250  auto dist = this->get_prev_activations().DistData();
251  dist.colDist = El::STAR;
252  m_statistics.reset(AbsDistMatrixType::Instantiate(dist));
253  m_statistics_gradient.reset(AbsDistMatrixType::Instantiate(dist));
254 
255  // Setup weights
256  using WeightsType = data_type_weights<TensorDataType>;
    // Each clause rejects a weight count above (scale ? 1 : 0) + (bias ? 1 : 0).
257  if ((m_scale && m_bias && this->num_weights() > 2) ||
258  (!m_scale && !m_bias && this->num_weights() > 0) ||
259  (m_scale && !m_bias && this->num_weights() > 1) ||
260  (!m_scale && m_bias && this->num_weights() > 1)) {
261  LBANN_ERROR("attempted to setup ",
262  this->get_type(),
263  " layer \"",
264  this->get_name(),
265  "\" ",
266  "with an invalid number of weights ",
267  "(",
268  this->num_weights(),
269  ") and scale = ",
270  m_scale,
271  ", bias = ",
272  m_bias);
273  }
    // Resize the weight list to exactly the number of enabled affine terms.
274  this->set_num_weights((m_scale ? 1 : 0) + (m_bias ? 1 : 0));
275 
276  // Setup default weights if not given
    // Scale (when enabled) takes index 0; bias takes the next free index.
277  int weight_idx = 0;
278 
279  // Replicate weights across minibatch
280  dist = this->get_prev_activations().DistData();
281  dist.rowDist = El::STAR;
282 
283  if (m_scale) {
284  if (!this->has_weights(weight_idx)) {
    // Default scale weights: constant-initialized to one, with an optimizer
    // created by the model, named "<layer name>_scale_weights".
285  auto w = std::make_shared<WeightsType>(*this->get_comm());
286  auto init = std::make_unique<constant_initializer<TensorDataType>>(
287  El::TypeTraits<TensorDataType>::One());
288  auto opt = this->m_model->template create_optimizer<TensorDataType>();
289  w->set_name(this->get_name() + "_scale_weights");
290  w->set_optimizer(std::move(opt));
291  w->set_initializer(std::move(init));
    // Attach to this layer first, then hand ownership to the model.
292  this->set_weights(weight_idx, w);
293  this->m_model->add_weights(std::move(w));
294  }
    // The scale weights have the same dimensions as the layer output.
295  auto& weights = this->get_weights(weight_idx);
296  weights.set_dims(out_dims);
298  m_scale_gradient.reset(AbsDistMatrixType::Instantiate(dist));
299  m_scale_gradient->AlignWith(dist);
302  ++weight_idx;
303  }
304  if (m_bias) {
305  if (!this->has_weights(weight_idx)) {
    // Default bias weights: constant-initialized to zero, with an optimizer
    // created by the model, named "<layer name>_bias_weights".
306  auto w = std::make_shared<WeightsType>(*this->get_comm());
307  auto init = std::make_unique<constant_initializer<TensorDataType>>(
308  El::TypeTraits<TensorDataType>::Zero());
309  auto opt = this->m_model->template create_optimizer<TensorDataType>();
310  w->set_name(this->get_name() + "_bias_weights");
311  w->set_optimizer(std::move(opt));
312  w->set_initializer(std::move(init));
313  this->set_weights(weight_idx, w);
314  this->m_model->add_weights(std::move(w));
315  }
    // The bias weights have the same dimensions as the layer output.
316  auto& weights = this->get_weights(weight_idx);
317  weights.set_dims(out_dims);
319  m_bias_gradient.reset(AbsDistMatrixType::Instantiate(dist));
320  m_bias_gradient->AlignWith(dist);
323  }
324 }
325 
// Declares the builder for this layer via the project macro.
326 LBANN_DEFINE_LAYER_BUILDER(layer_norm);
327 
328 // =========================================================
329 // Explicit template instantiation
330 // =========================================================
331 
// Unless LBANN_LAYER_NORM_LAYER_INSTANTIATE is defined, PROTO_DEVICE declares
// extern template instantiations of layer_norm_layer for both data layouts
// (DATA_PARALLEL and MODEL_PARALLEL) for a given (T, Device) pair.
332 #ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE
333 #define PROTO_DEVICE(T, Device) \
334  extern template class layer_norm_layer<T, \
335  data_layout::DATA_PARALLEL, \
336  Device>; \
337  extern template class layer_norm_layer<T, data_layout::MODEL_PARALLEL, Device>
338 
// NOTE(review): listing line 339 is missing from this extract; it is
// presumably the include that expands PROTO_DEVICE for each type/device
// combination - confirm against the real header.
340 #undef PROTO_DEVICE
341 #endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE
342 
343 } // namespace lbann
344 
345 #endif // LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
lbann_comm * get_comm() const
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
Definition: layer_norm.hpp:81
layer_norm_layer & operator=(const layer_norm_layer &other)
Definition: layer_norm.hpp:180
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
Definition: layer_norm.hpp:213
#define LBANN_ERROR(...)
Definition: exception.hpp:37
std::unique_ptr< AbsDistMatType > m_bias_gradient
Gradient w.r.t. bias.
Definition: layer_norm.hpp:132
El::AbstractDistMatrix< TensorDataType > AbsDistMatrixType
The tensor type expected in this object.
Definition: layer_norm.hpp:61
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
Definition: layer_norm.hpp:82
size_t get_matrix_width() const
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.
Generates nicely formatted description messages.
Definition: description.hpp:49
std::unique_ptr< AbsDistMatType > m_scale_gradient
Gradient w.r.t. scale.
Definition: layer_norm.hpp:129
void add_weights(OwningWeightsPtr &&w)
Add weights to model.
virtual description get_description() const
Human-readable description.
constexpr El::Device Device
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)
layer_norm_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
Definition: layer_norm.hpp:200
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
Definition: layer_norm.hpp:220
void serialize(ArchiveT &ar)
weights const & get_weights(size_t idx) const
void set_name(std::string name)
Metadata Accessors.
void set_output_dims(std::vector< int > dims, size_t output_index=0)
Set output tensor dimensions.
size_t num_weights() const noexcept
Definition: layer.hpp:727
bool m_bias
Apply elementwise bias after normalization (learned weights).
Definition: layer_norm.hpp:115
bool has_weights() const noexcept
Definition: layer.hpp:728
bool m_scale
Apply elementwise scale after normalization (learned weights).
Definition: layer_norm.hpp:112
TensorDataType m_epsilon
Definition: layer_norm.hpp:109
void set_matrix_distribution(El::DistData dist)
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
std::string get_name() const
Get the layer instance's name.
Definition: layer.hpp:332
void set_num_weights(size_t n)
Definition: layer.hpp:733
El::AbstractDistMatrix< TensorDataType > AbsDistMatType
Definition: layer_norm.hpp:106
Normalize over data samples.
Definition: layer_norm.hpp:54
void write_specific_proto(lbann_data::Layer &proto) const final
Definition: layer_norm.hpp:140
std::unique_ptr< AbsDistMatType > m_statistics_gradient
Gradients w.r.t. per-sample statistics.
Definition: layer_norm.hpp:126
description get_description() const override
Human-readable description.
Definition: layer_norm.hpp:227
layer_norm_layer(TensorDataType epsilon=El::To< TensorDataType >(1e-5), bool scale=false, bool bias=false)
Definition: layer_norm.hpp:151
data_layout
Data layout that is optimized for different modes of parallelism.
Definition: base.hpp:218
size_t get_matrix_height() const
void setup_data(size_t max_mini_batch_size) override
Setup layer data. Called by the 'setup' function. Memory is allocated for distributed matrices...
Definition: layer_norm.hpp:244
std::string get_type() const override
Get the layer type's name.
Definition: layer_norm.hpp:206
void set_dims(std::vector< size_t > matrix_height_dims, std::vector< size_t > matrix_width_dims={})
void set_weights(size_t idx, ViewingWeightsPtr w)
Definition: layer.hpp:734
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
void setup_data(size_t max_mini_batch_size) override
LBANN_DEFINE_LAYER_BUILDER(elu)
std::unique_ptr< AbsDistMatType > m_statistics
Per-sample statistics.
Definition: layer_norm.hpp:121
std::vector< int > get_output_dims(size_t output_index=0) const
Get output tensor dimensions.
data_type_layer & operator=(data_type_layer &&other)=default
model * m_model
Reference to model managing this layer.
Definition: layer.hpp:845
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
Definition: layer_norm.hpp:237