d1/d5b/layer__norm_8hpp_source.html

 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
 // Produced at the Lawrence Livermore National Laboratory.
 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
 //
 // LLNL-CODE-697807.
 // All rights reserved.
 //
 // This file is part of LBANN: Livermore Big Artificial Neural Network
 // Toolkit. For details, see http://software.llnl.gov/LBANN or
 // https://github.com/LLNL/LBANN.
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you
 // may not use this file except in compliance with the License.  You may
 // obtain a copy of the License at:
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 // implied. See the License for the specific language governing
 // permissions and limitations under the license.

 #ifndef LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED
 #define LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED

 #include "lbann/layers/data_type_layer.hpp"
 #include "lbann/layers/layer.hpp"
 #include "lbann/models/model.hpp"
 #include "lbann/proto/datatype_helpers.hpp"

 #include "lbann/proto/layers.pb.h"
 #include <memory>

 namespace lbann {

 /** @brief Normalize over data samples.
  *
  *  Stores an epsilon constant, flags selecting optional learned
  *  elementwise scale and bias weights, and workspace matrices for
  *  per-sample statistics and for gradients.
  *
  *  @tparam TensorDataType Element type of the layer's tensors.
  *  @tparam Layout Data layout returned by get_data_layout().
  *  @tparam Device Device returned by get_device_allocation().
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 class layer_norm_layer : public data_type_layer<TensorDataType>
 {
 public:

   /** @brief The tensor type expected in this object. */
   using AbsDistMatrixType = El::AbstractDistMatrix<TensorDataType>;


 public:
   /** @brief Construct a layer normalization layer.
    *  @param epsilon Constant stored in m_epsilon (defaults to 1e-5).
    *                 NOTE(review): presumably a numerical-stability
    *                 term used by the compute kernels -- confirm there.
    *  @param scale Apply elementwise scale after normalization
    *               (learned weights).
    *  @param bias Apply elementwise bias after normalization
    *              (learned weights).
    */
   layer_norm_layer(TensorDataType epsilon = El::To<TensorDataType>(1e-5),
                    bool scale = false,
                    bool bias = false);

   /** @brief Copy constructor; deep-copies any allocated workspace matrices. */
   layer_norm_layer(const layer_norm_layer& other);
   /** @brief Copy assignment; deep-copies any allocated workspace matrices. */
   layer_norm_layer& operator=(const layer_norm_layer& other);
   /** @brief Dynamically allocate a copy of this layer instance. */
   layer_norm_layer* copy() const override;

   /** @brief Get the layer type's name ("layer norm"). */
   std::string get_type() const override;
   /** @brief Get the data layout (the Layout template argument). */
   data_layout get_data_layout() const override;
   /** @brief Get the device allocation (the Device template argument). */
   El::Device get_device_allocation() const override;
   /** @brief Human-readable description, including epsilon and the
    *  scale/bias flags. */
   description get_description() const override;
   /** @brief This layer reports that its computation can run in-place. */
   bool can_run_inplace() const override { return true; }
   /** @brief Returns the necessary tensors for computing backpropagation:
    *  error signals and previous activations. */
   int get_backprop_requirements() const override
   {
     return ERROR_SIGNALS | PREV_ACTIVATIONS;
   }


   /** @brief Serialization hook taking an archive object. The definition
    *  is not part of this section of the file. */
   template <typename ArchiveT>
   void serialize(ArchiveT& ar);


 protected:
   /** @brief Write the data type, epsilon, scale flag and bias flag into
    *  the protobuf layer message. */
   void write_specific_proto(lbann_data::Layer& proto) const final;

   /** @brief Setup tensor dimensions; the output dimensions are set to
    *  the input dimensions. */
   void setup_dims() override;
   /** @brief Setup layer data: allocates the statistics workspaces and
    *  configures the optional scale and bias weights. */
   void setup_data(size_t max_mini_batch_size) override;

   /** @brief Apply layer operation. Called by the 'forward_prop'
    *  function. Declared here; defined elsewhere. */
   void fp_compute() override;
   /** @brief Compute objective function gradients. Called by the
    *  'back_prop' function. Declared here; defined elsewhere. */
   void bp_compute() override;

 private:
   /** @brief Same matrix type as AbsDistMatrixType, under a private alias. */
   using AbsDistMatType = El::AbstractDistMatrix<TensorDataType>;

   /** @brief Epsilon constant supplied at construction. */
   TensorDataType m_epsilon;

   /** @brief Apply elementwise scale after normalization (learned weights). */
   bool m_scale;

   /** @brief Apply elementwise bias after normalization (learned weights). */
   bool m_bias;

   /** @brief Per-sample statistics. Null until setup_data() runs. */
   std::unique_ptr<AbsDistMatType> m_statistics;
   /** @brief Gradients w.r.t. per-sample statistics. Null until
    *  setup_data() runs. */
   std::unique_ptr<AbsDistMatType> m_statistics_gradient;

   /** @brief Gradient w.r.t. scale. Only allocated when m_scale is set. */
   std::unique_ptr<AbsDistMatType> m_scale_gradient;

   /** @brief Gradient w.r.t. bias. Only allocated when m_bias is set. */
   std::unique_ptr<AbsDistMatType> m_bias_gradient;
 };

 // =========================================================
 // Implementation
 // =========================================================

 template <typename T, data_layout L, El::Device D>
 void layer_norm_layer<T, L, D>::write_specific_proto(
   lbann_data::Layer& proto) const
 {
   proto.set_datatype(proto::ProtoDataType<T>);
   auto* msg = proto.mutable_layer_norm();
   msg->mutable_epsilon()->set_value(m_epsilon);
   msg->set_scale(m_scale);
   msg->set_bias(m_bias);
 }

 template <typename TensorDataType, data_layout Layout, El::Device Device>
 layer_norm_layer<TensorDataType, Layout, Device>::layer_norm_layer(
   TensorDataType epsilon,
   bool scale,
   bool bias)
   : data_type_layer<TensorDataType>(nullptr),
     m_epsilon(epsilon),
     m_scale(scale),
     m_bias(bias)
 {}

 /** @brief Copy constructor.
  *
  *  Copies the scalar settings and deep-copies each workspace matrix that
  *  the source layer has allocated; unallocated workspaces stay null.
  *
  *  @param other Layer to copy from.
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 layer_norm_layer<TensorDataType, Layout, Device>::layer_norm_layer(
   const layer_norm_layer& other)
   : data_type_layer<TensorDataType>(other),
     m_epsilon(other.m_epsilon),
     m_scale(other.m_scale),
     m_bias(other.m_bias)
 {
   // Duplicate a workspace matrix if present, otherwise yield null.
   const auto clone = [](const std::unique_ptr<AbsDistMatType>& src) {
     return std::unique_ptr<AbsDistMatType>(src ? src->Copy() : nullptr);
   };

   m_statistics = clone(other.m_statistics);
   m_statistics_gradient = clone(other.m_statistics_gradient);
   m_scale_gradient = clone(other.m_scale_gradient);
   m_bias_gradient = clone(other.m_bias_gradient);
 }

 /** @brief Copy assignment operator.
  *
  *  Assigns the base layer state and scalar settings, then replaces each
  *  workspace matrix with a deep copy of the source's matrix (or null if
  *  the source has none allocated).
  *
  *  Self-assignment is a no-op: without the guard every workspace matrix
  *  would be needlessly deep-copied and the base-class assignment would
  *  be re-run on the object itself.
  *
  *  @param other Layer to copy from.
  *  @return Reference to this layer.
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 layer_norm_layer<TensorDataType, Layout, Device>&
 layer_norm_layer<TensorDataType, Layout, Device>::operator=(
   const layer_norm_layer<TensorDataType, Layout, Device>& other)
 {
   if (this == &other) {
     return *this;
   }

   data_type_layer<TensorDataType>::operator=(other);
   m_epsilon = other.m_epsilon;
   m_scale = other.m_scale;
   m_bias = other.m_bias;
   m_statistics.reset(other.m_statistics ? other.m_statistics->Copy() : nullptr);
   m_statistics_gradient.reset(other.m_statistics_gradient
                                 ? other.m_statistics_gradient->Copy()
                                 : nullptr);
   m_scale_gradient.reset(other.m_scale_gradient ? other.m_scale_gradient->Copy()
                                                 : nullptr);
   m_bias_gradient.reset(other.m_bias_gradient ? other.m_bias_gradient->Copy()
                                               : nullptr);
   return *this;
 }

 template <typename TensorDataType, data_layout Layout, El::Device Device>
 layer_norm_layer<TensorDataType, Layout, Device>*
 layer_norm_layer<TensorDataType, Layout, Device>::copy() const
 {
   return new layer_norm_layer(*this);
 }

 /** @brief Get the layer type's name.
  *  @return The string "layer norm".
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 auto layer_norm_layer<TensorDataType, Layout, Device>::get_type() const
   -> std::string
 {
   return "layer norm";
 }

 template <typename TensorDataType, data_layout Layout, El::Device Device>
 data_layout
 layer_norm_layer<TensorDataType, Layout, Device>::get_data_layout() const
 {
   return Layout;
 }

 template <typename TensorDataType, data_layout Layout, El::Device Device>
 El::Device
 layer_norm_layer<TensorDataType, Layout, Device>::get_device_allocation() const
 {
   return Device;
 }

 template <typename TensorDataType, data_layout Layout, El::Device Device>
 description
 layer_norm_layer<TensorDataType, Layout, Device>::get_description() const
 {
   auto desc = data_type_layer<TensorDataType>::get_description();
   desc.add("Epsilon", m_epsilon);
   desc.add("Affine Scale", m_scale);
   desc.add("Affine Bias", m_bias);
   return desc;
 }

 /** @brief Setup tensor dimensions.
  *
  *  Runs the base-class setup and then sets the output dimensions to the
  *  input dimensions.
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 void layer_norm_layer<TensorDataType, Layout, Device>::setup_dims()
 {
   using BaseLayer = data_type_layer<TensorDataType>;
   BaseLayer::setup_dims();

   const auto input_dims = this->get_input_dims();
   this->set_output_dims(input_dims);
 }

 /** @brief Setup layer data.
  *
  *  After the base-class setup this:
  *   - allocates the statistics and statistics-gradient workspaces with
  *     the previous activations' distribution, column distribution
  *     replaced by El::STAR;
  *   - raises an error if more weights are attached than the scale/bias
  *     flags allow, then fixes the weight count to match the flags;
  *   - for each enabled affine term (scale first, then bias), creates
  *     default weights if none are attached (ones for scale, zeros for
  *     bias), sets their dimensions to the output dimensions and their
  *     distribution to the previous activations' distribution with row
  *     distribution replaced by El::STAR, and allocates a matching
  *     gradient workspace.
  *
  *  @param max_mini_batch_size Forwarded to the base-class setup.
  */
 template <typename TensorDataType, data_layout Layout, El::Device Device>
 void layer_norm_layer<TensorDataType, Layout, Device>::setup_data(
   size_t max_mini_batch_size)
 {
   using WeightsType = data_type_weights<TensorDataType>;
   using InitializerType = constant_initializer<TensorDataType>;

   data_type_layer<TensorDataType>::setup_data(max_mini_batch_size);

   // Weight dimensions mirror the output dimensions (converted to size_t).
   const auto layer_dims = this->get_output_dims();
   const std::vector<size_t> weight_dims(layer_dims.begin(), layer_dims.end());

   // Statistics workspaces: replicated along the column distribution.
   auto stats_dist = this->get_prev_activations().DistData();
   stats_dist.colDist = El::STAR;
   m_statistics.reset(AbsDistMatrixType::Instantiate(stats_dist));
   m_statistics_gradient.reset(AbsDistMatrixType::Instantiate(stats_dist));

   // At most one weights object per enabled affine term is allowed.
   const size_t expected_num_weights =
     (m_scale ? 1u : 0u) + (m_bias ? 1u : 0u);
   if (this->num_weights() > expected_num_weights) {
     LBANN_ERROR("attempted to setup ",
                 this->get_type(),
                 " layer \"",
                 this->get_name(),
                 "\" ",
                 "with an invalid number of weights ",
                 "(",
                 this->num_weights(),
                 ") and scale = ",
                 m_scale,
                 ", bias = ",
                 m_bias);
   }
   this->set_num_weights(expected_num_weights);

   // Replicate weights across minibatch
   auto weights_dist = this->get_prev_activations().DistData();
   weights_dist.rowDist = El::STAR;

   // Shared setup for one affine term: create default weights when none
   // were provided, then configure the weights and the gradient workspace.
   const auto setup_affine_term =
     [&](int idx,
         const char* name_suffix,
         TensorDataType init_value,
         std::unique_ptr<AbsDistMatType>& gradient) {
       if (!this->has_weights(idx)) {
         auto w = std::make_shared<WeightsType>(*this->get_comm());
         auto init = std::make_unique<InitializerType>(init_value);
         auto opt = this->m_model->template create_optimizer<TensorDataType>();
         w->set_name(this->get_name() + name_suffix);
         w->set_optimizer(std::move(opt));
         w->set_initializer(std::move(init));
         this->set_weights(idx, w);
         this->m_model->add_weights(std::move(w));
       }
       auto& weights = this->get_weights(idx);
       weights.set_dims(weight_dims);
       weights.set_matrix_distribution(weights_dist);
       gradient.reset(AbsDistMatrixType::Instantiate(weights_dist));
       gradient->AlignWith(weights_dist);
       gradient->Resize(weights.get_matrix_height(),
                        weights.get_matrix_width());
     };

   // Scale (if enabled) occupies the first weight slot; bias takes the next.
   int weight_idx = 0;
   if (m_scale) {
     setup_affine_term(weight_idx,
                       "_scale_weights",
                       El::TypeTraits<TensorDataType>::One(),
                       m_scale_gradient);
     ++weight_idx;
   }
   if (m_bias) {
     setup_affine_term(weight_idx,
                       "_bias_weights",
                       El::TypeTraits<TensorDataType>::Zero(),
                       m_bias_gradient);
   }
 }

 // Invoke the layer-builder macro for layer_norm.
 // NOTE(review): presumably this declares the builder used to construct
 // the layer from its protobuf description -- confirm against the macro
 // definition, which is not visible in this file.
 LBANN_DEFINE_LAYER_BUILDER(layer_norm);

 // =========================================================
 // Explicit template instantiation
 // =========================================================

 // Unless LBANN_LAYER_NORM_LAYER_INSTANTIATE is defined, declare the
 // data-parallel and model-parallel specializations as extern templates
 // so that files including this header do not implicitly instantiate
 // them.
 // NOTE(review): presumably the translation unit that defines
 // LBANN_LAYER_NORM_LAYER_INSTANTIATE provides the explicit
 // instantiations, and instantiate_device.hpp expands PROTO_DEVICE for
 // each supported type/device pair -- confirm.
 #ifndef LBANN_LAYER_NORM_LAYER_INSTANTIATE
 #define PROTO_DEVICE(T, Device)                                                \
   extern template class layer_norm_layer<T,                                    \
                                          data_layout::DATA_PARALLEL,           \
                                          Device>;                              \
   extern template class layer_norm_layer<T, data_layout::MODEL_PARALLEL, Device>

 #include "lbann/macros/instantiate_device.hpp"
 #undef PROTO_DEVICE
 #endif // LBANN_LAYER_NORM_LAYER_INSTANTIATE

 } // namespace lbann

 #endif // LBANN_LAYERS_REGULARIZERS_LAYER_NORM_HPP_INCLUDED
lbann::Layer::setup_dims
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.

lbann::ERROR_SIGNALS
Definition: base.hpp:207

lbann::data_type_weights
Definition: l2.hpp:41

lbann::Layer::get_comm
lbann_comm * get_comm() const

lbann::layer_norm_layer::can_run_inplace
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
Definition: layer_norm.hpp:81

lbann::PREV_ACTIVATIONS
Definition: base.hpp:208

lbann::layer_norm_layer::operator=
layer_norm_layer & operator=(const layer_norm_layer &other)
Definition: layer_norm.hpp:180

lbann::layer_norm_layer::get_data_layout
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
Definition: layer_norm.hpp:213

LBANN_ERROR
#define LBANN_ERROR(...)
Definition: exception.hpp:37

lbann::layer_norm_layer::m_bias_gradient
std::unique_ptr< AbsDistMatType > m_bias_gradient
Gradient w.r.t. bias.
Definition: layer_norm.hpp:132

lbann::layer_norm_layer::AbsDistMatrixType
El::AbstractDistMatrix< TensorDataType > AbsDistMatrixType
The tensor type expected in this object.
Definition: layer_norm.hpp:61

lbann::layer_norm_layer::get_backprop_requirements
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
Definition: layer_norm.hpp:82

lbann::weights::get_matrix_width
size_t get_matrix_width() const

lbann::Layer::get_input_dims
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.

lbann::description
Generates nicely formatted description messages.
Definition: description.hpp:49

lbann::layer_norm_layer::m_scale_gradient
std::unique_ptr< AbsDistMatType > m_scale_gradient
Gradient w.r.t. scale.
Definition: layer_norm.hpp:129

lbann::model::add_weights
void add_weights(OwningWeightsPtr &&w)
Add weights to model.

lbann::Layer::get_description
virtual description get_description() const
Human-readable description.

lbann::Device
constexpr El::Device Device
Definition: OperatorTraits.hpp:62

lbann::data_type_layer< TensorDataType >::get_prev_activations
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)

lbann::weights
Definition: weights/weights.hpp:100

lbann::layer_norm_layer::copy
layer_norm_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
Definition: layer_norm.hpp:200

lbann::layer_norm_layer::get_device_allocation
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
Definition: layer_norm.hpp:220

lbann::layer_norm_layer::serialize
void serialize(ArchiveT &ar)

lbann::Layer::get_weights
weights const  & get_weights(size_t idx) const

lbann::model::set_name
void set_name(std::string name)
Metadata Accessors.

lbann::Layer::set_output_dims
void set_output_dims(std::vector< int > dims, size_t output_index=0)
Set output tensor dimensions.

lbann::Layer::num_weights
size_t num_weights() const noexcept
Definition: layer.hpp:727

lbann::layer_norm_layer::m_bias
bool m_bias
Apply elementwise bias after normalization (learned weights).
Definition: layer_norm.hpp:115

lbann::Layer::has_weights
bool has_weights() const noexcept
Definition: layer.hpp:728

lbann::layer_norm_layer::m_scale
bool m_scale
Apply elementwise scale after normalization (learned weights).
Definition: layer_norm.hpp:112

lbann::layer_norm_layer::m_epsilon
TensorDataType m_epsilon
Definition: layer_norm.hpp:109

lbann::data_type_layer
Definition: data_type_layer.hpp:69

lbann::weights::set_matrix_distribution
void set_matrix_distribution(El::DistData dist)

lbann::layer_norm_layer::bp_compute
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...

layer.hpp

lbann::Layer::get_name
std::string get_name() const
Get the layer instance's name.
Definition: layer.hpp:332

lbann::Layer::set_num_weights
void set_num_weights(size_t n)
Definition: layer.hpp:733

lbann::layer_norm_layer::AbsDistMatType
El::AbstractDistMatrix< TensorDataType > AbsDistMatType
Definition: layer_norm.hpp:106

lbann::layer_norm_layer
Normalize over data samples.
Definition: layer_norm.hpp:54

lbann::layer_norm_layer::write_specific_proto
void write_specific_proto(lbann_data::Layer &proto) const final
Definition: layer_norm.hpp:140

lbann::layer_norm_layer::m_statistics_gradient
std::unique_ptr< AbsDistMatType > m_statistics_gradient
Gradients w.r.t. per-sample statistics.
Definition: layer_norm.hpp:126

lbann::layer_norm_layer::get_description
description get_description() const override
Human-readable description.
Definition: layer_norm.hpp:227

instantiate_device.hpp

lbann::layer_norm_layer::layer_norm_layer
layer_norm_layer(TensorDataType epsilon=El::To< TensorDataType >(1e-5), bool scale=false, bool bias=false)
Definition: layer_norm.hpp:151

lbann::data_layout
data_layout
Data layout that is optimized for different modes of parallelism.
Definition: base.hpp:218

lbann::weights::get_matrix_height
size_t get_matrix_height() const

lbann::layer_norm_layer::setup_data
void setup_data(size_t max_mini_batch_size) override
Setup layer data. Called by the 'setup' function. Memory is allocated for distributed matrices...
Definition: layer_norm.hpp:244

data_type_layer.hpp

lbann::layer_norm_layer::get_type
std::string get_type() const override
Get the layer type's name.
Definition: layer_norm.hpp:206

lbann::weights::set_dims
void set_dims(std::vector< size_t > matrix_height_dims, std::vector< size_t > matrix_width_dims={})

lbann::Layer::set_weights
void set_weights(size_t idx, ViewingWeightsPtr w)
Definition: layer.hpp:734

lbann::layer_norm_layer::fp_compute
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.

lbann::data_type_layer::setup_data
void setup_data(size_t max_mini_batch_size) override

lbann::LBANN_DEFINE_LAYER_BUILDER
LBANN_DEFINE_LAYER_BUILDER(elu)

lbann::layer_norm_layer::m_statistics
std::unique_ptr< AbsDistMatType > m_statistics
Per-sample statistics.
Definition: layer_norm.hpp:121

lbann::Layer::get_output_dims
std::vector< int > get_output_dims(size_t output_index=0) const
Get output tensor dimensions.

model.hpp

lbann::data_type_layer::operator=
data_type_layer & operator=(data_type_layer &&other)=default

lbann::Layer::m_model
model * m_model
Reference to model managing this layer.
Definition: layer.hpp:845

lbann
Definition: callback_helpers.hpp:32

lbann::layer_norm_layer::setup_dims
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
Definition: layer_norm.hpp:237

datatype_helpers.hpp