LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
optimizer_impl.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the License.
26 
27 #ifndef LBANN_OPTIMIZERS_OPTIMIZER_IMPL_HPP_INCLUDED
28 #define LBANN_OPTIMIZERS_OPTIMIZER_IMPL_HPP_INCLUDED
29 
32 
33 namespace lbann {
34 
// Accumulate `contrib` into the gradient buffer kept for TensorDataType.
//
// The buffer and both scale factors are obtained from get_gradient_buffer().
// The buffer's current contents are multiplied by buf_scale, and then
// (in_scale * scale) * contrib is added to it.
//
//   contrib           matrix that is added to the buffer
//   scale             caller-supplied multiplier applied to contrib
//   allreduce_needed  forwarded unchanged to get_gradient_buffer()
//
// NOTE(review): listing line 36 (return type and function name) is absent
// from this extract; the symbol index at the end of the page lists a
// matching signature under the name add_to_gradient.
35 template <typename TensorDataType>
37  El::AbstractDistMatrix<TensorDataType> const& contrib,
38  TensorDataType scale,
39  bool allreduce_needed)
40 {
    // Both values are written by get_gradient_buffer() before being read.
41  TensorDataType buf_scale, in_scale;
42  auto& grad = get_gradient_buffer(buf_scale, in_scale, allreduce_needed);
    // Rescale what is already stored, then add the scaled contribution.
43  El::Scale(buf_scale, grad);
44  El::Axpy(in_scale * scale, contrib, grad);
45 }
46 
// Return the gradient buffer for TensorDataType together with the scale
// factors a caller should apply when accumulating into it.
//
//   buf_scale         [out] factor for the buffer's existing contents
//   in_scale          [out] factor for the incoming contribution
//   allreduce_needed  whether the incoming contribution requires an allreduce
//
// Steps visible in this body:
//   1. Look up the gradient manager keyed by typeid(TensorDataType) in
//      gradients_; operator[] inserts an empty slot when none exists, in
//      which case a manager is built from get_matrix_info() and marked
//      `cleared`.
//   2. If that manager's status is `allreduce_started`, complete the
//      allreduce first.
//   3. Choose buf_scale / in_scale from the manager's status and, in some
//      arms, update the status.
//
// Raises LBANN_ERROR in the `default` arm of the switch.
//
// NOTE(review): the `case` labels of the switch (listing lines 82, 90, 97,
// 103) and listing lines 87 and 94-95 are absent from this extract, so the
// status each arm belongs to is not stated below.
// NOTE(review): the scale factors are assigned from DataType(...) although
// the out-parameters are TensorDataType&; DataType is not declared in this
// listing - confirm the implicit conversion is intended.
47 template <typename TensorDataType>
48 El::AbstractDistMatrix<TensorDataType>&
49 optimizer::get_gradient_buffer(TensorDataType& buf_scale,
50  TensorDataType& in_scale,
51  bool allreduce_needed)
52 {
53 
54  // Anon enum to clarify "get<#>" calls below.
55  enum
56  {
57  HEIGHT = 0,
58  WIDTH,
59  DISTDATA
60  };
61  using GradMgrType = GradientHelperImpl<TensorDataType>;
62 
    // operator[] default-constructs the mapped pointer on first access, which
    // is what the null check below detects.
63  auto& grad_mgr_ptr = gradients_[std::type_index(typeid(TensorDataType))];
64  // If the manager hasn't been created, let's make it.
65  if (!grad_mgr_ptr) {
66  auto mat_info = this->get_matrix_info();
67  grad_mgr_ptr = std::make_unique<GradMgrType>(std::get<HEIGHT>(mat_info),
68  std::get<WIDTH>(mat_info),
69  std::get<DISTDATA>(mat_info));
70  grad_mgr_ptr->set_status(optimizer_gradient_status::cleared);
71  }
72  // Downcast the stored manager to the concrete helper for this type.
73  auto& grad_mgr = static_cast<GradMgrType&>(*grad_mgr_ptr);
74  // Complete outstanding allreduce, if needed.
75  if (grad_mgr.get_status() == optimizer_gradient_status::allreduce_started) {
76  grad_mgr.complete_allreduce(*(this->m_comm));
77  }
78  auto& buffer = grad_mgr.gradient();
79 
80  // Determine scaling factor and transition state.
81  switch (grad_mgr.get_status()) {
    // First arm (label missing): keep the buffer and the input as they are;
    // when an allreduce is requested, divide the buffer scale by the
    // buffer's RedundantSize().
83  buf_scale = DataType(1);
84  in_scale = DataType(1);
85  if (allreduce_needed) {
86  buf_scale /= buffer.RedundantSize();
88  }
89  break;
    // Second arm (label missing): a buffer scale of zero discards the
    // existing contents once the caller applies it. The status is updated
    // based on allreduce_needed; the remainder of that expression is on the
    // missing listing lines 94-95.
91  buf_scale = DataType(0);
92  in_scale = DataType(1);
93  grad_mgr.set_status(allreduce_needed
96  break;
    // Third arm (label missing): keep the buffer as is; only the input scale
    // depends on allreduce_needed.
98  buf_scale = DataType(1);
99  // Properly scale data that does not need to be allreduced.
100  in_scale =
101  (allreduce_needed ? DataType(1) : DataType(1) / buffer.RedundantSize());
102  break;
104  default:
105  LBANN_ERROR("unexpected gradient status (" +
106  to_string(grad_mgr.get_status()) + ")");
107  }
108  return buffer;
109 }
110 
// Combine every per-type gradient contribution stored in gradients_ into
// `gradient`.
//
// Behaviour visible in this body:
//   - no managers registered: returns without touching `gradient`;
//   - a manager for TensorDataType exists: `gradient` is made a locked view
//     of that manager's matrix when both have equal DistData(); otherwise
//     LBANN_ERROR is invoked;
//   - no manager for TensorDataType: `gradient` is zeroed;
//   - exactly one manager, of a different type: its matrix is copied into
//     `gradient`;
//   - more than one manager: every matrix of a different type is copied into
//     a temporary of `gradient`'s type and added to `gradient`.
// Each manager that is read must report status `ready`; otherwise
// LBANN_ERROR is invoked.
//
// NOTE(review): listing line 112 (return type and function name) is absent
// from this extract; the symbol index at the end of the page lists a
// matching signature under the name accumulate_all_gradient_contributions.
111 template <typename TensorDataType>
113  El::AbstractDistMatrix<TensorDataType>& gradient)
114 {
115  using AbsDistMatType = El::AbstractDistMatrix<TensorDataType>;
116  static const TensorDataType one = TensorDataType(1.f);
117 
118  // There are a few cases to note here:
119  // 1. One update of the same type.
120  // 2. One update of a different type.
121  // 3. Multiple updates of multiple types. In this case, some work
122  // can be saved if one of the updates has the same type as
123  // "gradient".
124 
125  // Some general information
    // num_updates starts as the number of registered managers and is
    // decremented below once the same-type manager has been consumed.
126  auto num_updates = this->gradients_.size();
127  auto const this_type_idx = std::type_index(typeid(TensorDataType));
128 
129  if (num_updates == 0UL)
130  return;
131 
132  // Handle the case that one of the updates is TensorDataType. In
133  // this case, the input gradients matrix can be made to "view" the
134  // update, rather than requiring a copy.
135  auto this_type_contrib = this->gradients_.find(this_type_idx);
136  if (this_type_contrib != this->gradients_.end()) {
137  // Check for invariant consistency.
138  auto const& grad_mgr = *(this_type_contrib->second);
139  if (grad_mgr.get_status() != optimizer_gradient_status::ready) {
140  LBANN_ERROR("Expected ready status. Got: ",
141  to_string(grad_mgr.get_status()));
142  }
143  // Sync the input gradient with the contribution, one way or another.
144  auto const& contrib =
145  dynamic_cast<AbsDistMatType const&>(grad_mgr.gradient());
146  if (contrib.DistData() == gradient.DistData()) {
147  El::LockedView(gradient, contrib);
148  }
149  else {
    // NOTE(review): presumably LBANN_ERROR does not return, which would make
    // the copy on the next line unreachable - confirm.
150  LBANN_ERROR("Should never need this copy.");
151  El::Copy(contrib, gradient);
152  }
153  --num_updates;
154  }
155  else {
156  // No sync possible; zero out the matrix instead
157  El::Zero(gradient);
158  }
159 
160  // Handle the case that only 1 update of a different type is needed.
    // Both conditions hold only when the single registered manager was not
    // consumed by the same-type branch above.
161  if (num_updates == 1UL && this->gradients_.size() == 1UL) {
162  auto const& grad_mgr = *(this->gradients_.begin()->second);
163  if (grad_mgr.get_status() != optimizer_gradient_status::ready) {
164  LBANN_ERROR("Expected ready status. Got: ",
165  to_string(grad_mgr.get_status()));
166  }
167  El::Copy(grad_mgr.gradient(), gradient);
168  }
169  else if (this->gradients_.size() > 1UL) {
170  // Need a temporary matrix for the type-casted copy.
171  auto tmp = std::unique_ptr<AbsDistMatType>{
172  gradient.Construct(gradient.Grid(), gradient.Root())};
173 
174  for (auto const& grad_mgr_v : this->gradients_) {
    // The same-type entry, if any, was already handled above.
175  if (grad_mgr_v.first == this_type_idx)
176  continue;
177  auto const& grad_mgr = *(grad_mgr_v.second);
178  if (grad_mgr.get_status() != optimizer_gradient_status::ready) {
179  LBANN_ERROR("Expected ready status. Got: ",
180  to_string(grad_mgr.get_status()));
181  }
182  auto const& grad_base = grad_mgr.gradient();
183  El::Copy(grad_base, *tmp);
    // NOTE(review): if `gradient` was turned into a locked view above, this
    // Axpy writes into that view - confirm that is permitted.
184  El::Axpy(one, *tmp, gradient);
185  }
186  }
187 }
188 
// Act on the helper's current gradient status; one arm issues an allreduce
// of *gradient_ over gradient_->RedundantComm(), tracked by allreduce_req_.
//
//   comm  communicator used to issue the allreduce
//
// Raises LBANN_ERROR in the `default` arm of the switch.
//
// NOTE(review): listing line 190 (return type and function name) and the
// `case` labels (listing lines 194, 198-200) are absent from this extract.
// The name start_allreduce is taken from the symbol index at the end of the
// page and from the nb_allreduce call below - confirm.
// NOTE(review): listing line 196 is also absent; presumably it updates the
// status after the allreduce is issued - confirm against the real source.
189 template <typename TensorDataType>
191  lbann_comm& comm)
192 {
193  switch (this->get_status()) {
    // presumably non-blocking (nb_ prefix, request handle) - confirm
195  comm.nb_allreduce(*gradient_, gradient_->RedundantComm(), allreduce_req_);
197  break;
    // Arm(s) whose labels are missing: nothing to do.
201  break;
202  default:
203  LBANN_ERROR("unexpected gradient status "
204  "(" +
205  to_string(this->get_status()) + ")");
206  }
207 }
208 
// Act on the helper's current gradient status; one arm waits on
// allreduce_req_ and then marks the gradient `ready`.
//
//   comm  communicator whose wait() is called on the stored request
//
// Raises LBANN_ERROR in the arm carrying the "attempted to finish gradient
// allreduce before starting it" message and in the `default` arm.
//
// NOTE(review): listing line 210 (return type and function name) and the
// `case` labels (listing lines 214, 218-219, 221) are absent from this
// extract. The name complete_allreduce is taken from the symbol index at the
// end of the page and from the wait() call below - confirm.
209 template <typename TensorDataType>
211  lbann_comm& comm)
212 {
213  switch (this->get_status()) {
    // Wait for the request to finish, then mark the values as ready.
215  comm.wait(allreduce_req_);
216  this->set_status(optimizer_gradient_status::ready);
217  break;
    // Arm(s) whose labels are missing: nothing to do.
220  break;
    // Arm whose label is missing: reported as an error.
222  LBANN_ERROR("attempted to finish gradient allreduce "
223  "before starting it");
224  break;
225  default:
226  LBANN_ERROR("unexpected gradient status "
227  "(" +
228  to_string(this->get_status()) + ")");
229  }
230 }
231 
// Set this helper's gradient status to `cleared`. Nothing else is done here;
// the stored matrix is not touched by this body.
//
// NOTE(review): listing line 233 (return type and function name) is absent
// from this extract and the name does not appear in the symbol index at the
// end of the page; presumably this is the helper's clear() - confirm.
232 template <typename TensorDataType>
234 {
235  this->set_status(optimizer_gradient_status::cleared);
236 }
237 
238 } // namespace lbann
239 
240 #endif // LBANN_OPTIMIZERS_OPTIMIZER_IMPL_HPP_INCLUDED
lbann_comm * m_comm
LBANN communicator.
Definition: optimizer.hpp:304
std::unordered_map< std::type_index, gradient_manager_ptr > gradients_
Definition: optimizer.hpp:330
#define LBANN_ERROR(...)
Definition: exception.hpp:37
void nb_allreduce(El::AbstractMatrix< TensorDataType > &m, const El::mpi::Comm &c, Al::request &req, El::mpi::Op op=El::mpi::SUM) const
std::string to_string(El::Device const &d)
El::AbstractDistMatrix< TensorDataType > & get_gradient_buffer(TensorDataType &buf_scale, TensorDataType &in_scale, bool allreduce_needed=false)
Get the gradient buffer.
Values can be accessed immediately.
void accumulate_all_gradient_contributions(El::AbstractDistMatrix< TensorDataType > &gradient)
void complete_allreduce(lbann_comm &comm) override
Allreduce is needed before accessing values.
void add_to_gradient(El::AbstractDistMatrix< TensorDataType > const &contrib, TensorDataType scale=1.f, bool allreduce_needed=false)
Add to the objective function gradient w.r.t. the weights.
void start_allreduce(lbann_comm &comm) override
Allreduce on values is in progress.
virtual std::tuple< El::Int, El::Int, El::DistData > get_matrix_info() const =0
void wait(El::mpi::Request< T > &req) const
Definition: comm_impl.hpp:754