LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
dist_embedding.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED
28 #define LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED
29 #include "lbann/base.hpp"
30 #include "lbann/layers/layer.hpp"
31 
32 #if defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM)
34 #include "lbann/models/model.hpp"
35 #include "lbann/optimizers/sgd.hpp"
37 #include "lbann/proto/layers.pb.h"
38 #include "lbann/utils/memory.hpp"
40 
41 namespace lbann {
42 
// Embedding layer of type "distributed embedding" (see get_type()).
//
// Template parameters:
//   TensorDataType - element type of the layer's tensors and embedding weights.
//   Layout         - data layout reported by get_data_layout().
//   Device         - device reported by get_device_allocation().
//
// The class is only declared when LBANN_HAS_SHMEM or LBANN_HAS_NVSHMEM is
// defined (enclosing preprocessor guard). Copy construction and copy
// assignment are declared but their definitions raise LBANN_ERROR.
//
// NOTE(review): this listing omits some source lines (gaps in the listing
// numbers), including the condition operand of the static_assert below and
// the body of get_backprop_requirements(). Consult the original header
// before relying on those parts.
69 template <typename TensorDataType, data_layout Layout, El::Device Device>
70 class dist_embedding_layer : public data_type_layer<TensorDataType>
71 {
// Compile-time restriction on Layout; only the message operand is visible here.
72  static_assert(
74  "distributed embedding layer only supports data parallel layout");
75 
76 public:
// num_embeddings / embedding_dim: stored and later used as the dimensions of
//   the embedding weights (see setup_data()).
// sparse_sgd: when true, update_compute() applies a sparse SGD step itself.
// learning_rate: only kept when sparse_sgd is true; otherwise replaced by -1.
// barrier_in_forward_prop: stored as-is; presumably controls a barrier in
//   fp_compute() - TODO confirm in the implementation file.
77  dist_embedding_layer(size_t num_embeddings,
78  size_t embedding_dim,
79  bool sparse_sgd,
80  DataType learning_rate,
81  bool barrier_in_forward_prop);
82 
// Both copy operations are defined to raise LBANN_ERROR.
83  dist_embedding_layer(const dist_embedding_layer& other);
84  dist_embedding_layer& operator=(const dist_embedding_layer& other);
85  ~dist_embedding_layer();
86 
// Implemented via the copy constructor, so it raises the same error.
87  dist_embedding_layer* copy() const override;
88 
89  std::string get_type() const override;
90  data_layout get_data_layout() const override;
91  El::Device get_device_allocation() const override;
92  bool can_run_inplace() const override { return false; }
// NOTE(review): the return statement of this function is not visible in this
// listing.
93  int get_backprop_requirements() const override
94  {
96  }
97 
98  description get_description() const override;
99 
101 
// Serialization hook; ArchiveT is presumably a cereal archive type (the class
// befriends cereal::access) - TODO confirm.
103  template <typename ArchiveT>
104  void serialize(ArchiveT& ar);
105 
107 
108 protected:
// Writes num_embeddings, embedding_dim, sparse_sgd, learning_rate and
// barrier_in_forward_prop into the layer's protobuf message.
110  void write_specific_proto(lbann_data::Layer& proto) const final;
111 
112  friend class cereal::access;
// Default constructor; delegates to the main constructor with
// (1, 1, false, 1, false).
113  dist_embedding_layer();
114 
115  void setup_dims() override;
116  void setup_data(size_t max_mini_batch_size) override;
117 
118  void fp_compute() override;
119  void bp_compute() override;
120  bool update_compute() override;
121 
122 public:
// Per-vector bookkeeping record. Field meanings are not established by this
// listing; the names suggest a source (rank, index) / target (rank, index)
// pair plus an "active" flag - TODO confirm against the implementation.
129  struct vector_metadata
130  {
131  size_t source_rank{0};
132  size_t source_index{0};
133  size_t target_rank{0};
134  size_t target_index{0};
135  bool is_active{false};
136  };
137 
138 private:
// Local (non-distributed) matrix type on this layer's device.
139  using LocalMat = El::Matrix<TensorDataType, Device>;
140 
// Starts a non-blocking allreduce on a 1x1 buffer over communicator c and
// stores its handle in req; used as a non-blocking barrier.
145  static void
146  nb_barrier(lbann_comm& comm, const El::mpi::Comm& c, Al::request& req);
147 
// Defined outside this header; called at the end of setup_data() after the
// embedding weights have been set up.
148  void attach_embeddings_to_shmem_buffer();
// Defined outside this header; update_compute() passes
// (input size * mini-batch width) as num_gradients.
149  void apply_sparse_sgd_step(size_t num_gradients, LocalMat& local_embeddings);
150 
// Embeddings buffer and its size. Presumably a SHMEM allocation - TODO
// confirm; the units of the size are not shown here.
158  TensorDataType* m_embeddings_buffer{nullptr};
160  size_t m_embeddings_buffer_size{0};
161 
// Workspace buffer and its size (usage not visible in this listing).
163  TensorDataType* m_workspace_buffer{nullptr};
165  size_t m_workspace_buffer_size{0};
166 
// Buffer of vector_metadata records and its size (usage not visible here).
168  vector_metadata* m_metadata_buffer{nullptr};
170  size_t m_metadata_buffer_size{0};
171 
// Handle of the outstanding non-blocking barrier; waited on in setup_data()
// and update_compute() before a new barrier is started.
183  Al::request m_nb_barrier_request;
184 
// Second dimension passed to set_dims() for the embedding weights.
186  size_t m_num_embeddings;
// First dimension passed to set_dims(); also appended to the output dims.
188  size_t m_embedding_dim;
189 
// Whether this layer performs sparse SGD on the embeddings itself.
194  bool m_sparse_sgd;
// Learning rate for sparse SGD; set to -1 when m_sparse_sgd is false.
196  DataType m_learning_rate;
197 
// Stored from the constructor argument and written to the protobuf message.
209  bool m_barrier_in_forward_prop;
210 };
211 
212 // ---------------------------------------------
213 // Implementation
214 // ---------------------------------------------
215 
216 template <typename T, data_layout L, El::Device D>
217 void dist_embedding_layer<T, L, D>::write_specific_proto(
218  lbann_data::Layer& proto) const
219 {
220  proto.set_datatype(proto::ProtoDataType<T>);
221  auto* msg = proto.mutable_dist_embedding();
222  msg->set_num_embeddings(m_num_embeddings);
223  msg->set_embedding_dim(m_embedding_dim);
224  msg->set_sparse_sgd(m_sparse_sgd);
225  msg->set_learning_rate(m_learning_rate);
226  msg->set_barrier_in_forward_prop(m_barrier_in_forward_prop);
227 }
228 
229 template <typename TensorDataType, data_layout Layout, El::Device Device>
230 dist_embedding_layer<TensorDataType, Layout, Device>::dist_embedding_layer(
231  size_t num_embeddings,
232  size_t embedding_dim,
233  bool sparse_sgd,
234  DataType learning_rate,
235  bool barrier_in_forward_prop)
236  : data_type_layer<TensorDataType>(nullptr),
237  m_num_embeddings{num_embeddings},
238  m_embedding_dim{embedding_dim},
239  m_sparse_sgd{sparse_sgd},
240  m_learning_rate{learning_rate},
241  m_barrier_in_forward_prop{barrier_in_forward_prop}
242 {
243 
244  // Learning rate is only used for sparse SGD
245  if (!m_sparse_sgd) {
246  m_learning_rate = -1.0;
247  }
248 }
249 
250 template <typename TensorDataType, data_layout Layout, El::Device Device>
251 dist_embedding_layer<TensorDataType, Layout, Device>::dist_embedding_layer()
252  : dist_embedding_layer(1, 1, false, El::To<DataType>(1), false)
253 {}
254 
255 template <typename TensorDataType, data_layout Layout, El::Device Device>
256 dist_embedding_layer<TensorDataType, Layout, Device>::dist_embedding_layer(
257  const dist_embedding_layer& other)
258  : data_type_layer<TensorDataType>(other)
259 {
260  LBANN_ERROR("copy constructor is invalid for dist_embedding_layer");
261 }
262 
263 template <typename TensorDataType, data_layout Layout, El::Device Device>
264 dist_embedding_layer<TensorDataType, Layout, Device>&
265 dist_embedding_layer<TensorDataType, Layout, Device>::operator=(
266  const dist_embedding_layer& other)
267 {
268  LBANN_ERROR("copy assignment operator is invalid for dist_embedding_layer");
269 }
270 
271 template <typename TensorDataType, data_layout Layout, El::Device Device>
272 dist_embedding_layer<TensorDataType, Layout, Device>*
273 dist_embedding_layer<TensorDataType, Layout, Device>::copy() const
274 {
275  return new dist_embedding_layer(*this);
276 }
277 
278 template <typename TensorDataType, data_layout Layout, El::Device Device>
279 std::string
280 dist_embedding_layer<TensorDataType, Layout, Device>::get_type() const
281 {
282  return "distributed embedding";
283 }
284 
285 template <typename TensorDataType, data_layout Layout, El::Device Device>
287 dist_embedding_layer<TensorDataType, Layout, Device>::get_data_layout() const
288 {
289  return Layout;
290 }
291 
292 template <typename TensorDataType, data_layout Layout, El::Device Device>
294 dist_embedding_layer<TensorDataType, Layout, Device>::get_device_allocation()
295  const
296 {
297  return Device;
298 }
299 
300 template <typename TensorDataType, data_layout Layout, El::Device Device>
301 description
302 dist_embedding_layer<TensorDataType, Layout, Device>::get_description() const
303 {
305  desc.add("Num embeddings", m_num_embeddings);
306  desc.add("Embedding dim", m_embedding_dim);
307  desc.add("Using sparse SGD", m_sparse_sgd);
308  desc.add("SGD learning rate", m_learning_rate);
309  return desc;
310 }
311 
// Sets the output tensor dimensions to the input dimensions followed by one
// extra trailing dimension of size m_embedding_dim.
// NOTE(review): one source line (listing line 315) is missing from this
// listing at the top of the body; check the original header for it.
312 template <typename TensorDataType, data_layout Layout, El::Device Device>
313 void dist_embedding_layer<TensorDataType, Layout, Device>::setup_dims()
314 {
316  auto dims = this->get_input_dims();
// m_embedding_dim is a size_t; it is narrowed to int for the dims container.
317  dims.push_back(static_cast<int>(m_embedding_dim));
318  this->set_output_dims(dims);
319 }
320 
// Prepares the embedding weights for this layer. In order, it:
//   1. runs the base-class setup_data();
//   2. waits on any outstanding non-blocking barrier;
//   3. creates default weights (normal initializer, optimizer from the model)
//      if the layer has none, then requires exactly one weights object;
//   4. sets the embedding dims to m_embedding_dim x m_num_embeddings with a
//      STAR x VC matrix distribution;
//   5. when sparse SGD is enabled, removes the embedding optimizer and
//      registers an extra "dummy" weights object that has an SGD optimizer;
//   6. sets up the embeddings and attaches them to the SHMEM buffer;
//   7. starts a new non-blocking barrier on the trainer communicator.
//
// max_mini_batch_size is forwarded unchanged to the base-class setup_data().
// Raises LBANN_ERROR if the layer does not end up with exactly one weights
// object after step 3.
321 template <typename TensorDataType, data_layout Layout, El::Device Device>
322 void dist_embedding_layer<TensorDataType, Layout, Device>::setup_data(
323  size_t max_mini_batch_size)
324 {
325  data_type_layer<TensorDataType>::setup_data(max_mini_batch_size);
326 
327  // Synchronize non-blocking barrier
328  // Note: Make sure SHMEM buffers are safe to reset.
329  auto& comm = *this->get_comm();
330  comm.wait(m_nb_barrier_request);
331 
332  // Construct default weights if needed
333  // Note: Randomly drawn from normal distribution with mean 0 and
334  // standard deviation 1.
335  if (!this->has_weights()) {
336  auto w = std::make_shared<data_type_weights<TensorDataType>>(comm);
337  auto init = std::make_unique<normal_initializer<TensorDataType>>(0, 1);
338  auto opt = this->m_model->template create_optimizer<TensorDataType>();
339  w->set_name(this->get_name() + "_weights");
340  w->set_initializer(std::move(init));
341  w->set_optimizer(std::move(opt));
// Register with the layer first (copies the shared_ptr), then hand the
// remaining reference to the model.
342  this->add_weights(w);
343  this->m_model->add_weights(std::move(w));
344  }
345  if (this->num_weights() != 1) {
346  LBANN_ERROR("attempted to setup ",
347  this->get_type(),
348  " layer \"",
349  this->get_name(),
350  "\" ",
351  "with an invalid number of weights ",
352  "(expected 1, found ",
353  this->num_weights(),
354  ")");
355  }
356 
357  // Configure embedding weights
358  auto& embeddings = this->get_weights(0);
359  {
// Start from the input activations' distribution and override only the
// column/row distributions.
360  auto dist = this->get_prev_activations().DistData();
361  dist.colDist = El::STAR;
362  dist.rowDist = El::VC;
363  embeddings.set_dims({m_embedding_dim}, {m_num_embeddings});
364  embeddings.set_matrix_distribution(dist);
365  }
366 
367  // Destroy embedding optimizer and create dummy weights
368  // Note: This layer manually performs sparse SGD on embedding
369  // weights during backprop, so the embedding optimizer isn't needed.
370  // However, the layer must send gradients to some optimizer to
371  // prevent the model from optimizing the layer out of compute graph
372  // during backprop. We get around this by creating dummy weights
373  // with no entries.
// NOTE(review): the note above says "no entries", but set_dims(1) is called
// below - confirm which is intended.
374  if (m_sparse_sgd) {
375  embeddings.set_optimizer(nullptr);
376  auto w = std::make_shared<data_type_weights<TensorDataType>>(comm);
377  auto opt = std::make_unique<sgd<TensorDataType>>(0.);
378  w->set_name(this->get_name() + "_dummy_weights");
379  w->set_optimizer(std::move(opt));
380  w->set_dims(1);
381  w->set_matrix_distribution(embeddings.get_matrix_distribution());
382  w->setup();
383  this->add_weights(w);
384  this->m_model->add_weights(std::move(w));
385  }
386 
387  // Setup embedding weights
388  embeddings.setup();
389  attach_embeddings_to_shmem_buffer();
390 
391  // Non-blocking barrier
392  // Note: Embeddings have been initialized
393  nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request);
394 }
395 
396 template <typename TensorDataType, data_layout Layout, El::Device Device>
397 bool dist_embedding_layer<TensorDataType, Layout, Device>::update_compute()
398 {
399 
400  // Apply sparse SGD if needed
401  if (m_sparse_sgd) {
402  const size_t input_size = this->get_input_size();
403  const size_t mini_batch_size = this->get_prev_activations().Width();
404  using ValuesGetter = weights_details::SafeWeightsAccessor<TensorDataType>;
405  auto& embeddings = ValuesGetter::mutable_values(this->get_weights(0));
406  auto& local_embeddings = dynamic_cast<LocalMat&>(embeddings.Matrix());
407  apply_sparse_sgd_step(input_size * mini_batch_size, local_embeddings);
408  }
409 
410  // Non-blocking barrier
411  // Note: Embeddings are up-to-date.
412  auto& comm = *this->get_comm();
413  comm.wait(m_nb_barrier_request);
414  nb_barrier(comm, comm.get_trainer_comm(), m_nb_barrier_request);
415 
416  return true;
417 }
418 
419 template <typename TensorDataType, data_layout Layout, El::Device Device>
420 void dist_embedding_layer<TensorDataType, Layout, Device>::nb_barrier(
421  lbann_comm& comm,
422  const El::mpi::Comm& c,
423  Al::request& req)
424 {
425  static El::Matrix<float, Device> buffer;
426  buffer.SetMemoryMode(0); // Don't use memory pool
427  buffer.Resize(1, 1);
428  comm.nb_allreduce(buffer, c, req);
429 }
430 
431 // ---------------------------------------------
432 // Explicit template instantiation
433 // ---------------------------------------------
434 
// Explicit instantiation declarations: the float / DATA_PARALLEL
// instantiations are declared here and not implicitly instantiated by
// including translation units. The CPU variant is declared only when
// LBANN_HAS_SHMEM is defined; the GPU variant only when both LBANN_HAS_GPU and
// LBANN_HAS_NVSHMEM are defined.
435 #ifdef LBANN_HAS_SHMEM
436 extern template class dist_embedding_layer<float,
437  data_layout::DATA_PARALLEL,
438  El::Device::CPU>;
439 #endif // LBANN_HAS_SHMEM
440 #if defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
441 extern template class dist_embedding_layer<float,
442  data_layout::DATA_PARALLEL,
443  El::Device::GPU>;
444 #endif // defined(LBANN_HAS_GPU) && defined(LBANN_HAS_NVSHMEM)
445 
446 } // namespace lbann
447 #endif // defined(LBANN_HAS_SHMEM) || defined(LBANN_HAS_NVSHMEM)
448 
449 // ---------------------------------------------
450 // Builder function
451 // ---------------------------------------------
452 
453 namespace lbann {} // namespace lbann
454 
455 #endif // LBANN_LAYERS_MISC_DIST_EMBEDDING_HPP_INCLUDED
virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
#define LBANN_ERROR(...)
Definition: exception.hpp:37
void serialize(std::ostream &os, google::protobuf::Message const &msg)
Serialize the protobuf message to a stream.
virtual description get_description() const
Human-readable description.
constexpr El::Device Device
data_layout
Data layout that is optimized for different modes of parallelism.
Definition: base.hpp:218
void setup_data(size_t max_mini_batch_size) override