LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
comm.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_COMM_HPP_INCLUDED
28 #define LBANN_COMM_HPP_INCLUDED
29 
30 #include "base.hpp"
31 
32 #ifdef LBANN_HAS_CUDA
33 #include <cuda_runtime.h>
34 #endif // LBANN_HAS_CUDA
35 #ifdef LBANN_HAS_ALUMINUM
36 #include <Al.hpp>
37 #endif // LBANN_HAS_ALUMINUM
38 
40 
41 #include "detect_El_mpi.hpp"
42 
43 #include <map>
44 #include <typeindex>
45 #include <vector>
46 
47 namespace lbann {
48 
49 #ifdef LBANN_HAS_ALUMINUM
50 
51 ::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op);
52 #endif
53 
// Tag describing which process grid an lbann_comm is associated with; the
// stored value is exposed through lbann_comm::get_grid_type().
// NOTE(review): presumably set when the trainer grid is split into
// primary/secondary sub-grids -- confirm against split_trainer_grid().
55 enum class GridType
56 {
57  NO_GRID = 0, // NOTE(review): presumably "no sub-grid assigned" -- confirm
58  PRIMARY_GRID = 1,
59  SECONDARY_GRID = 2
60 };
61 
62 /* Notes on Synchronization
63  *
64  * The updated interface exposes a synchronization handle/device
65  * tagging mechanism used by Hydrogen: El::SyncInfo<D>, where D is an
66  * El::Device. When operating on Matrix objects, this should be
67  * handled automagically, assuming the Matrix is setup properly. Users
68  * must be aware of this when making MPI calls through Hydrogen or
69  * through lbann_comm with raw data buffers (T[]).
70  *
71  * When dealing with El::Matrix objects, users should be aware of the
72  * following. There is no synchronization for CPU objects
73  * (El::SyncInfo<El::Device::CPU> is an empty struct), but GPU Matrix
74  * objects now have an associated stream and event. These are
75  * GPUManager::Stream() and GPUManager::Event() by default, resp., but
76  * can be overridden by a user. Note: the Matrix never owns these; it
77  * will not free these resources at destruction. There are many
78  * methods in which multiple El::Matrix objects might interact. This
79  * should work properly; otherwise, report bugs to benson31.
80  *
81  * When dealing with raw data (T[]), users should be aware of the
82  * following. In the near future, all El::mpi functions will have an
83  * El::SyncInfo object as their last parameter, and it will be a
84  * required parameter. In lbann_comm, this means that when the call
85  * trickles down to an El::mpi function, an appropriate El::SyncInfo
86  * must be available. Since many of LBANN's uses of this interface are
87  * for communicating CPU buffers, there is a "shortcut" API that assumes
88  * the data is CPU memory, thus providing the default
89  * El::SyncInfo<El::Device::CPU> object to El::mpi. If a user wishes
90  * to communicate GPU data, they must use the "full" API, which adds a
91  * final El::SyncInfo parameter to the function. This ensures the
92  * appropriate synchronization semantics, especially when working with
93  * Aluminum as the communication frontend.
94  */
95 
106 {
107 public:
112  lbann_comm(int procs_per_trainer = 0,
113  El::mpi::Comm world = El::mpi::COMM_WORLD.GetMPIComm());
115  lbann_comm(const lbann_comm&) = delete;
117  lbann_comm& operator=(const lbann_comm&) = delete;
118  ~lbann_comm();
119 
130  void split_trainers(int procs_per_trainer = -1, int trainer_grid_height = -1);
131 
145  void split_trainer_grid(int num_process_primary_grid = 0,
146  bool create_two_models = false,
147  bool enable_async_comm = false,
148  bool enable_topo_aware = false);
149 
// Returns the GridType stored in m_grid_type.
152  inline GridType get_grid_type() const noexcept { return m_grid_type; }
153 
// Returns the stored m_trainer_rank.
// NOTE(review): presumably the index of the trainer this process belongs
// to -- confirm.
155  inline int get_trainer_rank() const noexcept { return m_trainer_rank; }
// Returns the stored m_rank_in_trainer. This is also the rank that the
// two-argument send/recv/nb_send/nb_recv matrix overloads forward.
157  inline int get_rank_in_trainer() const noexcept { return m_rank_in_trainer; }
// Returns the rank reported by El::mpi::Rank for the world communicator.
// The value is queried on every call rather than cached, and the function
// is not declared noexcept.
159  inline int get_rank_in_world() const
160  {
161  return El::mpi::Rank(get_world_comm());
162  }
// Maps a (trainer, rank) pair to a world rank, treating each trainer as a
// contiguous block of world ranks: block_size * trainer + rank.
// The block size is m_procs_per_trainer when no secondary-grid ranks are
// recorded; otherwise it is the combined number of primary- and
// secondary-grid ranks.
// The container sizes are converted to int explicitly so the arithmetic is
// done in the same signed type as the return value instead of relying on an
// implicit size_t -> int narrowing of the whole expression.
164  inline int get_world_rank(int trainer, int rank) const noexcept
165  {
166  if (m_secondary_grid_ranks.empty()) {
167  return m_procs_per_trainer * trainer + rank;
168  }
169  else {
170  const int procs_in_block = static_cast<int>(
171  m_secondary_grid_ranks.size() + m_primary_grid_ranks.size());
172  return procs_in_block * trainer + rank;
173  }
174  }
// Returns the trainer index for a world rank, computed as
// world_rank / m_procs_per_trainer (integer division).
// NOTE(review): declared noexcept with no guard on the divisor; integer
// division by zero is undefined, so confirm m_procs_per_trainer is always
// positive before this can be called.
// NOTE(review): unlike get_world_rank(), this does not special-case a
// non-empty m_secondary_grid_ranks -- confirm that is intended.
176  inline int map_world_rank_to_trainer_rank(int world_rank) const noexcept
177  {
178  return (world_rank / m_procs_per_trainer);
179  }
// Returns the rank within a trainer for a world rank, computed as
// world_rank % m_procs_per_trainer.
// NOTE(review): declared noexcept with no guard on the divisor; the
// remainder by zero is undefined, so confirm m_procs_per_trainer is always
// positive before this can be called.
// NOTE(review): unlike get_world_rank(), this does not special-case a
// non-empty m_secondary_grid_ranks -- confirm that is intended.
181  inline int map_world_rank_to_rank_in_trainer(int world_rank) const noexcept
182  {
183  return (world_rank % m_procs_per_trainer);
184  }
// Rank that acts as master within a trainer; always 0.
186  inline int get_trainer_master() const noexcept { return 0; }
// Rank that acts as master for inter-trainer communication; always 0.
188  inline int get_intertrainer_master() const noexcept { return 0; }
// Rank that acts as master in the world communicator; always 0.
190  inline int get_world_master() const noexcept { return 0; }
// True when get_rank_in_trainer() equals get_trainer_master().
192  inline bool am_trainer_master() const noexcept
193  {
194  return get_rank_in_trainer() == get_trainer_master();
195  }
// True when get_rank_in_world() equals get_world_master().
// NOTE(review): this function is noexcept but get_rank_in_world() is not;
// if the underlying El::mpi::Rank call ever throws, std::terminate is
// invoked instead of the exception propagating -- confirm that is acceptable.
197  inline bool am_world_master() const noexcept
198  {
199  return get_rank_in_world() == get_world_master();
200  }
// Returns a mutable reference to the grid held by m_grid (a unique_ptr).
// The pointer is dereferenced without a null check.
202  inline El::Grid& get_trainer_grid() { return *m_grid; }
// Const overload: returns a read-only reference to the grid held by m_grid.
// The pointer is dereferenced without a null check.
204  inline const El::Grid& get_trainer_grid() const { return *m_grid; }
// Returns a mutable reference to the grid held by m_secondary_grid.
// The pointer is dereferenced without a null check.
// NOTE(review): presumably only populated after split_trainer_grid() --
// confirm before calling on an unsplit communicator.
207  inline El::Grid& get_secondary_grid() { return *m_secondary_grid; }
// Const overload: returns a read-only reference to the grid held by
// m_secondary_grid. The pointer is dereferenced without a null check.
// NOTE(review): presumably only populated after split_trainer_grid() --
// confirm before calling on an unsplit communicator.
209  inline const El::Grid& get_secondary_grid() const
210  {
211  return *m_secondary_grid;
212  }
// Returns a mutable reference to the grid held by m_subset_grid.
// The pointer is dereferenced without a null check.
// NOTE(review): presumably only populated after split_trainer_grid() --
// confirm before calling on an unsplit communicator.
215  inline El::Grid& get_subset_grid() { return *m_subset_grid; }
// Const overload: returns a read-only reference to the grid held by
// m_subset_grid. The pointer is dereferenced without a null check.
// NOTE(review): presumably only populated after split_trainer_grid() --
// confirm before calling on an unsplit communicator.
218  inline const El::Grid& get_subset_grid() const { return *m_subset_grid; }
// Returns the stored number of trainers (m_num_trainers).
220  inline int get_num_trainers() const noexcept { return m_num_trainers; }
221  /* Return the number of processes in a trainer. */
// The value is the stored m_procs_per_trainer, the same quantity used as
// the divisor by map_world_rank_to_trainer_rank() and
// map_world_rank_to_rank_in_trainer().
222  inline int get_procs_per_trainer() const noexcept
223  {
224  return m_procs_per_trainer;
225  }
// Returns the stored number of processes per node (m_procs_per_node).
227  inline int get_procs_per_node() const noexcept { return m_procs_per_node; }
// Returns the size reported by El::mpi::Size for the world communicator.
// The value is queried on every call rather than cached.
229  inline int get_procs_in_world() const
230  {
231  return El::mpi::Size(get_world_comm());
232  }
// Returns the stored rank of this process within its node (m_rank_in_node).
234  inline int get_rank_in_node() const noexcept { return m_rank_in_node; }
// Reports whether `rank` (a world rank) appears in m_world_ranks_on_node.
// Implemented as a linear search over that vector.
236  inline bool is_world_rank_on_node(int rank) const
237  {
238  const auto last = m_world_ranks_on_node.end();
239  const auto hit = std::find(m_world_ranks_on_node.begin(), last, rank);
240  return hit != last;
241  }
242 
// Returns the stored default thread count per process (m_threads_per_proc).
248  inline int get_default_threads_per_proc() const noexcept
249  {
250  return m_threads_per_proc;
251  }
252 
254  void reset_threads() const noexcept;
255 
257  void intertrainer_sum_matrix(AbsMat& mat) const;
258  void intertrainer_sum_matrix(AbsDistMat& mat) const;
260  void intertrainer_broadcast_matrix(AbsMat& mat, int root) const;
261  void intertrainer_broadcast_matrix(AbsDistMat& mat, int root) const;
262 
264  template <typename T, bool S = is_instantiated_El_mpi_type<T>::value>
265  void broadcast(int root, T& val, const El::mpi::Comm& c) const;
266 
267  template <typename T>
268  void broadcast_custom(int root, T& val, const El::mpi::Comm& c) const;
269  template <typename T>
270  void broadcast_native(int root, T& val, const El::mpi::Comm& c) const;
271 
273  template <typename T>
274  void world_broadcast(int root, T& val) const;
276  template <typename T>
277  void intertrainer_broadcast(int root, T& val) const;
279  template <typename T>
280  void trainer_broadcast(int root, T& val) const;
281 
287  // Default to cpu memory
288  template <typename T>
289  void broadcast(const int root,
290  T* data,
291  const int count,
292  const El::mpi::Comm& c) const;
293 
294  template <typename T,
295  El::Device D,
297  void broadcast(const int root,
298  T* data,
299  const int count,
300  const El::mpi::Comm& c,
301  El::SyncInfo<D> const& syncInfo) const;
302 
304  template <typename T>
305  void world_broadcast(const int root, T* data, const int count) const;
306 
307  template <typename T, El::Device D>
308  void world_broadcast(const int root,
309  T* data,
310  const int count,
311  El::SyncInfo<D> const& syncInfo) const;
313  template <typename T>
314  void intertrainer_broadcast(const int root, T* data, const int count) const;
315  template <typename T, El::Device D>
316  void intertrainer_broadcast(const int root,
317  T* data,
318  const int count,
319  El::SyncInfo<D> const& syncInfo) const;
321  template <typename T>
322  void trainer_broadcast(const int root, T* data, const int count) const;
323 
324  template <typename T, El::Device D>
325  void trainer_broadcast(const int root,
326  T* data,
327  const int count,
328  El::SyncInfo<D> const& syncInfo) const;
329 
333  template <typename T>
334  size_t
335  resize(const int root, std::vector<T>& data, const El::mpi::Comm& c) const;
336 
341  template <typename T>
342  void
343  broadcast(const int root, std::vector<T>& data, const El::mpi::Comm& c) const;
345  template <typename T>
346  void world_broadcast(int root, std::vector<T>& data) const;
351  template <typename T>
353  void intertrainer_broadcast(int root, std::vector<T>& data) const;
355  template <typename T>
356  void trainer_broadcast(int root, std::vector<T>& data) const;
357 
359  template <typename T>
360  void all_gather(const T* src,
361  int src_count,
362  T* rcv,
363  int rcv_count,
364  const El::mpi::Comm& c) const;
365  template <typename T, El::Device D>
366  void all_gather(const T* src,
367  int src_count,
368  T* rcv,
369  int rcv_count,
370  const El::mpi::Comm& c,
371  El::SyncInfo<D> const& syncInfo) const;
372 
377  template <typename T>
378  void all_gather(std::vector<T> const& src,
379  std::vector<T>& rcs,
380  std::vector<int> const& rcv_counts,
381  std::vector<int> const& rcv_disp,
382  const El::mpi::Comm& c) const;
387  template <typename T>
388  void trainer_all_gather(std::vector<T> const& src,
389  std::vector<T>& rcs,
390  std::vector<int> const& rcv_counts,
391  std::vector<int> const& rcv_disp) const;
396  template <typename T>
397  void
398  all_gather(T const& src, std::vector<T>& data, const El::mpi::Comm& c) const;
403  template <typename T>
404  void world_all_gather(T const& src, std::vector<T>& data) const;
409  template <typename T>
410  void trainer_all_gather(T const& src, std::vector<T>& data) const;
411 
413  template <typename T>
414  void trainer_gather(T snd, int root) const;
416  template <typename T>
417  void trainer_gather(T snd, T* rcv) const;
419  template <typename T>
420  void trainer_gather(T const* snd, int count, int root) const;
422  template <typename T>
423  void trainer_gather(T const* snd, int count, T* rcv) const;
425  template <typename T>
426  void trainer_gatherv(T const* snd, int count, int root) const;
427  template <typename T>
428  void trainer_gatherv(T const* snd,
429  int count,
430  T* rcv,
431  int const* rcv_counts,
432  int const* rcv_displacements) const;
434  template <typename T>
435  void intertrainer_gather(T snd, int root) const;
437  template <typename T>
438  void intertrainer_gather(T snd, std::vector<T>& rcv) const;
440  template <typename T>
441  void intertrainer_gather(T const* snd, int count, int root) const;
443  template <typename T>
444  void intertrainer_gather(T const* snd, int count, T* rcv) const;
446  template <typename T>
447  void gather(T snd, int root, const El::mpi::Comm& c) const;
449  template <typename T>
450  void gather(T snd, T* rcv, const El::mpi::Comm& c) const;
452  template <typename T>
453  void gather(T snd, std::vector<T>& rcv, const El::mpi::Comm& c) const;
455  template <typename T>
456  void gather(T const* snd, int count, int root, const El::mpi::Comm& c) const;
457  template <typename T, El::Device D>
458  void gather(T const* snd,
459  int count,
460  int root,
461  const El::mpi::Comm& c,
462  El::SyncInfo<D> const& syncInfo) const;
464  template <typename T>
465  void gather(T const* snd, int count, T* rcv, const El::mpi::Comm& c) const;
466  template <typename T, El::Device D>
467  void gather(T const* snd,
468  int count,
469  T* rcv,
470  const El::mpi::Comm& c,
471  El::SyncInfo<D> const& syncInfo) const;
473  template <typename T>
474  T scatter(int root, const El::mpi::Comm& c) const;
476  template <typename T>
477  T scatter(T const* snd, const El::mpi::Comm& c) const;
479  template <typename T>
480  void
481  intertrainer_reduce(T snd, int root, El::mpi::Op op = El::mpi::SUM) const;
483  template <typename T>
484  T intertrainer_reduce(T snd, El::mpi::Op op = El::mpi::SUM) const;
486  template <typename T>
487  void trainer_reduce(T snd, int root, El::mpi::Op op = El::mpi::SUM) const;
489  template <typename T>
490  T trainer_reduce(T snd, El::mpi::Op op = El::mpi::SUM) const;
492  template <typename T>
493  void trainer_reduce(T const* snd,
494  int count,
495  int root,
496  El::mpi::Op op = El::mpi::SUM) const;
498  template <typename T>
499  void trainer_reduce(T const* snd,
500  int count,
501  T* rcv,
502  El::mpi::Op op = El::mpi::SUM) const;
504  template <typename T>
505  void reduce(T snd,
506  int root,
507  const El::mpi::Comm& c,
508  El::mpi::Op op = El::mpi::SUM) const;
510  template <typename T>
511  T reduce(T snd, const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM) const;
512 
514  // Op is "SUM"
515  template <typename T>
516  void reduce(T const* snd, int count, int root, const El::mpi::Comm& c) const;
517  template <typename T, El::Device D>
518  void reduce(T const* snd,
519  int count,
520  int root,
521  const El::mpi::Comm& c,
522  El::SyncInfo<D> const& syncInfo) const;
523 
524  template <typename T>
525  void reduce(T const* snd,
526  int count,
527  int root,
528  const El::mpi::Comm& c,
529  El::mpi::Op op) const;
530  template <typename T, El::Device D>
531  void reduce(T const* snd,
532  int count,
533  int root,
534  const El::mpi::Comm& c,
535  El::mpi::Op op,
536  El::SyncInfo<D> const& syncInfo) const;
538  template <typename T, El::Device D>
539  void reduce(T const* snd,
540  int count,
541  T* rcv,
542  const El::mpi::Comm& c,
543  El::SyncInfo<D> const& syncInfo) const;
544  template <typename T>
545  void reduce(T const* snd, int count, T* rcv, const El::mpi::Comm& c) const;
546 
547  template <typename T>
548  void reduce(T const* snd,
549  int count,
550  T* rcv,
551  const El::mpi::Comm& c,
552  El::mpi::Op op) const;
553  template <typename T, El::Device D>
554  void reduce(T const* snd,
555  int count,
556  T* rcv,
557  const El::mpi::Comm& c,
558  El::mpi::Op op,
559  El::SyncInfo<D> const& syncInfo) const;
561  template <typename T>
562  T intertrainer_allreduce(T snd, El::mpi::Op op = El::mpi::SUM) const;
564  template <typename T>
565  T trainer_allreduce(T snd, El::mpi::Op op = El::mpi::SUM) const;
567  template <typename T>
568  void trainer_allreduce(T const* snd,
569  int count,
570  T* rcv,
571  El::mpi::Op op = El::mpi::SUM) const;
573  template <typename T>
574  T allreduce(T snd,
575  const El::mpi::Comm& c,
576  El::mpi::Op op = El::mpi::SUM) const;
577 
578  // FIXME (trb): Based on the backend choice of "MPIBackend", I'm
579  // assuming this is intended as a CPU-only call.
581  template <typename T>
582  void allreduce(T const* snd,
583  int count,
584  T* rcv,
585  const El::mpi::Comm& c,
586  El::mpi::Op op = El::mpi::SUM) const;
588  template <typename T>
589  void allreduce(T* data,
590  int count,
591  const El::mpi::Comm& c,
592  El::mpi::Op op = El::mpi::SUM) const;
594  template <typename TensorDataType>
595  void allreduce(El::AbstractMatrix<TensorDataType>& m,
596  const El::mpi::Comm& c,
597  El::mpi::Op op = El::mpi::SUM) const;
599  template <typename TensorDataType>
600  void allreduce(El::AbstractDistMatrix<TensorDataType>& m,
601  const El::mpi::Comm& c,
602  El::mpi::Op op = El::mpi::SUM) const;
607  template <typename TensorDataType>
608  void nb_allreduce(El::AbstractMatrix<TensorDataType>& m,
609  const El::mpi::Comm& c,
610  Al::request& req,
611  El::mpi::Op op = El::mpi::SUM) const;
616  template <typename TensorDataType>
617  void nb_allreduce(El::AbstractDistMatrix<TensorDataType>& m,
618  const El::mpi::Comm& c,
619  Al::request& req,
620  El::mpi::Op op = El::mpi::SUM) const;
626  template <typename T>
627  void nb_allreduce(T* data,
628  int count,
629  const El::mpi::Comm& c,
630  Al::request& req,
631  El::mpi::Op op = El::mpi::SUM) const;
632 
634  template <typename T>
635  void wait_all(std::vector<El::mpi::Request<T>>& req) const;
636 
638  template <typename T>
639  void wait(El::mpi::Request<T>& req) const;
640 
642  void wait(Al::request& req) const;
644  bool test(Al::request& req) const;
645 
647  void intertrainer_barrier() const;
649  void trainer_barrier() const;
651  void global_barrier() const;
653  void barrier(const El::mpi::Comm& c) const;
654 
656  template <typename T>
657  void send(const T* data, int count, int trainer, int rank) const;
658  template <typename T, El::Device D>
659  void send(const T* data,
660  int count,
661  int trainer,
662  int rank,
663  El::SyncInfo<D> const& syncInfo) const;
664  template <typename T, El::Device D>
665  void send(const T* data,
666  int count,
667  int trainer,
668  El::SyncInfo<D> const& syncInfo) const;
669  void send(const AbsMat& mat, int trainer, int rank) const;
670  void send(const DistMat& mat, int trainer, int rank) const;
// Convenience overload: forwards to send(mat, trainer, rank) with
// rank = m_rank_in_trainer.
671  void send(const AbsMat& mat, int trainer) const
672  {
673  send(mat, trainer, m_rank_in_trainer);
674  }
// Convenience overload for distributed matrices: forwards to
// send(mat, trainer, rank) with rank = m_rank_in_trainer.
675  void send(const DistMat& mat, int trainer) const
676  {
677  send(mat, trainer, m_rank_in_trainer);
678  }
679 
681  template <typename T>
682  void nb_send(const T* data,
683  int count,
684  int trainer,
685  int rank,
686  El::mpi::Request<T>& req) const;
687  template <typename T>
688  void nb_tagged_send(const T* data,
689  int count,
690  int rank,
691  int tag,
692  El::mpi::Request<T>& req,
693  const El::mpi::Comm& c) const;
694  template <typename T>
695  void nb_send(const T* data,
696  int count,
697  int trainer,
698  El::mpi::Request<T>& req) const;
699  void nb_send(const AbsMat& mat,
700  int trainer,
701  int rank,
702  El::mpi::Request<DataType>& req) const;
703  void nb_send(const DistMat& mat,
704  int trainer,
705  int rank,
706  El::mpi::Request<DataType>& req) const;
// Convenience overload: forwards to nb_send(mat, trainer, rank, req) with
// rank = m_rank_in_trainer.
707  void
708  nb_send(const AbsMat& mat, int trainer, El::mpi::Request<DataType>& req) const
709  {
710  nb_send(mat, trainer, m_rank_in_trainer, req);
711  }
// Convenience overload for distributed matrices: forwards to
// nb_send(mat, trainer, rank, req) with rank = m_rank_in_trainer.
712  void nb_send(const DistMat& mat,
713  int trainer,
714  El::mpi::Request<DataType>& req) const
715  {
716  nb_send(mat, trainer, m_rank_in_trainer, req);
717  }
718 
720  template <typename T>
721  void recv(T* data, int count, int trainer, int rank) const;
722  template <typename T>
723  void recv(T* data, int count, int trainer) const;
724  template <typename T>
725  void recv(T* data, int count) const;
726  template <typename T, El::Device D>
727  void recv(T* data,
728  int count,
729  int trainer,
730  int rank,
731  El::SyncInfo<D> const& syncInfo) const;
732  template <typename T, El::Device D>
733  void
734  recv(T* data, int count, int trainer, El::SyncInfo<D> const& syncInfo) const;
735  void recv(AbsMat& mat, int trainer, int rank) const;
736  void recv(DistMat& mat, int trainer, int rank) const;
// Convenience overload: forwards to recv(mat, trainer, rank) with
// rank = m_rank_in_trainer.
737  void recv(AbsMat& mat, int trainer) const
738  {
739  recv(mat, trainer, m_rank_in_trainer);
740  }
// Convenience overload for distributed matrices: forwards to
// recv(mat, trainer, rank) with rank = m_rank_in_trainer.
741  void recv(DistMat& mat, int trainer) const
742  {
743  recv(mat, trainer, m_rank_in_trainer);
744  }
746  template <typename T, El::Device D>
747  void recv(T* data, int count, El::SyncInfo<D> const& syncInfo) const;
748  void recv(AbsMat& mat) const;
749  void recv(DistMat& mat) const;
750 
752  template <typename T>
753  void nb_recv(T* data,
754  int count,
755  int trainer,
756  int rank,
757  El::mpi::Request<T>& req) const;
758  template <typename T>
759  void nb_tagged_recv(T* data,
760  int count,
761  int rank,
762  int tag,
763  El::mpi::Request<T>& req,
764  const El::mpi::Comm& c) const;
765 
766  template <typename T>
767  void nb_recv(T* data, int count, int trainer, El::mpi::Request<T>& req) const;
768  void nb_recv(AbsMat& mat,
769  int trainer,
770  int rank,
771  El::mpi::Request<DataType>& req) const;
772  void nb_recv(DistMat& mat,
773  int trainer,
774  int rank,
775  El::mpi::Request<DataType>& req) const;
// Convenience overload: forwards to nb_recv(mat, trainer, rank, req) with
// rank = m_rank_in_trainer.
776  void nb_recv(AbsMat& mat, int trainer, El::mpi::Request<DataType>& req) const
777  {
778  nb_recv(mat, trainer, m_rank_in_trainer, req);
779  }
// Convenience overload for distributed matrices: forwards to
// nb_recv(mat, trainer, rank, req) with rank = m_rank_in_trainer.
780  void nb_recv(DistMat& mat, int trainer, El::mpi::Request<DataType>& req) const
781  {
782  nb_recv(mat, trainer, m_rank_in_trainer, req);
783  }
784  template <typename T>
785  void nb_recv(T* data, int count, El::mpi::Request<T>& req) const;
786  void nb_recv(AbsMat& mat, El::mpi::Request<DataType>& req) const;
787  void nb_recv(DistMat& mat, El::mpi::Request<DataType>& req) const;
788 
790  template <typename T, El::Device D>
791  void sendrecv(const T* snd,
792  int send_count,
793  int send_trainer,
794  int send_rank,
795  T* rcv,
796  int recv_count,
797  int recv_trainer,
798  int recv_rank) const;
799  template <typename T, El::Device D>
800  void sendrecv(const T* snd,
801  int send_count,
802  int send_trainer,
803  T* rcv,
804  int recv_count,
805  int recv_trainer) const;
806 
807  template <typename T, El::Device D>
808  void sendrecv(const T* snd,
809  int send_count,
810  int send_trainer,
811  int send_rank,
812  T* rcv,
813  int recv_count,
814  int recv_trainer,
815  int recv_rank,
816  El::SyncInfo<D> const& syncInfo) const;
817  template <typename T, El::Device D>
818  void sendrecv(const T* snd,
819  int send_count,
820  int send_trainer,
821  T* rcv,
822  int recv_count,
823  int recv_trainer,
824  El::SyncInfo<D> const& syncInfo) const;
825 
827  template <typename T>
828  int get_count(int trainer, int rank) const;
829  template <typename T>
830  int get_count(int trainer) const;
831 
832  // Statistics methods.
// Returns the trainer-barrier counter (m_num_trainer_barriers).
// The counter is zeroed by reset_stats_counters().
834  inline size_t get_num_trainer_barriers() const noexcept
835  {
836  return m_num_trainer_barriers;
837  }
// Returns the inter-trainer-barrier counter (m_num_intertrainer_barriers).
// The counter is zeroed by reset_stats_counters().
839  inline size_t get_num_intertrainer_barriers() const noexcept
840  {
841  return m_num_intertrainer_barriers;
842  }
// Returns the global-barrier counter (m_num_global_barriers).
// The counter is zeroed by reset_stats_counters().
844  inline size_t get_num_global_barriers() const noexcept
845  {
846  return m_num_global_barriers;
847  }
// Returns the running total in m_bytes_sent. The total is increased by
// count_bytes_broadcast() on the root rank and zeroed by
// reset_stats_counters().
849  inline size_t get_bytes_sent() const noexcept { return m_bytes_sent; }
// Returns the running total in m_bytes_received. The total is increased by
// count_bytes_broadcast() on non-root ranks and zeroed by
// reset_stats_counters().
851  inline size_t get_bytes_received() const noexcept { return m_bytes_received; }
852 
// Zeroes every statistics counter: the three barrier counts and the two
// byte totals.
853  inline void reset_stats_counters() noexcept
854  {
855  m_num_trainer_barriers = m_num_intertrainer_barriers =
856  m_num_global_barriers = 0;
857  m_bytes_sent = m_bytes_received = 0;
858  }
861 
// Reports whether the element count of `mat` (Height() * Width()) fits in
// an int, i.e. whether it can be passed as a single MPI count.
// The product is formed in size_t so it cannot overflow the narrower
// dimension type before the comparison; named casts replace the original
// C-style casts.
863  static inline bool is_sendable(const AbsMat& mat) noexcept
864  {
865  // This assumes we do not transmit mat with a datatype smaller than
866  // DataType.
867  // MPI uses "int" as its count type; do calculations with larger ints.
868  const size_t count =
869  static_cast<size_t>(mat.Height()) * static_cast<size_t>(mat.Width());
870  return count <= static_cast<size_t>(std::numeric_limits<int>::max());
871  }
// Distributed-matrix overload: applies the AbsMat check to the matrix
// returned by dist_mat.LockedMatrix().
872  static inline bool is_sendable(const AbsDistMat& dist_mat) noexcept
873  {
874  return is_sendable(dist_mat.LockedMatrix());
875  }
876 
// Returns a read-only reference to m_intertrainer_comm. The reference is
// to a member, so it is valid only while this lbann_comm is alive.
883  const El::mpi::Comm& get_intertrainer_comm() const noexcept
884  {
885  return m_intertrainer_comm;
886  }
887 
// Returns a read-only reference to m_trainer_comm. The reference is to a
// member, so it is valid only while this lbann_comm is alive.
889  const El::mpi::Comm& get_trainer_comm() const noexcept
890  {
891  return m_trainer_comm;
892  }
893 
// Returns a read-only reference to m_combined_grid_comm. The reference is
// to a member, so it is valid only while this lbann_comm is alive.
895  const El::mpi::Comm& get_combined_grid_comm() const noexcept
896  {
897  return m_combined_grid_comm;
898  }
899 
// Returns a read-only reference to m_world_comm, which is itself a const
// member.
901  const El::mpi::Comm& get_world_comm() const noexcept { return m_world_comm; }
902 
// Returns a read-only reference to m_node_comm.
904  const El::mpi::Comm& get_node_comm() const noexcept { return m_node_comm; }
905 
// Returns a read-only reference to m_trainer_comm, i.e. the same
// communicator object that get_trainer_comm() returns.
// NOTE(review): presumably a named alias kept for K-FAC callers -- confirm
// that no dedicated K-FAC communicator is intended here.
907  const El::mpi::Comm& get_KFAC_comm() const noexcept { return m_trainer_comm; }
908 
// Returns a copy of m_primary_grid_ranks.
// Marked const so it can be called through a const lbann_comm, matching
// the other accessors; it only reads the member. Not noexcept because
// copying the vector may allocate.
910  std::vector<int> get_primary_grid_ranks() const { return m_primary_grid_ranks; }
911 
// Returns a copy of m_secondary_grid_ranks.
// Marked const so it can be called through a const lbann_comm, matching
// the other accessors; it only reads the member. Not noexcept because
// copying the vector may allocate.
913  std::vector<int> get_secondary_grid_ranks() const { return m_secondary_grid_ranks; }
914 
// Returns the m_create_two_models flag (false by default).
// Marked const noexcept so it can be called through a const lbann_comm,
// matching the other accessors; it only reads a bool member.
915  bool get_KFAC_subgrid_create_two_models() const noexcept { return m_create_two_models; }
916 
// Returns the m_subgrid_async_progress flag (false by default). Despite
// the verb in its name, this is a pure query and changes no state.
// Marked const noexcept so it can be called through a const lbann_comm,
// matching the other accessors.
918  bool enable_subgrid_async_communication() const noexcept { return m_subgrid_async_progress; }
919 
928  const El::mpi::Comm& get_packed_group_comm(int num_per_group) const;
929 
// Reports whether `rank`, expressed in communicator `comm`, refers to a
// process on this node: the rank is translated into the world communicator
// and then looked up with is_world_rank_on_node().
931  bool is_rank_node_local(int rank, const El::mpi::Comm& comm) const
932  {
933  // Translating to COMM_WORLD is typically constant time.
934  return is_world_rank_on_node(
935  El::mpi::Translate(comm, rank, get_world_comm()));
936  }
937 
939  void lbann_comm_abort(std::string msg) const;
940 
941 private:
943  const El::mpi::Comm m_world_comm;
945  El::mpi::Comm m_trainer_comm;
947  El::mpi::Comm m_intertrainer_comm;
949  El::mpi::Comm m_node_comm;
951  El::mpi::Comm m_primary_grid_comm;
953  El::mpi::Comm m_secondary_grid_comm;
955  El::mpi::Comm m_combined_grid_comm;
957  mutable std::unordered_map<int, El::mpi::Comm> m_group_communicators;
959  std::unique_ptr<El::Grid> m_grid;
973  std::vector<int> m_world_ranks_on_node;
980 
983 
984  bool m_create_two_models = false, m_subgrid_async_progress = false;
985 
986  std::unique_ptr<El::Grid> m_secondary_grid, m_subset_grid;
987 
991  std::vector<int> m_primary_grid_ranks;
992  std::vector<int> m_secondary_grid_ranks;
993 
994  // Various statistics counters.
995  mutable size_t m_num_trainer_barriers;
997  mutable size_t m_num_global_barriers;
998  mutable size_t m_bytes_sent;
999  mutable size_t m_bytes_received;
1000 
1002  void setup_node_comm();
1003 
1012  void setup_threads();
1013 
// Accounts for one broadcast of `bytes` bytes: the root rank adds them to
// m_bytes_sent, every other rank adds them to m_bytes_received.
// Callable from const methods because both counters are mutable members.
1017  void count_bytes_broadcast(const size_t bytes,
1018  const int rank,
1019  const int root) const noexcept
1020  {
1021  size_t& counter = (rank == root) ? m_bytes_sent : m_bytes_received;
1022  counter += bytes;
1023  }
1028 }; // class lbann_comm
1029 
1034 int get_rank_in_world();
1035 
1036 } // namespace lbann
1037 
1038 #endif // LBANN_COMM_HPP_INCLUDED
int get_rank_in_trainer() const noexcept
Definition: comm.hpp:157
int get_default_threads_per_proc() const noexcept
Definition: comm.hpp:248
std::unordered_map< int, El::mpi::Comm > m_group_communicators
Definition: comm.hpp:957
void nb_send(const DistMat &mat, int trainer, El::mpi::Request< DataType > &req) const
Definition: comm.hpp:712
int get_rank_in_node() const noexcept
Definition: comm.hpp:234
void send(const AbsMat &mat, int trainer) const
Definition: comm.hpp:671
int get_trainer_rank() const noexcept
Definition: comm.hpp:155
size_t m_num_trainer_barriers
Definition: comm.hpp:995
int get_world_master() const noexcept
Definition: comm.hpp:190
std::vector< int > m_primary_grid_ranks
Definition: comm.hpp:991
int get_rank_in_world() const
Definition: comm.hpp:159
size_t m_num_intertrainer_barriers
Definition: comm.hpp:996
El::AbstractDistMatrix< DataType > AbsDistMat
Definition: base.hpp:120
El::Grid Grid
Definition: base.hpp:126
El::mpi::Comm m_trainer_comm
Definition: comm.hpp:945
El::Grid & get_subset_grid()
Definition: comm.hpp:215
MCMRMat< El::Device::CPU > DistMat
Definition: base.hpp:183
size_t get_num_intertrainer_barriers() const noexcept
Definition: comm.hpp:839
int m_trainer_rank
Definition: comm.hpp:965
const El::mpi::Comm & get_intertrainer_comm() const noexcept
Definition: comm.hpp:883
std::vector< int > m_world_ranks_on_node
Definition: comm.hpp:973
void nb_recv(DistMat &mat, int trainer, El::mpi::Request< DataType > &req) const
Definition: comm.hpp:780
size_t m_bytes_sent
Definition: comm.hpp:998
T & data(const cnpy::NpyArray &na, const std::vector< size_t > indices)
Definition: cnpy_utils.hpp:75
int get_num_trainers() const noexcept
Definition: comm.hpp:220
bool enable_subgrid_async_communication()
Definition: comm.hpp:918
std::unique_ptr< El::Grid > m_grid
Definition: comm.hpp:959
int get_trainer_master() const noexcept
Definition: comm.hpp:186
int m_procs_per_trainer
Definition: comm.hpp:963
bool get_KFAC_subgrid_create_two_models()
Definition: comm.hpp:915
std::vector< int > get_primary_grid_ranks()
Definition: comm.hpp:910
const El::mpi::Comm & get_node_comm() const noexcept
Definition: comm.hpp:904
int m_rank_in_node
Definition: comm.hpp:971
int get_intertrainer_master() const noexcept
Definition: comm.hpp:188
constexpr El::Device Device
const El::Grid & get_secondary_grid() const
Definition: comm.hpp:209
int get_rank_in_world()
void send(const DistMat &mat, int trainer) const
Definition: comm.hpp:675
int map_world_rank_to_trainer_rank(int world_rank) const noexcept
Definition: comm.hpp:176
El::mpi::Comm m_primary_grid_comm
Definition: comm.hpp:951
void nb_recv(AbsMat &mat, int trainer, El::mpi::Request< DataType > &req) const
Definition: comm.hpp:776
size_t get_num_global_barriers() const noexcept
Definition: comm.hpp:844
std::unique_ptr< El::Grid > m_subset_grid
Definition: comm.hpp:986
void count_bytes_broadcast(const size_t bytes, const int rank, const int root) const noexcept
Definition: comm.hpp:1017
const El::Grid & get_trainer_grid() const
Definition: comm.hpp:204
GridType get_grid_type() const noexcept
Definition: comm.hpp:152
El::mpi::Comm m_combined_grid_comm
Definition: comm.hpp:955
bool is_rank_node_local(int rank, const El::mpi::Comm &comm) const
Definition: comm.hpp:931
By default, assume no instantiation for the type T in El::mpi.
size_t get_bytes_received() const noexcept
Definition: comm.hpp:851
static bool is_sendable(const AbsMat &mat) noexcept
Definition: comm.hpp:863
El::mpi::Comm m_intertrainer_comm
Definition: comm.hpp:947
int m_procs_per_node
Definition: comm.hpp:969
El::AbstractMatrix< DataType > AbsMat
Definition: base.hpp:115
bool am_trainer_master() const noexcept
Definition: comm.hpp:192
int m_num_trainers
Definition: comm.hpp:961
const El::mpi::Comm & get_KFAC_comm() const noexcept
Definition: comm.hpp:907
const El::mpi::Comm m_world_comm
Definition: comm.hpp:943
size_t m_bytes_received
Definition: comm.hpp:999
size_t m_num_global_barriers
Definition: comm.hpp:997
void recv(DistMat &mat, int trainer) const
Definition: comm.hpp:741
void recv(AbsMat &mat, int trainer) const
Definition: comm.hpp:737
User-facing class that represents a set of compute resources.
Definition: trainer.hpp:60
bool am_world_master() const noexcept
Definition: comm.hpp:197
size_t get_num_trainer_barriers() const noexcept
Definition: comm.hpp:834
const El::mpi::Comm & get_trainer_comm() const noexcept
Definition: comm.hpp:889
const El::mpi::Comm & get_combined_grid_comm() const noexcept
Definition: comm.hpp:895
int m_threads_per_proc
Definition: comm.hpp:979
El::mpi::Comm m_node_comm
Definition: comm.hpp:949
void reset_stats_counters() noexcept
Definition: comm.hpp:853
int map_world_rank_to_rank_in_trainer(int world_rank) const noexcept
Definition: comm.hpp:181
int get_world_rank(int trainer, int rank) const noexcept
Definition: comm.hpp:164
int get_procs_per_trainer() const noexcept
Definition: comm.hpp:222
const El::mpi::Comm & get_world_comm() const noexcept
Definition: comm.hpp:901
El::mpi::Comm m_secondary_grid_comm
Definition: comm.hpp:953
int m_rank_in_trainer
Definition: comm.hpp:967
size_t get_bytes_sent() const noexcept
Definition: comm.hpp:849
void nb_send(const AbsMat &mat, int trainer, El::mpi::Request< DataType > &req) const
Definition: comm.hpp:708
bool is_world_rank_on_node(int rank) const
Definition: comm.hpp:236
std::vector< int > m_secondary_grid_ranks
Definition: comm.hpp:992
int get_procs_in_world() const
Definition: comm.hpp:229
std::vector< int > get_secondary_grid_ranks()
Definition: comm.hpp:913
El::Grid & get_trainer_grid()
Definition: comm.hpp:202
static bool is_sendable(const AbsDistMat &dist_mat) noexcept
Definition: comm.hpp:872
const El::Grid & get_subset_grid() const
Definition: comm.hpp:218
int get_procs_per_node() const noexcept
Definition: comm.hpp:227
El::Grid & get_secondary_grid()
Definition: comm.hpp:207
GridType
Definition: comm.hpp:55