27 #ifndef LBANN_COMM_HPP_INCLUDED 28 #define LBANN_COMM_HPP_INCLUDED 33 #include <cuda_runtime.h> 34 #endif // LBANN_HAS_CUDA 35 #ifdef LBANN_HAS_ALUMINUM 37 #endif // LBANN_HAS_ALUMINUM 49 #ifdef LBANN_HAS_ALUMINUM 51 ::Al::ReductionOperator mpi_op_to_al_op(El::mpi::Op op);
113 El::mpi::Comm world = El::mpi::COMM_WORLD.GetMPIComm());
130 void split_trainers(
int procs_per_trainer = -1,
int trainer_grid_height = -1);
145 void split_trainer_grid(
int num_process_primary_grid = 0,
146 bool create_two_models =
false,
147 bool enable_async_comm =
false,
148 bool enable_topo_aware =
false);
161 return El::mpi::Rank(get_world_comm());
166 if (m_secondary_grid_ranks.size() == 0) {
167 return m_procs_per_trainer *
trainer + rank;
170 return (m_secondary_grid_ranks.size() + m_primary_grid_ranks.size()) *
178 return (world_rank / m_procs_per_trainer);
183 return (world_rank % m_procs_per_trainer);
194 return get_rank_in_trainer() == get_trainer_master();
211 return *m_secondary_grid;
224 return m_procs_per_trainer;
231 return El::mpi::Size(get_world_comm());
238 return std::find(m_world_ranks_on_node.begin(),
239 m_world_ranks_on_node.end(),
240 rank) != m_world_ranks_on_node.end();
250 return m_threads_per_proc;
254 void reset_threads()
const noexcept;
257 void intertrainer_sum_matrix(
AbsMat& mat)
const;
258 void intertrainer_sum_matrix(
AbsDistMat& mat)
const;
260 void intertrainer_broadcast_matrix(
AbsMat& mat,
int root)
const;
261 void intertrainer_broadcast_matrix(
AbsDistMat& mat,
int root)
const;
264 template <typename T, bool S = is_instantiated_El_mpi_type<T>::value>
265 void broadcast(
int root, T& val,
const El::mpi::Comm& c)
const;
267 template <
typename T>
268 void broadcast_custom(
int root, T& val,
const El::mpi::Comm& c)
const;
269 template <
typename T>
270 void broadcast_native(
int root, T& val,
const El::mpi::Comm& c)
const;
273 template <
typename T>
274 void world_broadcast(
int root, T& val)
const;
276 template <
typename T>
277 void intertrainer_broadcast(
int root, T& val)
const;
279 template <
typename T>
280 void trainer_broadcast(
int root, T& val)
const;
288 template <
typename T>
289 void broadcast(
const int root,
292 const El::mpi::Comm& c)
const;
294 template <
typename T,
297 void broadcast(
const int root,
300 const El::mpi::Comm& c,
301 El::SyncInfo<D>
const& syncInfo)
const;
304 template <
typename T>
305 void world_broadcast(
const int root, T* data,
const int count)
const;
307 template <
typename T, El::Device D>
308 void world_broadcast(
const int root,
311 El::SyncInfo<D>
const& syncInfo)
const;
313 template <
typename T>
314 void intertrainer_broadcast(
const int root, T* data,
const int count)
const;
315 template <
typename T, El::Device D>
316 void intertrainer_broadcast(
const int root,
319 El::SyncInfo<D>
const& syncInfo)
const;
321 template <
typename T>
322 void trainer_broadcast(
const int root, T* data,
const int count)
const;
324 template <
typename T, El::Device D>
325 void trainer_broadcast(
const int root,
328 El::SyncInfo<D>
const& syncInfo)
const;
333 template <
typename T>
335 resize(
const int root, std::vector<T>& data,
const El::mpi::Comm& c)
const;
341 template <
typename T>
343 broadcast(
const int root, std::vector<T>& data,
const El::mpi::Comm& c)
const;
345 template <
typename T>
346 void world_broadcast(
int root, std::vector<T>& data)
const;
351 template <
typename T>
353 void intertrainer_broadcast(
int root, std::vector<T>& data)
const;
355 template <
typename T>
356 void trainer_broadcast(
int root, std::vector<T>& data)
const;
359 template <
typename T>
360 void all_gather(
const T* src,
364 const El::mpi::Comm& c)
const;
365 template <
typename T, El::Device D>
366 void all_gather(
const T* src,
370 const El::mpi::Comm& c,
371 El::SyncInfo<D>
const& syncInfo)
const;
377 template <
typename T>
378 void all_gather(std::vector<T>
const& src,
380 std::vector<int>
const& rcv_counts,
381 std::vector<int>
const& rcv_disp,
382 const El::mpi::Comm& c)
const;
387 template <
typename T>
388 void trainer_all_gather(std::vector<T>
const& src,
390 std::vector<int>
const& rcv_counts,
391 std::vector<int>
const& rcv_disp)
const;
396 template <
typename T>
398 all_gather(T
const& src, std::vector<T>& data,
const El::mpi::Comm& c)
const;
403 template <
typename T>
404 void world_all_gather(T
const& src, std::vector<T>& data)
const;
409 template <
typename T>
410 void trainer_all_gather(T
const& src, std::vector<T>& data)
const;
413 template <
typename T>
414 void trainer_gather(T snd,
int root)
const;
416 template <
typename T>
417 void trainer_gather(T snd, T* rcv)
const;
419 template <
typename T>
420 void trainer_gather(T
const* snd,
int count,
int root)
const;
422 template <
typename T>
423 void trainer_gather(T
const* snd,
int count, T* rcv)
const;
425 template <
typename T>
426 void trainer_gatherv(T
const* snd,
int count,
int root)
const;
427 template <
typename T>
428 void trainer_gatherv(T
const* snd,
431 int const* rcv_counts,
432 int const* rcv_displacements)
const;
434 template <
typename T>
435 void intertrainer_gather(T snd,
int root)
const;
437 template <
typename T>
438 void intertrainer_gather(T snd, std::vector<T>& rcv)
const;
440 template <
typename T>
441 void intertrainer_gather(T
const* snd,
int count,
int root)
const;
443 template <
typename T>
444 void intertrainer_gather(T
const* snd,
int count, T* rcv)
const;
446 template <
typename T>
447 void gather(T snd,
int root,
const El::mpi::Comm& c)
const;
449 template <
typename T>
450 void gather(T snd, T* rcv,
const El::mpi::Comm& c)
const;
452 template <
typename T>
453 void gather(T snd, std::vector<T>& rcv,
const El::mpi::Comm& c)
const;
455 template <
typename T>
456 void gather(T
const* snd,
int count,
int root,
const El::mpi::Comm& c)
const;
457 template <
typename T, El::Device D>
458 void gather(T
const* snd,
461 const El::mpi::Comm& c,
462 El::SyncInfo<D>
const& syncInfo)
const;
464 template <
typename T>
465 void gather(T
const* snd,
int count, T* rcv,
const El::mpi::Comm& c)
const;
466 template <
typename T, El::Device D>
467 void gather(T
const* snd,
470 const El::mpi::Comm& c,
471 El::SyncInfo<D>
const& syncInfo)
const;
473 template <
typename T>
474 T scatter(
int root,
const El::mpi::Comm& c)
const;
476 template <
typename T>
477 T scatter(T
const* snd,
const El::mpi::Comm& c)
const;
479 template <
typename T>
481 intertrainer_reduce(T snd,
int root, El::mpi::Op op = El::mpi::SUM)
const;
483 template <
typename T>
484 T intertrainer_reduce(T snd, El::mpi::Op op = El::mpi::SUM)
const;
486 template <
typename T>
487 void trainer_reduce(T snd,
int root, El::mpi::Op op = El::mpi::SUM)
const;
489 template <
typename T>
490 T trainer_reduce(T snd, El::mpi::Op op = El::mpi::SUM)
const;
492 template <
typename T>
493 void trainer_reduce(T
const* snd,
496 El::mpi::Op op = El::mpi::SUM)
const;
498 template <
typename T>
499 void trainer_reduce(T
const* snd,
502 El::mpi::Op op = El::mpi::SUM)
const;
504 template <
typename T>
507 const El::mpi::Comm& c,
508 El::mpi::Op op = El::mpi::SUM)
const;
510 template <
typename T>
511 T reduce(T snd,
const El::mpi::Comm& c, El::mpi::Op op = El::mpi::SUM)
const;
515 template <
typename T>
516 void reduce(T
const* snd,
int count,
int root,
const El::mpi::Comm& c)
const;
517 template <
typename T, El::Device D>
518 void reduce(T
const* snd,
521 const El::mpi::Comm& c,
522 El::SyncInfo<D>
const& syncInfo)
const;
524 template <
typename T>
525 void reduce(T
const* snd,
528 const El::mpi::Comm& c,
529 El::mpi::Op op)
const;
530 template <
typename T, El::Device D>
531 void reduce(T
const* snd,
534 const El::mpi::Comm& c,
536 El::SyncInfo<D>
const& syncInfo)
const;
538 template <
typename T, El::Device D>
539 void reduce(T
const* snd,
542 const El::mpi::Comm& c,
543 El::SyncInfo<D>
const& syncInfo)
const;
544 template <
typename T>
545 void reduce(T
const* snd,
int count, T* rcv,
const El::mpi::Comm& c)
const;
547 template <
typename T>
548 void reduce(T
const* snd,
551 const El::mpi::Comm& c,
552 El::mpi::Op op)
const;
553 template <
typename T, El::Device D>
554 void reduce(T
const* snd,
557 const El::mpi::Comm& c,
559 El::SyncInfo<D>
const& syncInfo)
const;
561 template <
typename T>
562 T intertrainer_allreduce(T snd, El::mpi::Op op = El::mpi::SUM)
const;
564 template <
typename T>
565 T trainer_allreduce(T snd, El::mpi::Op op = El::mpi::SUM)
const;
567 template <
typename T>
568 void trainer_allreduce(T
const* snd,
571 El::mpi::Op op = El::mpi::SUM)
const;
573 template <
typename T>
575 const El::mpi::Comm& c,
576 El::mpi::Op op = El::mpi::SUM)
const;
581 template <
typename T>
582 void allreduce(T
const* snd,
585 const El::mpi::Comm& c,
586 El::mpi::Op op = El::mpi::SUM)
const;
588 template <
typename T>
589 void allreduce(T* data,
591 const El::mpi::Comm& c,
592 El::mpi::Op op = El::mpi::SUM)
const;
594 template <
typename TensorDataType>
595 void allreduce(El::AbstractMatrix<TensorDataType>& m,
596 const El::mpi::Comm& c,
597 El::mpi::Op op = El::mpi::SUM)
const;
599 template <
typename TensorDataType>
600 void allreduce(El::AbstractDistMatrix<TensorDataType>& m,
601 const El::mpi::Comm& c,
602 El::mpi::Op op = El::mpi::SUM)
const;
607 template <
typename TensorDataType>
608 void nb_allreduce(El::AbstractMatrix<TensorDataType>& m,
609 const El::mpi::Comm& c,
611 El::mpi::Op op = El::mpi::SUM)
const;
616 template <
typename TensorDataType>
617 void nb_allreduce(El::AbstractDistMatrix<TensorDataType>& m,
618 const El::mpi::Comm& c,
620 El::mpi::Op op = El::mpi::SUM)
const;
626 template <
typename T>
627 void nb_allreduce(T* data,
629 const El::mpi::Comm& c,
631 El::mpi::Op op = El::mpi::SUM)
const;
634 template <
typename T>
635 void wait_all(std::vector<El::mpi::Request<T>>& req)
const;
638 template <
typename T>
639 void wait(El::mpi::Request<T>& req)
const;
647 void intertrainer_barrier()
const;
649 void trainer_barrier()
const;
651 void global_barrier()
const;
653 void barrier(
const El::mpi::Comm& c)
const;
656 template <
typename T>
657 void send(
const T* data,
int count,
int trainer,
int rank)
const;
658 template <
typename T, El::Device D>
659 void send(
const T* data,
663 El::SyncInfo<D>
const& syncInfo)
const;
664 template <
typename T, El::Device D>
665 void send(
const T* data,
668 El::SyncInfo<D>
const& syncInfo)
const;
669 void send(
const AbsMat& mat,
int trainer,
int rank)
const;
670 void send(
const DistMat& mat,
int trainer,
int rank)
const;
673 send(mat, trainer, m_rank_in_trainer);
677 send(mat, trainer, m_rank_in_trainer);
681 template <
typename T>
682 void nb_send(
const T* data,
686 El::mpi::Request<T>& req)
const;
687 template <
typename T>
688 void nb_tagged_send(
const T* data,
692 El::mpi::Request<T>& req,
693 const El::mpi::Comm& c)
const;
694 template <
typename T>
695 void nb_send(
const T* data,
698 El::mpi::Request<T>& req)
const;
699 void nb_send(
const AbsMat& mat,
702 El::mpi::Request<DataType>& req)
const;
703 void nb_send(
const DistMat& mat,
706 El::mpi::Request<DataType>& req)
const;
708 nb_send(
const AbsMat& mat,
int trainer, El::mpi::Request<DataType>& req)
const 710 nb_send(mat, trainer, m_rank_in_trainer, req);
714 El::mpi::Request<DataType>& req)
const 716 nb_send(mat, trainer, m_rank_in_trainer, req);
720 template <
typename T>
721 void recv(T* data,
int count,
int trainer,
int rank)
const;
722 template <
typename T>
723 void recv(T* data,
int count,
int trainer)
const;
724 template <
typename T>
725 void recv(T* data,
int count)
const;
726 template <
typename T, El::Device D>
731 El::SyncInfo<D>
const& syncInfo)
const;
732 template <
typename T, El::Device D>
734 recv(T* data,
int count,
int trainer, El::SyncInfo<D>
const& syncInfo)
const;
735 void recv(
AbsMat& mat,
int trainer,
int rank)
const;
736 void recv(
DistMat& mat,
int trainer,
int rank)
const;
739 recv(mat, trainer, m_rank_in_trainer);
743 recv(mat, trainer, m_rank_in_trainer);
746 template <
typename T, El::Device D>
747 void recv(T* data,
int count, El::SyncInfo<D>
const& syncInfo)
const;
748 void recv(
AbsMat& mat)
const;
752 template <
typename T>
753 void nb_recv(T* data,
757 El::mpi::Request<T>& req)
const;
758 template <
typename T>
759 void nb_tagged_recv(T* data,
763 El::mpi::Request<T>& req,
764 const El::mpi::Comm& c)
const;
766 template <
typename T>
767 void nb_recv(T* data,
int count,
int trainer, El::mpi::Request<T>& req)
const;
771 El::mpi::Request<DataType>& req)
const;
775 El::mpi::Request<DataType>& req)
const;
776 void nb_recv(
AbsMat& mat,
int trainer, El::mpi::Request<DataType>& req)
const 778 nb_recv(mat, trainer, m_rank_in_trainer, req);
782 nb_recv(mat, trainer, m_rank_in_trainer, req);
784 template <
typename T>
785 void nb_recv(T* data,
int count, El::mpi::Request<T>& req)
const;
786 void nb_recv(
AbsMat& mat, El::mpi::Request<DataType>& req)
const;
787 void nb_recv(
DistMat& mat, El::mpi::Request<DataType>& req)
const;
790 template <
typename T, El::Device D>
791 void sendrecv(
const T* snd,
798 int recv_rank)
const;
799 template <
typename T, El::Device D>
800 void sendrecv(
const T* snd,
805 int recv_trainer)
const;
807 template <
typename T, El::Device D>
808 void sendrecv(
const T* snd,
816 El::SyncInfo<D>
const& syncInfo)
const;
817 template <
typename T, El::Device D>
818 void sendrecv(
const T* snd,
824 El::SyncInfo<D>
const& syncInfo)
const;
827 template <
typename T>
828 int get_count(
int trainer,
int rank)
const;
829 template <
typename T>
830 int get_count(
int trainer)
const;
836 return m_num_trainer_barriers;
841 return m_num_intertrainer_barriers;
846 return m_num_global_barriers;
855 m_num_trainer_barriers = 0;
856 m_num_intertrainer_barriers = 0;
857 m_num_global_barriers = 0;
859 m_bytes_received = 0;
868 size_t count = (size_t)mat.Height() * (size_t)mat.Width();
869 return count <= (size_t)std::numeric_limits<int>::max();
874 return is_sendable(dist_mat.LockedMatrix());
885 return m_intertrainer_comm;
891 return m_trainer_comm;
897 return m_combined_grid_comm;
907 const El::mpi::Comm&
get_KFAC_comm() const noexcept {
return m_trainer_comm; }
928 const El::mpi::Comm& get_packed_group_comm(
int num_per_group)
const;
934 int world_rank = El::mpi::Translate(comm, rank, get_world_comm());
935 return is_world_rank_on_node(world_rank);
939 void lbann_comm_abort(std::string msg)
const;
984 bool m_create_two_models =
false, m_subgrid_async_progress =
false;
1002 void setup_node_comm();
1012 void setup_threads();
1019 const int root)
const noexcept
1022 m_bytes_sent += bytes;
1025 m_bytes_received += bytes;
1038 #endif // LBANN_COMM_HPP_INCLUDED int get_rank_in_trainer() const noexcept
int get_default_threads_per_proc() const noexcept
std::unordered_map< int, El::mpi::Comm > m_group_communicators
void nb_send(const DistMat &mat, int trainer, El::mpi::Request< DataType > &req) const
int get_rank_in_node() const noexcept
void send(const AbsMat &mat, int trainer) const
int get_trainer_rank() const noexcept
size_t m_num_trainer_barriers
int get_world_master() const noexcept
std::vector< int > m_primary_grid_ranks
int get_rank_in_world() const
size_t m_num_intertrainer_barriers
El::AbstractDistMatrix< DataType > AbsDistMat
El::mpi::Comm m_trainer_comm
El::Grid & get_subset_grid()
MCMRMat< El::Device::CPU > DistMat
size_t get_num_intertrainer_barriers() const noexcept
const El::mpi::Comm & get_intertrainer_comm() const noexcept
std::vector< int > m_world_ranks_on_node
void nb_recv(DistMat &mat, int trainer, El::mpi::Request< DataType > &req) const
T & data(const cnpy::NpyArray &na, const std::vector< size_t > indices)
int get_num_trainers() const noexcept
bool enable_subgrid_async_communication()
std::unique_ptr< El::Grid > m_grid
int get_trainer_master() const noexcept
bool get_KFAC_subgrid_create_two_models()
std::vector< int > get_primary_grid_ranks()
const El::mpi::Comm & get_node_comm() const noexcept
int get_intertrainer_master() const noexcept
constexpr El::Device Device
const El::Grid & get_secondary_grid() const
void send(const DistMat &mat, int trainer) const
int map_world_rank_to_trainer_rank(int world_rank) const noexcept
El::mpi::Comm m_primary_grid_comm
void nb_recv(AbsMat &mat, int trainer, El::mpi::Request< DataType > &req) const
size_t get_num_global_barriers() const noexcept
std::unique_ptr< El::Grid > m_subset_grid
void count_bytes_broadcast(const size_t bytes, const int rank, const int root) const noexcept
const El::Grid & get_trainer_grid() const
GridType get_grid_type() const noexcept
El::mpi::Comm m_combined_grid_comm
bool is_rank_node_local(int rank, const El::mpi::Comm &comm) const
By default, assume no instantiation for the type T in El::mpi.
size_t get_bytes_received() const noexcept
static bool is_sendable(const AbsMat &mat) noexcept
El::mpi::Comm m_intertrainer_comm
El::AbstractMatrix< DataType > AbsMat
bool am_trainer_master() const noexcept
const El::mpi::Comm & get_KFAC_comm() const noexcept
const El::mpi::Comm m_world_comm
size_t m_num_global_barriers
void recv(DistMat &mat, int trainer) const
void recv(AbsMat &mat, int trainer) const
User-facing class that represents a set of compute resources.
bool am_world_master() const noexcept
size_t get_num_trainer_barriers() const noexcept
const El::mpi::Comm & get_trainer_comm() const noexcept
const El::mpi::Comm & get_combined_grid_comm() const noexcept
El::mpi::Comm m_node_comm
void reset_stats_counters() noexcept
int map_world_rank_to_rank_in_trainer(int world_rank) const noexcept
int get_world_rank(int trainer, int rank) const noexcept
int get_procs_per_trainer() const noexcept
const El::mpi::Comm & get_world_comm() const noexcept
El::mpi::Comm m_secondary_grid_comm
size_t get_bytes_sent() const noexcept
void nb_send(const AbsMat &mat, int trainer, El::mpi::Request< DataType > &req) const
bool is_world_rank_on_node(int rank) const
std::vector< int > m_secondary_grid_ranks
int get_procs_in_world() const
std::vector< int > get_secondary_grid_ranks()
El::Grid & get_trainer_grid()
static bool is_sendable(const AbsDistMat &dist_mat) noexcept
const El::Grid & get_subset_grid() const
int get_procs_per_node() const noexcept
El::Grid & get_secondary_grid()