LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
distconv.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 
27 #ifndef LBANN_UTILS_DISTCONV_HPP
28 #define LBANN_UTILS_DISTCONV_HPP
29 
30 #include "lbann_config.hpp"
31 
32 #ifdef LBANN_HAS_DISTCONV
33 
34 #include "El.hpp"
36 #include <vector>
37 
38 #ifdef LBANN_DEBUG
39 #define DISTCONV_DEBUG
40 #endif
41 
42 #include "distconv/distconv.hpp"
43 #include "distconv/dnn_backend/backend.hpp"
44 #include "distconv/tensor/algorithms.hpp"
45 #include "distconv/tensor/shuffle_mpi.hpp"
46 #include "distconv/tensor/shuffle_mpi_cuda.hpp"
47 #include "distconv/tensor/shuffle_mpi_cuda_al.hpp"
48 #include "distconv/tensor/tensor_mpi_cuda.hpp"
49 #include "distconv/util/util.hpp"
50 #ifdef DISTCONV_HAS_P2P
51 #include "distconv/tensor/shuffle_mpi_cuda_hybrid.hpp"
52 #include "distconv/tensor/shuffle_mpi_cuda_p2p.hpp"
53 #include "p2p/p2p.hpp"
54 #endif // DISTCONV_HAS_P2P
55 
56 namespace lbann {
57 
/// @brief Return Hydrogen's default GPU stream for the enabled backend.
///
/// Forwards to `hydrogen::cuda::GetDefaultStream()` when `H2_HAS_CUDA` is
/// nonzero; otherwise forwards to `hydrogen::rocm::GetDefaultStream()` when
/// `H2_HAS_ROCM` is nonzero. When neither macro is nonzero the body contains
/// no return statement, so the deduced return type is `void`.
inline auto default_hydrogen_stream()
{
#if H2_HAS_CUDA
  using hydrogen::cuda::GetDefaultStream;
  return GetDefaultStream();
#elif H2_HAS_ROCM
  using hydrogen::rocm::GetDefaultStream;
  return GetDefaultStream();
#endif
}
66 
67 // Forward Declarations
68 class lbann_comm;
69 class Layer;
70 
71 namespace dc {
72 
73 namespace tensor = ::distconv::tensor;
74 namespace util = ::distconv::util;
75 
77 // Helper type aliases
79 using IntVector = ::distconv::IntVector;
80 using IndexVector = ::distconv::IndexVector;
82 
83 using Dist = ::distconv::tensor::Distribution;
84 
86 
87 using AbsTensor = ::distconv::tensor::AbstractTensor;
88 
89 template <typename TensorDataType>
90 using TensorHost = ::distconv::tensor::
91  Tensor<TensorDataType, LocaleMPI, ::distconv::tensor::BaseAllocator>;
92 
93 template <typename TensorDataType>
94 using TensorDev = ::distconv::tensor::
95  Tensor<TensorDataType, LocaleMPI, ::distconv::tensor::CUDAAllocator>;
96 
97 template <typename TensorDataType>
98 using TensorHostShuffler =
99  ::distconv::tensor::TensorMPIShuffler<TensorDataType,
100  ::distconv::tensor::BaseAllocator>;
101 
102 template <typename TensorDataType>
103 using TensorShuffler =
104  ::distconv::tensor::TensorMPICUDAShuffler<TensorDataType>;
105 template <typename TensorDataType>
106 using TensorShufflerAL =
107  ::distconv::tensor::TensorMPICUDAShufflerAL<TensorDataType>;
108 #ifdef DISTCONV_HAS_P2P
109 template <typename TensorDataType>
110 using TensorShufflerP2P =
111  ::distconv::tensor::TensorMPICUDAShufflerP2P<TensorDataType>;
112 template <typename TensorDataType>
113 using TensorShufflerHybrid =
114  ::distconv::tensor::TensorMPICUDAShufflerHybrid<TensorDataType>;
115 #endif // DISTCONV_HAS_P2P
116 
117 // Debug printing functions
118 using MPIPrintStreamDebug = ::distconv::util::MPIPrintStreamDebug;
119 using MPIPrintStreamError = ::distconv::util::MPIPrintStreamError;
120 using MPIPrintStreamInfo = ::distconv::util::MPIPrintStreamInfo;
121 using MPIPrintStreamWarning = ::distconv::util::MPIPrintStreamWarning;
122 using MPIRootPrintStreamDebug = ::distconv::util::MPIRootPrintStreamDebug;
123 using MPIRootPrintStreamError = ::distconv::util::MPIRootPrintStreamError;
124 using MPIRootPrintStreamInfo = ::distconv::util::MPIRootPrintStreamInfo;
125 using MPIRootPrintStreamWaning = ::distconv::util::MPIRootPrintStreamWarning;
126 
127 // Distconv layer classes
128 using Backend = ::distconv::BackendDNNLib;
129 using AlCommType = typename decltype(std::declval<Backend>()
130  .get_al_mpi_cuda_comm())::element_type;
131 
132 using ::distconv::get_channel_dim;
133 using ::distconv::get_sample_dim;
134 
135 int get_strided_mpi_rank(MPI_Comm comm);
136 MPI_Comm get_strided_mpi_comm(MPI_Comm comm);
137 
140 void initialize(MPI_Comm comm);
141 
144 void finalize();
145 
151 MPI_Comm get_mpi_comm();
152 
155 int get_mpi_rank();
156 
159 int get_mpi_num_ranks();
160 
163 bool is_mpi_root();
164 
167 int get_rank_stride();
168 
171 bool evaluate_performance();
172 
175 std::string get_convolution_fwd_algorithm();
176 
179 std::string get_convolution_bwd_data_algorithm();
180 
183 std::string get_convolution_bwd_filter_algorithm();
184 
187 std::string get_synthetic_data_reader_randgen();
188 
191 int get_number_of_pre_generated_synthetic_data();
192 
195 bool is_deterministic();
196 
199 int get_number_of_io_partitions();
200 
203 bool is_cosmoflow_parallel_io_enabled();
204 
205 #ifdef DISTCONV_HAS_P2P
206 
208 p2p::P2P& get_p2p();
209 #endif // DISTCONV_HAS_P2P
210 
213 AlCommType& get_hosttransfer();
214 
217 Backend& get_backend();
218 
221 ::distconv::HaloExchangeMethod get_halo_exchange_method();
222 
223 template <typename TensorDataType>
224 TensorShuffler<TensorDataType>*
225 get_tensor_shuffler(const TensorDev<TensorDataType>& src,
226  const TensorDev<TensorDataType>& dst);
227 
228 MPI_Comm get_input_comm(const lbann_comm& comm);
229 
232 int get_input_rank(const lbann_comm& comm);
233 
236 Dist get_hydrogen_data_parallel_distribution(int num_dims);
237 
238 template <typename Tensor>
239 void dump_tensor(const Tensor& t, const std::string& path)
240 {
241  dc::MPIPrintStreamDebug() << "Dumping tensor to " << path;
242  h2::gpu::sync();
243  distconv::dump_tensor(t, path, true);
244 }
245 
246 size_t get_workspace_capacity();
247 
248 int get_num_dims(const Layer& layer);
249 int get_num_spatial_dims(const Layer& layer);
250 
// Unless the including translation unit defines
// LBANN_UTILS_DISTCONV_INSTANTIATE, declare get_tensor_shuffler<T> as an
// extern template so it is not implicitly instantiated there.
#ifndef LBANN_UTILS_DISTCONV_INSTANTIATE
#define PROTO(T)                                                               \
  extern template TensorShuffler<T>* get_tensor_shuffler<T>(                   \
    const TensorDev<T>&, const TensorDev<T>&);

#define LBANN_INSTANTIATE_CPU_HALF
#define LBANN_INSTANTIATE_GPU_HALF
// NOTE(review): as shown, PROTO is never expanded between its definition and
// the #undef below. Presumably an include that applies PROTO to each
// supported type belongs here -- confirm against the upstream file.
#undef PROTO
#undef LBANN_INSTANTIATE_CPU_HALF
#undef LBANN_INSTANTIATE_GPU_HALF
#endif // LBANN_UTILS_DISTCONV_INSTANTIATE
264 
265 } // namespace dc
266 } // namespace lbann
267 
268 #endif // LBANN_HAS_DISTCONV
269 #endif // LBANN_UTILS_DISTCONV_HPP
::distconv::tensor::LocaleMPI LocaleMPI
::distconv::tensor::Tensor< TensorDataType, LocaleMPI, ::distconv::tensor::CUDAAllocator > TensorDev
::distconv::tensor::TensorMPICUDAShuffler< TensorDataType > TensorShuffler
::distconv::tensor::Shape Shape
world_comm_ptr initialize(int &argc, char **&argv)
void finalize(lbann_comm *comm=nullptr)
::distconv::tensor::AbstractTensor AbsTensor
::distconv::tensor::Distribution Dist