LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
utils/summary.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the License.
26 
27 // lbann_summary - Write summary statistics to TensorBoard
28 
29 #ifndef LBANN_SUMMARY_HPP_INCLUDED
30 #define LBANN_SUMMARY_HPP_INCLUDED
31 
32 #include "lbann/base.hpp"
33 #include "lbann/comm.hpp"
34 #include <string>
#include <type_traits>
#include <utility>
35 #include <vector>
36 
37 #ifdef LBANN_HAS_TBINF
38 #include "TBinf.hpp"
39 #endif
40 
41 namespace lbann {
42 
/// Alias for whichever of the two types has the larger `sizeof`.
///
/// The selection is `sizeof(T) > sizeof(U) ? T : U`, so when both types have
/// the same size the comparison is false and `U` is chosen.
///
/// @tparam T First candidate type; selected only when strictly larger than U.
/// @tparam U Second candidate type; selected when at least as large as T.
template <typename T, typename U>
using BiggerOf =
  typename std::conditional<(sizeof(T) > sizeof(U)), T, U>::type;
45 
46 #ifdef LBANN_HAS_TBINF
47 
64 class lbann_summary
65 {
66 public:
// Construction and teardown. Only the declarations are visible here.
// NOTE(review): logdir is presumably the directory handed to the TBinf
// summary writer (see m_sw) and comm the communicator stored in m_comm --
// confirm against the implementation file.
72  lbann_summary(std::string logdir, lbann_comm* comm);
73  ~lbann_summary();
74 
// Matrix summaries. Each member is templated on the matrix element type and
// takes a tag string, a distributed matrix, and an integer step.
// NOTE(review): judging by the m_pending_* queues and flush_* members declared
// further down, these calls presumably enqueue a local result that flush()
// later reduces and writes -- confirm against the implementation file.
76  template <typename TensorDataType>
77  void reduce_mean(const std::string tag,
78  const El::AbstractDistMatrix<TensorDataType>& mat,
79  int step);
81  template <typename TensorDataType>
82  void reduce_min(const std::string tag,
83  const El::AbstractDistMatrix<TensorDataType>& mat,
84  int step);
86  template <typename TensorDataType>
87  void reduce_max(const std::string tag,
88  const El::AbstractDistMatrix<TensorDataType>& mat,
89  int step);
91  template <typename TensorDataType>
92  void reduce_stdev(const std::string tag,
93  const El::AbstractDistMatrix<TensorDataType>& mat,
94  int step);
// Scalar summaries. Same tag/step arguments, but the value is a single scalar
// of the template type rather than a matrix.
// NOTE(review): the difference between the plain, sum_ and _all variants is
// not visible in this header -- see the definitions before relying on it.
96  template <typename TensorDataType>
97  void reduce_scalar(const std::string tag, TensorDataType s, int step);
99  template <typename TensorDataType>
100  void sum_reduce_scalar(const std::string tag, TensorDataType s, int step);
102  template <typename TensorDataType>
103  void reduce_scalar_all(const std::string tag, TensorDataType s, int step);
105  template <typename TensorDataType>
106  void reduce_histogram(const std::string tag,
107  const El::AbstractDistMatrix<TensorDataType>& mat,
108  int step);
110  template <typename TensorDataType>
111  void reduce_2norm(const std::string tag,
112  const El::AbstractDistMatrix<TensorDataType>& mat,
113  int step);
// Image reporting. Parameter names are commented out in this declaration; the
// comments record the intended names (tag, img_format, image, dims, step).
114  void report_image(std::string const& /*tag*/,
115  std::string const& /*img_format*/,
116  CPUMat const& /*image*/,
117  std::vector<int> const& /*dims*/,
118  int /*step*/);
// NOTE(review): presumably runs the private flush_* members declared below to
// drain the pending queues -- confirm against the implementation file.
124  void flush();
125 
126 private:
127  lbann_comm* m_comm;
128  TBinf::SummaryWriter* m_sw;
129 
/// Record of one queued scalar operation: a tag, a step number, up to two
/// float values and an integer count.
///
/// The tag is taken by value and moved into the member, so a caller passing a
/// temporary pays for no string copy and a caller passing an lvalue pays for
/// exactly one (previously the string was copied a second time into `tag`).
struct pending_op
{
  /// @param tag_    Tag stored with the operation.
  /// @param step_   Step number stored with the operation.
  /// @param local_  First float value.
  /// @param local2_ Second float value; defaults to 0.
  /// @param num_    Integer count; defaults to 0.
  pending_op(std::string tag_,
             int step_,
             float local_,
             float local2_ = 0.0f,
             int num_ = 0)
    : tag(std::move(tag_)),
      step(step_),
      local(local_),
      local2(local2_),
      num(num_)
  {}
  /// Tag the operation was queued under; immutable after construction.
  const std::string tag;
  /// Step number the operation was queued for.
  int step;
  /// First value. NOTE(review): presumably this rank's local contribution --
  /// confirm against the flush_* implementations.
  float local;
  /// Second value, 0 unless supplied. NOTE(review): presumably a second
  /// accumulator such as a sum of squares -- confirm.
  float local2;
  /// Count, 0 unless supplied. NOTE(review): presumably the number of
  /// elements behind `local` -- confirm.
  int num;
};
/// Record of one queued histogram: a tag, a step number, a vector of bucket
/// values and five summary doubles (min, max, num, sum, sqsum).
///
/// The tag and the bucket vector are taken by value and moved into the
/// members, so each is copied at most once (previously both were copied a
/// second time in the member initializers).
struct pending_histogram
{
  /// @param tag_     Tag stored with the histogram.
  /// @param step_    Step number stored with the histogram.
  /// @param buckets_ Per-bucket values; ownership is transferred to the record.
  /// @param min_     Stored in `min`.
  /// @param max_     Stored in `max`.
  /// @param num_     Stored in `num`.
  /// @param sum_     Stored in `sum`.
  /// @param sqsum_   Stored in `sqsum`.
  pending_histogram(std::string tag_,
                    int step_,
                    std::vector<double> buckets_,
                    double min_,
                    double max_,
                    double num_,
                    double sum_,
                    double sqsum_)
    : tag(std::move(tag_)),
      step(step_),
      buckets(std::move(buckets_)),
      min(min_),
      max(max_),
      num(num_),
      sum(sum_),
      sqsum(sqsum_)
  {}
  /// Tag the histogram was queued under; immutable after construction.
  const std::string tag;
  /// Step number the histogram was queued for.
  int step;
  /// Per-bucket values. NOTE(review): presumably counts per bucket -- confirm.
  std::vector<double> buckets;
  /// NOTE(review): presumably the smallest observed value -- confirm.
  double min;
  /// NOTE(review): presumably the largest observed value -- confirm.
  double max;
  /// NOTE(review): presumably the number of observed values -- confirm.
  double num;
  /// NOTE(review): presumably the sum of observed values -- confirm.
  double sum;
  /// NOTE(review): presumably the sum of squared values -- confirm.
  double sqsum;
};
192 
// Queued work, one vector per kind of summary; every entry except the
// histogram ones is a pending_op.
// NOTE(review): each queue is presumably filled by the reduce_* member of the
// same name and drained by the flush_* member of the same name -- confirm
// against the implementation file.
194  std::vector<pending_op> m_pending_means;
196  std::vector<pending_op> m_pending_mins;
198  std::vector<pending_op> m_pending_maxes;
200  std::vector<pending_op> m_pending_stdevs;
202  std::vector<pending_op> m_pending_scalars;
204  std::vector<pending_op> m_pending_sum_scalars;
206  std::vector<pending_op> m_pending_scalar_alls;
// NOTE(review): presumably the bucket boundaries used for every histogram --
// confirm where this vector is populated.
208  std::vector<double> m_histogram_buckets;
210  std::vector<pending_histogram> m_pending_histograms;
211 
// One private flush step per queue above; declared here, defined elsewhere.
213  void flush_means();
215  void flush_mins();
217  void flush_maxes();
219  void flush_stdevs();
221  void flush_scalars();
223  void flush_sum_scalars();
225  void flush_scalar_alls();
227  void flush_histograms();
228 
230  template <typename TensorDataType>
231  auto local_sum(const El::AbstractMatrix<TensorDataType>& mat) const
234  template <typename TensorDataType, typename AccumT>
235  void local_sum_sqsum(const El::AbstractMatrix<TensorDataType>& mat,
236  AccumT& sum,
237  AccumT& sqsum) const;
239  template <typename TensorDataType>
240  auto local_min(const El::AbstractMatrix<TensorDataType>& mat) const
243  template <typename TensorDataType>
244  auto local_max(const El::AbstractMatrix<TensorDataType>& mat) const
247  template <typename TensorDataType>
248  auto local_2norm(const El::AbstractMatrix<TensorDataType>& mat) const
251  std::string prepend_model(const std::string tag, int model) const;
253  void gather_scalar_summary(const std::string tag, float s, int step);
255  void gather_scalar_summary(const std::vector<pending_op>& ops,
256  std::vector<float>& scalars);
257 };
258 
259 #else
260 
263 {
264 public:
265  lbann_summary(std::string logdir, lbann_comm* comm) {}
266 
267  void report_image(std::string const& tag,
268  std::string const& img_format,
269  CPUMat const& image,
270  std::vector<int> const& dims,
271  int step)
272  {}
273 
274  template <typename TensorDataType>
275  void reduce_mean(const std::string tag,
276  const El::AbstractDistMatrix<TensorDataType>& mat,
277  int step)
278  {}
279  template <typename TensorDataType>
280  void reduce_min(const std::string tag,
281  const El::AbstractDistMatrix<TensorDataType>& mat,
282  int step)
283  {}
284  template <typename TensorDataType>
285  void reduce_max(const std::string tag,
286  const El::AbstractDistMatrix<TensorDataType>& mat,
287  int step)
288  {}
289  template <typename TensorDataType>
290  void reduce_stdev(const std::string tag,
291  const El::AbstractDistMatrix<TensorDataType>& mat,
292  int step)
293  {}
294  template <typename TensorDataType>
295  void reduce_scalar(const std::string tag, TensorDataType s, int step)
296  {}
297  template <typename TensorDataType>
298  void sum_reduce_scalar(const std::string tag, TensorDataType s, int step)
299  {}
300  template <typename TensorDataType>
301  void reduce_scalar_all(const std::string tag, TensorDataType s, int step)
302  {}
303  template <typename TensorDataType>
304  void reduce_histogram(const std::string tag,
305  const El::AbstractDistMatrix<TensorDataType>& mat,
306  int step)
307  {}
308  template <typename TensorDataType>
309  void reduce_2norm(const std::string tag,
310  const El::AbstractDistMatrix<TensorDataType>& mat,
311  int step)
312  {}
313  void flush() {}
314 };
315 
316 #endif // LBANN_HAS_TBINF
317 
318 } // namespace lbann
319 
320 #endif // LBANN_SUMMARY_HPP_INCLUDED
void reduce_mean(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)
void reduce_stdev(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)
void reduce_scalar(const std::string tag, TensorDataType s, int step)
lbann_summary(std::string logdir, lbann_comm *comm)
void sum_reduce_scalar(const std::string tag, TensorDataType s, int step)
void reduce_2norm(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)
Abstract base class for neural network models.
Definition: model.hpp:83
void report_image(std::string const &tag, std::string const &img_format, CPUMat const &image, std::vector< int > const &dims, int step)
El::Matrix< DataType, El::Device::CPU > CPUMat
Definition: base.hpp:116
typename std::conditional<(sizeof(T) > sizeof(U)), T, U >::type BiggerOf
void reduce_histogram(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)
void reduce_scalar_all(const std::string tag, TensorDataType s, int step)
void reduce_min(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)
void reduce_max(const std::string tag, const El::AbstractDistMatrix< TensorDataType > &mat, int step)