LBANN  0.103.0
LivermoreBigArtificialNeuralNetworkToolkit
checkpoint.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
25 //
26 // checkpoint .hpp .cpp - Callback hooks to checkpoint model
28 #ifndef LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED
29 #define LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED
30 
32 #include "lbann/io/persist.hpp"
34 
35 namespace lbann {
36 
37 // Forward-declarations
38 class TrainingAlgorithm;
39 
40 namespace callback {
41 
/** @brief Phase tag accepted by checkpoint::need_checkpoint().
 *
 *  Enumerators keep their implicit values (batch == 0 ... invalid == 4).
 */
enum class callback_phase
{
  batch,      ///< Batch phase.
  epoch,      ///< Epoch phase.
  validation, ///< Validation phase.
  inference,  ///< Inference phase.
  invalid     ///< Sentinel; presumably "no valid phase" -- confirm in callers.
};
50 
52 class checkpoint : public callback_base
53 {
54 public:
71  checkpoint(std::string checkpoint_dir,
72  std::string restart_dir,
73  int checkpoint_epochs,
74  int checkpoint_steps,
75  int checkpoint_secs,
76  std::string per_rank_dir,
77  int ckpt_dist_epochs,
78  int ckpt_dist_steps)
79  : callback_base(),
80  m_active_trainer(nullptr),
81  m_active_training_algorithm(nullptr),
82  m_checkpoint_dir(std::move(checkpoint_dir)),
83  m_restart_dir(std::move(restart_dir)),
84  m_checkpoint_epochs(checkpoint_epochs),
85  m_checkpoint_steps(checkpoint_steps),
86  m_checkpoint_secs(checkpoint_secs),
87  m_per_rank_dir(per_rank_dir),
88  m_ckpt_dist_epochs(ckpt_dist_epochs),
89  m_ckpt_dist_steps(ckpt_dist_steps)
90  {}
91  checkpoint(const checkpoint&) = default;
92  checkpoint& operator=(const checkpoint&) = default;
93  checkpoint* copy() const override { return new checkpoint(*this); }
94  void setup(model* m) override;
95  void setup(trainer* t) override;
96  void on_train_begin(model* m) override;
97  void on_train_end(model* m) override;
98  void on_epoch_begin(model* m) override;
99  void on_batch_begin(model* m) override;
100  void on_validation_begin(model* m) override;
101 
102  inline void set_checkpoint_dir(const std::string& dir)
103  {
104  m_checkpoint_dir = dir;
105  }
106 
107  inline const std::string& get_checkpoint_dir() { return m_checkpoint_dir; }
108 
109  inline void set_restart_dir(const std::string& dir) { m_restart_dir = dir; }
110 
111  inline const std::string& get_restart_dir()
112  {
113  // If the restart directory has been explicitly defined use that
114  if (m_restart_dir.length() != 0) {
115  return m_restart_dir;
116  }
117  else {
118  return m_checkpoint_dir;
119  }
120  }
121 
122  inline void set_active_trainer(trainer* t) { m_active_trainer = t; }
123 
124  trainer& get_active_trainer();
125 
127  {
128  m_active_training_algorithm = t;
129  }
130 
131  TrainingAlgorithm& get_active_training_algorithm();
132 
133  inline void set_checkpoint_epochs(int epochs)
134  {
135  m_checkpoint_epochs = epochs;
136  }
137 
138  inline void set_checkpoint_steps(int steps) { m_checkpoint_steps = steps; }
139 
140  inline void set_checkpoint_secs(EvalType secs) { m_checkpoint_secs = secs; }
141 
142  inline void set_per_rank_dir(std::string dir) { m_per_rank_dir = dir; }
143 
144  inline const std::string& get_per_rank_dir() { return m_per_rank_dir; }
145 
146  inline void set_ckpt_dist_epochs(int ckpt_dist_epochs)
147  {
148  m_ckpt_dist_epochs = ckpt_dist_epochs;
149  }
150 
151  inline void set_ckpt_dist_steps(int ckpt_dist_steps)
152  {
153  m_ckpt_dist_steps = ckpt_dist_steps;
154  }
155 
156  inline std::string get_shared_checkpoint_rootdir()
157  {
158  return get_restart_dir();
159  }
160 
164  {
165  if (m_per_rank_dir.length()) {
166  return get_per_rank_dir() + "/" + get_restart_dir();
167  }
168  else {
169  return get_restart_dir();
170  }
171  }
172 
173  bool need_checkpoint(model* m, callback_phase phase);
174  std::string find_latest_checkpoint(lbann_comm& comm,
175  const std::string& trainer_name,
176  const std::string& alg_name,
177  visitor_hook& hook,
178  execution_mode& mode,
179  size_t& epoch,
180  size_t& step,
181  bool& shared);
182  bool open_latest_checkpoint(
183  lbann_comm& comm,
184  const std::string& task_label,
185  const std::string& trainer_name,
186  const std::string& alg_name,
187  std::function<bool(/*const */ persist&)> reload_shared_ckpt,
188  std::function<bool(/*const */ persist&)> reload_distributed_ckpt);
189  bool reload_model(model* m);
190  bool reload_trainer(trainer* t);
191  bool restart(model* m);
192  std::string name() const override { return "checkpoint"; }
193 
194 private:
196  void write_specific_proto(lbann_data::Callback& proto) const final;
197 
198  bool do_checkpoint(model* m, visitor_hook hook);
199  void do_distributed_checkpoint(lbann_comm& comm,
200  trainer& t,
201  model& m,
202  visitor_hook hook,
203  execution_mode mode,
204  persist& p,
205  size_t epoch,
206  size_t step);
207  void do_shared_checkpoint(lbann_comm& comm,
208  trainer& t,
209  model& m,
210  visitor_hook hook,
211  execution_mode mode,
212  persist& p,
213  size_t epoch,
214  size_t step);
215 
216 private:
219  std::string m_checkpoint_dir;
220  // If the restart directory is not explicitly set, default to the
221  // checkpoint directory
222  std::string m_restart_dir;
226  std::string m_per_rank_dir;
232 
233  template <size_t _max_dir_len>
234  struct header_t
235  {
238  int epoch;
239  int step;
240  int shared;
241  char dirname[_max_dir_len];
242  };
243 };
244 
245 std::string get_trainer_checkpoint_dirname(const std::string& trainer_name,
246  const std::string& dir);
247 
248 std::string get_last_shared_checkpoint_filename(const std::string& alg_name,
249  const std::string& dir);
250 
251 std::string get_last_shared_checkpoint_filename(const std::string& trainer_name,
252  const std::string& alg_name,
253  const std::string& dir);
254 
255 std::string get_shared_checkpoint_dirname(const std::string& alg_name,
256  const std::string& dir,
257  visitor_hook hook,
258  execution_mode mode,
259  size_t epoch,
260  size_t step);
261 
262 std::string get_shared_checkpoint_dirname(const std::string& trainer_name,
263  const std::string& alg_name,
264  const std::string& dir,
265  visitor_hook hook,
266  execution_mode mode,
267  size_t epoch,
268  size_t step);
269 
270 std::string
271 get_last_distributed_checkpoint_filename(const std::string& alg_name,
272  const std::string& dir);
273 
274 std::string
275 get_last_distributed_checkpoint_filename(const std::string& trainer_name,
276  const std::string& alg_name,
277  const std::string& dir);
278 
279 std::string get_distributed_checkpoint_dirname(const std::string& alg_name,
280  const int rank_in_trainer,
281  const std::string& dir,
282  visitor_hook hook,
283  execution_mode mode,
284  size_t epoch,
285  size_t step);
286 
287 std::string get_distributed_checkpoint_dirname(const std::string& trainer_name,
288  const std::string& alg_name,
289  const int rank_in_trainer,
290  const std::string& dir,
291  visitor_hook hook,
292  execution_mode mode,
293  size_t epoch,
294  size_t step);
295 
296 // Print last checkpoint to file, used to determine which checkpoint to load
297 // from.
298 bool write_latest(std::string filename,
299  visitor_hook hook,
300  execution_mode mode,
301  size_t epoch,
302  size_t train);
303 
307 bool read_latest(std::string filename,
308  visitor_hook* hook,
309  execution_mode* mode,
310  size_t* epochLast,
311  size_t* trainLast);
312 
313 // Builder function
314 std::unique_ptr<callback_base>
315 build_checkpoint_callback_from_pbuf(const google::protobuf::Message&);
316 
317 } // namespace callback
318 } // namespace lbann
319 
320 #endif // LBANN_CALLBACKS_CALLBACK_CHECKPOINT_HPP_INCLUDED
void set_checkpoint_dir(const std::string &dir)
Definition: checkpoint.hpp:102
std::string get_shared_checkpoint_dirname(const std::string &alg_name, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
checkpoint(std::string checkpoint_dir, std::string restart_dir, int checkpoint_epochs, int checkpoint_steps, int checkpoint_secs, std::string per_rank_dir, int ckpt_dist_epochs, int ckpt_dist_steps)
Construct the checkpoint callback.
Definition: checkpoint.hpp:71
const std::string & get_restart_dir()
Definition: checkpoint.hpp:111
void set_checkpoint_epochs(int epochs)
Definition: checkpoint.hpp:133
void set_active_training_algorithm(TrainingAlgorithm *t)
Definition: checkpoint.hpp:126
std::string get_distributed_checkpoint_dirname(const std::string &alg_name, const int rank_in_trainer, const std::string &dir, visitor_hook hook, execution_mode mode, size_t epoch, size_t step)
std::string get_last_distributed_checkpoint_filename(const std::string &alg_name, const std::string &dir)
TrainingAlgorithm * m_active_training_algorithm
Definition: checkpoint.hpp:218
void set_ckpt_dist_steps(int ckpt_dist_steps)
Definition: checkpoint.hpp:151
std::string get_distributed_checkpoint_rootdir()
Definition: checkpoint.hpp:163
void set_per_rank_dir(std::string dir)
Definition: checkpoint.hpp:142
Base class for callbacks during training/testing.
Definition: callback.hpp:76
bool write_latest(std::string filename, visitor_hook hook, execution_mode mode, size_t epoch, size_t train)
Abstract base class for neural network models.
Definition: model.hpp:83
Checkpoint at given interval in given directory.
Definition: checkpoint.hpp:52
std::string get_trainer_checkpoint_dirname(const std::string &trainer_name, const std::string &dir)
execution_mode
Neural network execution mode.
Definition: base.hpp:229
void set_restart_dir(const std::string &dir)
Definition: checkpoint.hpp:109
const std::string & get_checkpoint_dir()
Definition: checkpoint.hpp:107
void set_ckpt_dist_epochs(int ckpt_dist_epochs)
Definition: checkpoint.hpp:146
User-facing class that represents a set of compute resources.
Definition: trainer.hpp:60
std::string get_last_shared_checkpoint_filename(const std::string &alg_name, const std::string &dir)
checkpoint * copy() const override
Definition: checkpoint.hpp:93
void set_active_trainer(trainer *t)
Definition: checkpoint.hpp:122
void set_checkpoint_secs(EvalType secs)
Definition: checkpoint.hpp:140
void set_checkpoint_steps(int steps)
Definition: checkpoint.hpp:138
std::string get_shared_checkpoint_rootdir()
Definition: checkpoint.hpp:156
visitor_hook
Neural network execution mode.
std::string name() const override
Return this callback's name.
Definition: checkpoint.hpp:192
std::unique_ptr< callback_base > build_checkpoint_callback_from_pbuf(const google::protobuf::Message &)
Base class for LBANN training_algorithms.
double EvalType
Definition: base.hpp:189
const std::string & get_per_rank_dir()
Definition: checkpoint.hpp:144
bool read_latest(std::string filename, visitor_hook *hook, execution_mode *mode, size_t *epochLast, size_t *trainLast)
Reads the "latest" file and returns the epoch number and sample offset for most recent checkpoint...