LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
data_reader_HDF5.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
26 #ifndef LBANN_DATA_READER_HDF5_REVISED_HPP
27 #define LBANN_DATA_READER_HDF5_REVISED_HPP
28 
33 
34 #include <set>
35 
36 // Forward declaration
38 
40 #define HDF5_METADATA_KEY_DIMS "dims"
41 #define HDF5_METADATA_KEY_CHANNELS "channels"
42 #define HDF5_METADATA_KEY_ORDERING "ordering"
43 #define HDF5_METADATA_KEY_SCALE "scale"
44 #define HDF5_METADATA_KEY_BIAS "bias"
45 #define HDF5_METADATA_KEY_LAYOUT "layout"
46 #define HDF5_METADATA_KEY_TRANSPOSE "transpose"
47 #define HDF5_METADATA_KEY_COERCE "coerce"
48 #define HDF5_METADATA_KEY_PACK "pack"
49 
50 #define HDF5_METADATA_VALUE_COERCE_FLOAT "float"
51 #define HDF5_METADATA_VALUE_COERCE_DOUBLE "double"
52 #define HDF5_METADATA_VALUE_COERCE_FLOAT64 "float64"
53 #define HDF5_METADATA_VALUE_COERCE_FLOAT16 "float16"
54 #define HDF5_METADATA_VALUE_LAYOUT_CHW "chw"
55 #define HDF5_METADATA_VALUE_LAYOUT_HWC "hwc"
56 #define HDF5_METADATA_VALUE_LAYOUT_CDHW "cdhw"
57 #define HDF5_METADATA_VALUE_LAYOUT_DHWC "dhwc"
58 
59 namespace lbann {
60 
61 bool is_hdf5_metadata_key_valid(std::string const& key);
62 bool is_hdf5_field_channels_last(conduit::Node const& field);
64  conduit::Node const& metadata);
69 std::string conduit_to_string(conduit::Node const& field);
70 
71 static std::set<std::string> const hdf5_metadata_valid_keys = {
81 };
82 
87  : public data_reader_sample_list<sample_list_hdf5<std::string>>
88 {
89 public:
90  hdf5_data_reader(bool shuffle = true);
93  hdf5_data_reader* copy() const override
94  {
95  return new hdf5_data_reader(*this);
96  }
97  void copy_members(const hdf5_data_reader& rhs);
98  ~hdf5_data_reader() override;
99 
100  bool has_conduit_output() override { return true; }
101 
102  std::string get_type() const override { return "hdf5_data_reader"; }
103 
112  void print_metadata(std::ostream& os = std::cout);
113 
114  void load() override;
115 
116  bool fetch_conduit_node(conduit::Node& sample, int data_id) override;
117 
119  void set_experiment_schema_filename(std::string fn)
120  {
122  }
123 
125  const std::string& get_experiment_schema_filename()
126  {
128  }
129 
131  void set_data_schema_filename(std::string fn) { m_data_schema_filename = fn; }
132 
134  const std::string& get_data_schema_filename()
135  {
136  return m_data_schema_filename;
137  }
138 
139  const std::vector<El::Int> get_data_dims() const override
140  {
142  }
143 
144  int get_linearized_data_size() const override
145  {
147  }
148 
149  int get_linearized_response_size() const override
150  {
152  }
153 
154  int get_linearized_label_size() const override
155  {
157  }
158 
159  int get_num_labels() const override { return get_linearized_label_size(); }
160 
161  int get_num_responses() const override
162  {
164  }
165 
167  conduit::Node get_experiment_schema() const { return m_experiment_schema; }
169  conduit::Node get_data_schema() const { return m_data_schema; }
171  void set_experiment_schema(const conduit::Node& s);
173  void set_data_schema(const conduit::Node& s);
175  std::unordered_map<std::string, conduit::Node> get_node_map() const
176  {
177  return m_useme_node_map;
178  }
179 
189  void adjust_metadata(conduit::Node* root);
190 
191 private:
195  std::unordered_map<std::string, std::vector<El::Int>> m_data_dims_lookup_table;
196 
200  std::unordered_map<std::string, int> m_linearized_size_lookup_table;
201 
203 
205 
206  // To set this to false, pass the command-line flag --keep_packed_fields.
207  // I don't know of a use case for keeping an original (possibly coerced)
208  // field once it has been packed, but someone else might.
209  // Note that setting this to 'false' incurs both a memory and a
210  // communication penalty.
212 
214  {
215  std::string group_name;
216  conduit::index_t data_type;
217  size_t n_elts;
218  std::vector<std::string> names;
219  std::vector<size_t> sizes;
220  std::vector<conduit::index_t> data_types;
221  };
222 
223  std::unordered_map<std::string, PackingGroup> m_packing_groups;
224 
228  const std::string s_metadata_node_name = "metadata";
229 
230  const std::string s_composite_node = "composite_node";
231 
242  std::unordered_map<std::string, conduit::Node*> m_useme_node_map_ptrs;
244  std::unordered_map<std::string, conduit::Node> m_useme_node_map;
245 
250  conduit::Node m_experiment_schema;
251 
257  conduit::Node m_data_schema;
258 
262  std::unordered_map<std::string, conduit::Node*> m_data_map;
263 
265  std::unordered_set<std::string> m_add_to_map;
266 
267  //=========================================================================
268  // methods follow
269  //=========================================================================
270 
271  const std::vector<El::Int> get_data_dims(std::string name = "") const;
272 
274  int get_linearized_size(data_field_type const& data_field) const override;
275 
277  void load_sample_schema(conduit::Schema& s);
278 
282  void parse_schemas();
283 
288  void get_schema_ptrs(
289  conduit::Node* starting_node,
290  std::unordered_map<std::string, conduit::Node*>& schema_name_map);
291 
297  void get_leaves(conduit::Node* node_in,
298  std::unordered_map<std::string, conduit::Node*>& leaves_out);
299 
306  void
307  get_leaves_multi(conduit::Node* node_in,
308  std::unordered_map<std::string, conduit::Node*>& leaves_out);
309 
310  void do_preload_data_store() override;
311 
316  void load_sample(conduit::Node& node,
317  hid_t file_handle,
318  const std::string& sample_name,
319  bool ignore_failure = false);
320 
323  void load_sample_from_sample_list(conduit::Node& node,
324  size_t index,
325  bool ignore_failure = false);
326 
328  void pack_data(conduit::Node& node_in_out);
329 
331  void load_schema(std::string filename, conduit::Node& schema);
332 
334  void pack(conduit::Node& node, size_t index);
335 
340  conduit::Node merge_metadata_nodes(const conduit::Node* node_A,
341  const conduit::Node* node_B);
342 
344  void build_packing_map(conduit::Node& node);
345 
347  void repack_image(conduit::Node& node,
348  const std::string& path,
349  const conduit::Node& metadata);
350 
352  void coerce(const conduit::Node& metadata,
353  hid_t file_handle,
354  const std::string& original_path,
355  const std::string& new_pathname,
356  conduit::Node& node);
357 
358  void normalize(conduit::Node& node,
359  const std::string& path,
360  const conduit::Node& metadata);
361 
364  void construct_linearized_size_lookup_tables(conduit::Node& node);
365 
367  void test_that_all_nodes_contain_metadata(conduit::Node& node);
368 
370  void set_delete_packed_fields(bool flag) { m_delete_packed_fields = flag; }
371 
372  //=========================================================================
373  // template declarations follow
374  //=========================================================================
375 
380  template <typename T>
381  void pack(std::string const& group_name, conduit::Node& node, size_t index);
382 
386  bool is_composite_node(const conduit::Node& node) const;
387 
388  // Designate a whitebox testing friend
389  friend class ::DataReaderHDF5WhiteboxTester;
390 
391 }; // END: class hdf5_data_reader
392 
393 } // namespace lbann
394 
395 #endif // LBANN_DATA_READER_HDF5_REVISED_HPP
int get_linearized_size(data_field_type const &data_field) const override
void load_sample(conduit::Node &node, hid_t file_handle, const std::string &sample_name, bool ignore_failure=false)
void get_schema_ptrs(conduit::Node *starting_node, std::unordered_map< std::string, conduit::Node *> &schema_name_map)
std::string conduit_to_string(conduit::Node const &field)
std::unordered_map< std::string, int > m_linearized_size_lookup_table
bool is_hdf5_metadata_key_valid(std::string const &key)
void test_that_all_nodes_contain_metadata(conduit::Node &node)
#define HDF5_METADATA_KEY_COERCE
std::unordered_set< std::string > m_add_to_map
int get_num_responses() const override
Return the number of responses in this dataset.
bool is_hdf5_field_channels_last(conduit::Node const &field)
#define HDF5_METADATA_KEY_DIMS
static std::set< std::string > const hdf5_metadata_valid_keys
#define HDF5_METADATA_KEY_PACK
const std::string & get_experiment_schema_filename()
Returns the name of the yaml experiment file.
const std::string s_composite_node
#define HDF5_METADATA_KEY_BIAS
#define INPUT_DATA_TYPE_LABELS
conduit::Node merge_metadata_nodes(const conduit::Node *node_A, const conduit::Node *node_B)
std::unordered_map< std::string, PackingGroup > m_packing_groups
hdf5_data_reader(bool shuffle=true)
conduit::Node get_data_schema() const
this method is made public for testing
const std::string s_metadata_node_name
std::string get_type() const override
void set_data_schema_filename(std::string fn)
Sets the name of the yaml data file.
#define HDF5_METADATA_KEY_CHANNELS
void adjust_metadata(conduit::Node *root)
this method is made public for testing
#define HDF5_METADATA_KEY_TRANSPOSE
void construct_linearized_size_lookup_tables()
void pack(conduit::Node &node, size_t index)
std::vector< conduit::index_t > data_types
int get_num_labels() const override
Return the number of labels (classes) in this dataset.
void print_metadata(std::ostream &os=std::cout)
Prints metadata and data-types for all field-names.
void load_sample_from_sample_list(conduit::Node &node, size_t index, bool ignore_failure=false)
const std::string & get_data_schema_filename()
Returns the name of the yaml data file.
void load() override
int get_linearized_response_size() const override
Get the linearized size (i.e. number of elements) in a response.
int get_linearized_label_size() const override
Get the linearized size (i.e. number of elements) in a label.
std::string m_experiment_schema_filename
#define INPUT_DATA_TYPE_SAMPLES
std::unordered_map< std::string, conduit::Node > get_node_map() const
this method is made public for testing
std::unordered_map< std::string, std::vector< El::Int > > m_data_dims_lookup_table
void normalize(conduit::Node &node, const std::string &path, const conduit::Node &metadata)
bool fetch_conduit_node(conduit::Node &sample, int data_id) override
conduit::Node get_experiment_schema() const
this method is made public for testing
std::unordered_map< std::string, conduit::Node * > m_data_map
hdf5_data_reader * copy() const override
void get_leaves(conduit::Node *node_in, std::unordered_map< std::string, conduit::Node *> &leaves_out)
#define HDF5_METADATA_KEY_SCALE
#define INPUT_DATA_TYPE_RESPONSES
void set_experiment_schema(const conduit::Node &s)
this method is made public for testing
const std::vector< El::Int > get_data_dims() const override
Get the dimensions of the data.
std::unordered_map< std::string, conduit::Node * > m_useme_node_map_ptrs
void build_packing_map(conduit::Node &node)
void repack_image(conduit::Node &node, const std::string &path, const conduit::Node &metadata)
void do_preload_data_store() override
void set_data_schema(const conduit::Node &s)
this method is made public for testing
void set_delete_packed_fields(bool flag)
~hdf5_data_reader() override
bool does_hdf5_field_require_repack_to_channels_first(conduit::Node const &metadata)
void get_leaves_multi(conduit::Node *node_in, std::unordered_map< std::string, conduit::Node *> &leaves_out)
void pack_data(conduit::Node &node_in_out)
std::string data_field_type
std::unordered_map< std::string, conduit::Node > m_useme_node_map
int get_linearized_data_size() const override
Get the linearized size (i.e. number of elements) in a sample.
#define HDF5_METADATA_KEY_LAYOUT
void set_experiment_schema_filename(std::string fn)
Sets the name of the yaml experiment file.
void load_sample_schema(conduit::Schema &s)
void load_schema(std::string filename, conduit::Node &schema)
void copy_members(const hdf5_data_reader &rhs)
#define HDF5_METADATA_KEY_ORDERING
bool is_composite_node(const conduit::Node &node) const
bool has_conduit_output() override
hdf5_data_reader & operator=(const hdf5_data_reader &)
void coerce(const conduit::Node &metadata, hid_t file_handle, const std::string &original_path, const std::string &new_pathname, conduit::Node &node)