27 #ifndef _DATA_READER_JAG_CONDUIT_HPP_ 28 #define _DATA_READER_JAG_CONDUIT_HPP_ 30 #include "lbann_config.hpp" 32 #include "conduit/conduit.hpp" 39 #include <unordered_map> 42 #ifdef _USE_IO_HANDLE_ 50 class data_store_conduit;
61 conduit::float32_array;
69 #ifdef _USE_IO_HANDLE_ 76 using sample_t = std::pair<sample_file_id_t, sample_name_t>;
95 using TypeID = conduit::DataType::TypeID;
99 using prefix_t = std::pair<std::string, size_t>;
110 void setup(
int num_io_threads,
113 std::string
get_type()
const override {
return "data_reader_jag_conduit"; }
117 const std::vector<std::vector<variable_t>>& independent);
120 const std::vector<std::vector<variable_t>>& dependent);
141 void set_image_dims(
const int width,
const int height,
const int ch = 1);
170 #ifndef _JAG_OFFLINE_TOOL_MODE_ 171 void load()
override;
185 #endif // _JAG_OFFLINE_TOOL_MODE_ 223 bool& is_supported)
override;
234 conduit::Node& sample)
const;
237 std::vector<input_t>
get_inputs(
const size_t i, conduit::Node& sample)
const;
239 template <
typename S>
241 add_val(
const std::string key,
const conduit::Node& n, std::vector<S>& vals);
273 bool filter(
const std::set<std::string>& key_filter,
274 const std::vector<prefix_t>& prefix_filter,
275 const std::string& name)
const;
284 const std::vector<std::vector<data_reader_jag_conduit::variable_t>>& var)
292 static std::string
to_string(
const std::vector<variable_t>& vec);
295 static std::string
to_string(
const std::vector<std::vector<variable_t>>& vec);
297 virtual std::vector<CPUMat>
299 const std::vector<size_t>& sizes,
300 const int mb_idx)
const;
306 conduit::Node& sample,
310 const std::string tag);
315 #ifndef _JAG_OFFLINE_TOOL_MODE_ 343 #endif // _JAG_OFFLINE_TOOL_MODE_ 346 hid_t open_conduit_file(
const std::string& conduit_file_path);
350 void open_all_conduit_files();
368 const std::string& path,
369 conduit::Node& n)
const;
373 const std::string key);
380 const std::string& key,
381 conduit::Node& node)
const;
386 std::vector<std::vector<DataType>>
488 const std::string& sample_name,
489 const std::string& field_name,
491 conduit::Node& node);
500 template <
typename TN, conduit::DataType::TypeID TC>
505 #define _LBANN_CONDUIT_DTYPE_INSTANTIATION_(TN, TC) \ 507 struct is_same<TN, TC> : std::true_type \ 523 #undef _LBANN_CONDUIT_DTYPE_INSTANTIATION_ 527 template <
typename TN>
531 case conduit::DataType::INT8_ID:
533 case conduit::DataType::INT16_ID:
535 case conduit::DataType::INT32_ID:
537 case conduit::DataType::INT64_ID:
539 case conduit::DataType::UINT8_ID:
541 case conduit::DataType::UINT16_ID:
543 case conduit::DataType::UINT32_ID:
545 case conduit::DataType::UINT64_ID:
547 case conduit::DataType::FLOAT32_ID:
549 case conduit::DataType::FLOAT64_ID:
551 case conduit::DataType::CHAR8_STR_ID:
564 template <
typename S>
566 const conduit::Node& n,
567 std::vector<S>& vals)
571 switch (n.dtype().id()) {
572 case TypeID::OBJECT_ID: {
577 conduit::NodeConstIterator itr = n.children();
578 while (itr.has_next()) {
579 const conduit::Node& n_child = itr.next();
580 cnt +=
add_val(itr.name(), n_child, vals);
583 case TypeID::LIST_ID: {
588 conduit::NodeConstIterator itr = n.children();
589 while (itr.has_next()) {
590 const conduit::Node& n_child = itr.next();
591 cnt +=
add_val(itr.name(), n_child, vals);
594 case TypeID::INT8_ID:
595 case TypeID::INT16_ID:
596 case TypeID::INT32_ID:
597 case TypeID::INT64_ID:
598 case TypeID::UINT8_ID:
599 case TypeID::UINT16_ID:
600 case TypeID::UINT32_ID:
601 case TypeID::UINT64_ID:
602 case TypeID::FLOAT32_ID:
603 case TypeID::FLOAT64_ID:
607 vals.push_back(static_cast<S>(n.to_value()));
609 case TypeID::CHAR8_STR_ID: {
619 const char* c_str = n.as_char8_str();
621 const std::string str =
624 : std::string(c_str, n.dtype().number_of_elements()))
628 const S v =
static_cast<S
>(atof(str.c_str()));
634 case TypeID::EMPTY_ID:
637 std::string(
"data_reader_jag_conduit::add_val() : invalid dtype (") +
638 n.dtype().name() +
") for " + n.path() +
'.';
640 std::cerr << err <<
" Skipping for now." << std::endl;
649 #endif // _DATA_READER_JAG_CONDUIT_HPP_ size_t get_linearized_input_size() const
Return the linearized size of inputs.
std::vector< std::vector< variable_t > > m_dependent_groups
The list of dependent variable types grouped for slicing.
bool priming_data_store() const override
void add_input_prefix_filter(const prefix_t &p)
Add an input key prefix to filter out.
static const std::set< std::string > non_numeric_vars
int m_cached_label_mb_size
void clear_scalar_normalization_params()
void add_input_filter(const std::string &key)
Add an input key to filter out.
void add_independent_variable_type(const variable_t independent)
add data type for independent variable
std::set< std::string > m_scalar_filter
The set of scalar variables to filter out.
std::vector< El::Int > get_slice_points_dependent() const
Return the slice points for linearized dependent variables.
std::vector< linear_transform_t > m_scalar_normalization_params
void set_list_per_trainer(bool flag)
Set every reader instance in a trainer to have an independent sample list.
void set_list_per_model(bool flag)
Set every reader instance in a model to have an independent sample list.
void set_local_id(const std::string role)
Set the id of this local instance.
bool m_is_data_loaded
Whether data have been loaded.
size_t get_linearized_1ch_image_size() const
Return the linearized size of a single channel image.
static bool check_non_numeric(const std::string key)
bool has_list_per_trainer() const override
Does the data reader have a unique sample list per trainer.
~data_reader_jag_conduit() override
virtual bool priming_data_store() const
std::vector< variable_t > m_dependent
The flat list of dependent variable types.
void set_output_image_prefix(const std::string &prefix)
Set the common prefix path for any output images stored.
size_t get_linearized_scalar_size() const
Return the linearized size of scalar outputs.
std::vector< variable_t > get_independent_variable_type() const
Tell which data to use for independent variable.
void setup(int num_io_threads, observer_ptr< thread_pool > io_thread_pool) override
std::string get_type() const override
void set_image_dims(const int width, const int height, const int ch=1)
Set the image dimension.
size_t get_linearized_size(const variable_t t) const
Return the linearized size of a particular JAG variable type.
const std::vector< std::string > & get_input_choices() const
Report the selected simulation input parameters.
std::set< std::string > m_input_filter
The set of input variables to filter out.
void set_all_scalar_choices()
Set to use the entire set of scalar outputs.
bool fetch_datum(CPUMat &X, int data_id, int mb_idx) override
int get_local_id(const std::string role) const
Get the id of this local instance.
int get_linearized_label_size() const override
Get the linearized size (i.e. number of elements) in a label.
bool has_path(const file_handle_t &h, const std::string &path) const
static const conduit::Node & get_conduit_node(const conduit::Node &n_base, const std::string key)
Allow const access to the conduit data structure.
std::string m_input_prefix
Common prefix path to any input fields in Conduit / HDF5.
std::vector< El::Int > get_slice_points_impl(const std::vector< std::vector< data_reader_jag_conduit::variable_t >> &var) const
Return the slice points for linearized data or responses.
virtual bool data_store_active() const
std::pair< double, double > linear_transform_t
linear transform on X defined as: first * X + second => X'
unsigned int get_num_img_srcs() const
Return the number of measurement views.
int compute_max_num_parallel_readers()
bool load_conduit_node(const size_t i, const std::string &key, conduit::Node &node) const
void add_scalar_prefix_filter(const prefix_t &p)
Add a scalar key prefix to filter out.
int get_num_labels() const override
Return the number of labels (classes) in this dataset.
unsigned int m_num_img_srcs
Number of views that result in images.
static std::unordered_map< std::string, int > m_num_local_readers
The number of local instances of this reader type.
data_store_conduit * m_data_store
sample_list_t::sample_file_id_t sample_file_id_t
std::string m_output_scalar_prefix
Common prefix path to any output scalar fields in Conduit / HDF5.
std::string get_description() const
Show the description.
std::vector< El::Int > get_slice_points(const slice_points_mode var_category, bool &is_supported) override
std::vector< sample_locator_t > sample_map_t
valid sample map type
int m_image_height
image height
virtual void shuffle_indices()
Shuffle indices (uses the data_seq_generator)
void load_list_of_samples(const std::string filename)
Rely on pre-determined list of samples.
const std::vector< El::Int > get_data_dims() const override
Return the dimension of data.
size_t m_1ch_image_linearized_size
void set_split_image_channels()
std::vector< variable_t > get_dependent_variable_type() const
Tell which data to use for dependent variable.
std::vector< variable_t > m_independent
The flat list of independent variable types.
int m_image_width
image width
void unset_split_image_channels()
void load() override
Load data and do data reader's chores.
int get_linearized_response_size() const override
Return the total linearized size of response.
virtual void copy_members(const data_reader_jag_conduit &rhs)
data_reader_jag_conduit(bool shuffle=true)
void set_linearized_image_size()
Obtain the linearized size of images of a sample from the meta info.
const std::vector< int > get_dims(const variable_t t) const
Return the dimension of a particular JAG variable type.
std::vector< El::Int > get_slice_points_independent() const
Return the slice points for linearized independent variables.
int m_cached_data_mb_size
void clear_image_normalization_params()
int m_image_num_channels
number of image channels
bool m_uniform_input_type
data_reader_jag_conduit & operator=(const data_reader_jag_conduit &)
std::vector< size_t > get_linearized_response_sizes() const
Return the per-source linearized sizes of composite response.
conduit::float32_array conduit_ch_t
conduit type for ch_t array wrapper
std::vector< linear_transform_t > m_input_normalization_params
El::Matrix< DataType, El::Device::CPU > CPUMat
static size_t add_val(const std::string key, const conduit::Node &n, std::vector< S > &vals)
void add_input_normalization_param(const linear_transform_t &t)
typename std::add_pointer< T >::type observer_ptr
Creating an observer_ptr to complement the unique_ptr and shared_ptr.
bool has_conduit_path(const size_t i, const std::string &key) const
Check if a key exists for sample i.
void set_image_choices(const std::vector< std::string > image_keys)
Choose images to use. e.g. by measurement views and time indices.
bool filter(const std::set< std::string > &key_filter, const std::vector< prefix_t > &prefix_filter, const std::string &name) const
Check if a key is in the black lists to filter out.
void clear_input_normalization_params()
exception lbann_exception
data_reader_jag_conduit * copy() const override
void add_scalar_normalization_param(const linear_transform_t &t)
void set_independent_variable_type(const std::vector< std::vector< variable_t >> &independent)
Choose which data to use for independent variable.
std::vector< prefix_t > m_input_prefix_filter
The list of input key prefixes to filter out.
double scalar_t
jag scalar output type
std::vector< linear_transform_t > m_image_normalization_params
temporary normalization parameters based on linear transforms
void read_node(const file_handle_t &h, const std::string &path, conduit::Node &n) const
void set_output_scalar_prefix(const std::string &prefix)
Set the common prefix path for any output scalar fields stored.
std::pair< std::string, hid_t > sample_locator_t
bool fetch(CPUMat &X, int data_id, conduit::Node &sample, int mb_idx, int tid, const variable_t vt, const std::string tag)
Export cached data minibatch.
sample_list_t m_sample_list
void set_dependent_variable_type(const std::vector< std::vector< variable_t >> &dependent)
Choose which data to use for dependent variable.
void add_image_normalization_param(const linear_transform_t &t)
static std::string to_string(const variable_t t)
A utility function to convert a JAG variable type to name string.
std::pair< sample_file_id_t, sample_name_t > sample_t
void load_list_of_samples_from_archive(const std::string &sample_list_archive)
Load the sample list from a serialized archive from another rank.
std::string sample_name_t
int get_linearized_data_size() const override
Return the total linearized size of data.
void print_schema(const size_t i) const
print the schema of the specific sample identified by a given id
void set_scalar_choices(const std::vector< std::string > &keys)
Select the set of scalar output variables to use.
void set_input_prefix(const std::string &prefix)
Set the common prefix path for any input variables stored.
float ch_t
jag output image channel type
void add_scalar_filter(const std::string &key)
Add a scalar key to filter out.
const std::vector< std::string > & get_image_choices() const
Report the image choices.
std::vector< size_t > get_linearized_data_sizes() const
Return the per-source linearized sizes of composite data.
sample_list_t::file_handle_t file_handle_t
void sample_schema_check(const bool check_data)
Check the consistency of the schema of the first sample.
std::pair< std::string, std::string > conduit_sample
conduit::DataType::TypeID TypeID
bool check_split_image_channels() const
int m_cached_response_mb_size
bool check_num_parallel_readers(long data_set_size)
std::vector< prefix_t > m_scalar_prefix_filter
The list of scalar key prefixes to filter out.
const std::vector< std::string > & get_scalar_choices() const
Report the selected scalar outputs.
void set_input_choices(const std::vector< std::string > &keys)
Select the set of simulation input parameters to use.
bool fetch_response(CPUMat &Y, int data_id, int mb_idx) override
std::vector< std::string > m_emi_image_keys
Allow image selection by the view and the time index.
virtual void set_defaults()
std::vector< scalar_t > get_scalars(const size_t i, conduit::Node &sample) const
Return the scalar simulation output data of the i-th sample.
virtual std::vector< CPUMat > create_datum_views(CPUMat &X, const std::vector< size_t > &sizes, const int mb_idx) const
std::vector< input_t > get_inputs(const size_t i, conduit::Node &sample) const
Return the simulation input parameters of the i-th sample.
virtual int get_linearized_size(data_field_type const &data_field) const
Get the linearized size of what is identified by data_field.
void check_input_keys()
Make sure that the keys to choose simulation input parameters are valid.
bool is_same_type(const conduit::DataType::TypeID dt)
void check_scalar_keys()
Make sure that the keys to choose scalar outputs are valid.
size_t m_image_linearized_size
The linearized size of an image.
bool has_list_per_model() const override
Does the data reader have a unique sample list per model.
void setup_data_store(int mini_batch_size)
void do_preload_data_store() override
std::vector< std::string > m_input_keys
void check_image_data()
See if the image size is consistent with the linearized size.
std::vector< std::vector< DataType > > get_image_data(const size_t i, conduit::Node &sample) const
Obtain image data.
void set_all_input_choices()
Set to use the entire set of simulation input parameters.
size_t get_linearized_image_size() const
Return the linearized size of an image.
int m_num_labels
number of labels
void preload_helper(const hid_t &h, const std::string &sample_name, const std::string &field_name, int data_id, conduit::Node &node)
std::string m_output_image_prefix
Common prefix path to any output image fields in Conduit / HDF5.
bool fetch_label(CPUMat &X, int data_id, int mb_idx) override
void add_dependent_variable_type(const variable_t dependent)
add data type for dependent variable
std::size_t sample_file_id_t
The type for the index assigned to each sample file.
std::pair< std::string, size_t > prefix_t
std::vector< std::vector< variable_t > > m_independent_groups
The list of independent variable types grouped for slicing.
_LBANN_CONDUIT_DTYPE_INSTANTIATION_(int8_t, conduit::DataType::INT8_ID)
std::vector< std::string > m_scalar_keys
int get_num_data() const override
Get the number of samples in this dataset.
bool data_store_active() const override
bool m_split_channels
Whether to export a separate image per channel.