|
LBANN
0.103.0
Livermore Big Artificial Neural Network Toolkit
|
#include <data_reader.hpp>
Public Types | |
| using | unused_index_map_t = std::map< execution_mode, std::vector< int > > |
Public Member Functions | |
| generic_data_reader (bool shuffle=true) | |
| generic_data_reader (const generic_data_reader &)=default | |
| generic_data_reader & | operator= (const generic_data_reader &)=default |
| virtual | ~generic_data_reader () |
| virtual generic_data_reader * | copy () const =0 |
| template<class Archive > | |
| void | serialize (Archive &ar) |
| void | set_comm (lbann_comm *comm) |
| set the comm object More... | |
| lbann_comm * | get_comm () const |
| returns a (possibly nullptr) pointer to comm More... | |
| virtual bool | has_conduit_output () |
| void | set_file_dir (std::string s) |
| void | set_local_file_dir (std::string s) |
| void | set_max_files_to_load (size_t n) |
| std::string | get_file_dir () const |
| std::string | get_local_file_dir () const |
| void | set_data_sample_list (std::string s) |
| std::string | get_data_sample_list () const |
| void | keep_sample_order (bool same_order=false) |
| void | set_data_filename (std::string s) |
| std::string | get_data_filename () const |
| void | set_label_filename (std::string s) |
| std::string | get_label_filename () const |
| void | set_shuffle (bool b) |
| bool | is_shuffled () const |
| void | set_shuffled_indices (const std::vector< int > &indices) |
| const std::vector< int > & | get_shuffled_indices () const |
| void | set_first_n (int n) |
| void | set_absolute_sample_count (size_t s) |
| void | set_use_fraction (double s) |
| virtual void | set_execution_mode_split_fraction (execution_mode m, double s) |
| virtual void | set_role (std::string role) |
| std::string | get_role () const |
| virtual void | load ()=0 |
| virtual void | setup (int num_io_threads, observer_ptr< thread_pool > io_thread_pool) |
| virtual std::string | get_type () const =0 |
| int | fetch (std::map< data_field_type, CPUMat *> &input_buffers, El::Matrix< El::Int > &indices_fetched, size_t mb_size) |
| Fetch a mini-batch worth of data, including samples, labels, responses (as appropriate) More... | |
| int | fetch (std::vector< conduit::Node > &samples, El::Matrix< El::Int > &indices_fetched, size_t mb_size) |
| virtual bool | has_data_field (data_field_type data_field) const |
| Check to see if the data reader supports this specific data field. More... | |
| virtual bool | has_labels () const |
| virtual bool | has_responses () const |
| void | set_has_data_field (data_field_type const data_field, const bool b) |
| Whether or not a data reader has a data field. More... | |
| virtual void | set_has_labels (const bool b) |
| Whether or not a data reader has labels. More... | |
| virtual void | set_has_responses (const bool b) |
| Whether or not a data reader has a response field. More... | |
| void | start_data_store_mini_batch_exchange () |
| void | finish_data_store_mini_batch_exchange () |
| virtual bool | update (bool is_active_reader) |
| virtual int | get_num_labels () const |
| Return the number of labels (classes) in this dataset. More... | |
| virtual int | get_num_responses () const |
| Return the number of responses in this dataset. More... | |
| virtual int | get_linearized_data_size () const |
| Get the linearized size (i.e. number of elements) in a sample. More... | |
| virtual int | get_linearized_label_size () const |
| Get the linearized size (i.e. number of elements) in a label. More... | |
| virtual int | get_linearized_response_size () const |
| Get the linearized size (i.e. number of elements) in a response. More... | |
| virtual int | get_linearized_size (data_field_type const &data_field) const |
| Get the linearized size of what is identified by data_field. More... | |
| virtual const std::vector< El::Int > | get_data_dims () const |
| Get the dimensions of the data. More... | |
| virtual std::vector< El::Int > | get_slice_points (const slice_points_mode var_category, bool &is_supported) |
| virtual bool | position_valid () const |
| True if the data reader's current position is valid. More... | |
| virtual bool | position_is_overrun () const |
| bool | at_new_epoch () const |
| True if the data reader is at the start of an epoch. More... | |
| void | set_mini_batch_size (const int s) |
| Set the mini batch size. More... | |
| int | get_mini_batch_size () const |
| Get the mini batch size. More... | |
| int | get_loaded_mini_batch_size () const |
| Get the loaded mini-batch size. More... | |
| int | get_current_mini_batch_size () const |
| Get the current mini-batch size. More... | |
| int | get_mini_batch_max () const |
| Return the full mini_batch_size. More... | |
| void | set_stride_to_next_mini_batch (const int s) |
| Set the mini batch stride. More... | |
| int | get_stride_to_next_mini_batch () const |
| Return the mini batch stride. More... | |
| void | set_sample_stride (const int s) |
| Set the sample stride. More... | |
| int | get_sample_stride () const |
| Return the sample stride. More... | |
| void | set_iteration_stride (const int s) |
| Set the iteration stride. More... | |
| int | get_iteration_stride () const |
| Return the iteration stride. More... | |
| virtual void | set_base_offset (const int s) |
| Set the base offset. More... | |
| int | get_base_offset () const |
| Return the base offset. More... | |
| void | set_last_mini_batch_size (const int s) |
| Set the last mini batch size. More... | |
| int | get_last_mini_batch_size () const |
| Return the last mini batch size. More... | |
| void | set_stride_to_last_mini_batch (const int s) |
| Set the last mini batch stride. More... | |
| int | get_stride_to_last_mini_batch () const |
| Return the last mini batch stride. More... | |
| void | set_num_parallel_readers (const int s) |
| Set the number of parallel readers per model. More... | |
| int | get_num_parallel_readers () const |
| Return the number of parallel readers per model. More... | |
| virtual void | set_reset_mini_batch_index (const int s) |
| Set the starting mini-batch index for the epoch. More... | |
| int | get_reset_mini_batch_index () const |
| Return the starting mini-batch index for the epoch. More... | |
| int | get_loaded_mini_batch_index () const |
| Return the index of the current mini-batch that has been loaded. More... | |
| int | get_current_mini_batch_index () const |
| Return the current mini-batch index for the epoch. More... | |
| void | set_initial_position () |
| Set the current position based on the base and model offsets. More... | |
| int | get_position () const |
| Get the current position in the data reader. More... | |
| int | get_next_position () const |
| Get the next position in the data reader. More... | |
| int * | get_indices () |
| Get a pointer to the start of the shuffled indices. More... | |
| virtual int | get_num_data () const |
| Get the number of samples in this dataset. More... | |
| int | get_num_unused_data (execution_mode m) const |
| Get the number of unused samples in this dataset. More... | |
| int * | get_unused_data (execution_mode m) |
| Get a pointer to the start of the unused sample indices. More... | |
| const std::vector< int > & | get_unused_indices (execution_mode m) |
| void | set_num_iterations_per_epoch (int num_iterations_per_epoch) |
| Set the number of iterations in each epoch. More... | |
| int | get_num_iterations_per_epoch () const |
| Get the number of iterations in each epoch. More... | |
| int | get_current_step_in_epoch () const |
| void | resize_shuffled_indices () |
| void | select_subset_of_data () |
| virtual void | use_unused_index_set (execution_mode m) |
| virtual bool | has_list_per_model () const |
| Does the data reader have a unique sample list per model. More... | |
| virtual bool | has_list_per_trainer () const |
| Does the data reader have a unique sample list per trainer. More... | |
| bool | save_to_checkpoint_shared (persist &p, execution_mode mode) |
| Given directory to store checkpoint files, write state to file and add to number of bytes written. More... | |
| bool | load_from_checkpoint_shared (persist &p, execution_mode mode) |
| Given directory to store checkpoint files, read state from file and add to number of bytes read. More... | |
| bool | save_to_checkpoint_distributed (persist &p, execution_mode mode) |
| bool | load_from_checkpoint_distributed (persist &p, execution_mode mode) |
| Given directory to store checkpoint files, read state from file and add to number of bytes read. More... | |
| const data_store_conduit & | get_data_store () const |
| returns a const ref to the data store More... | |
| data_store_conduit & | get_data_store () |
| returns a non-const ref to the data store More... | |
| data_store_conduit * | get_data_store_ptr () const |
| void | setup_data_store (int mini_batch_size) |
| void | instantiate_data_store () |
| virtual void | preload_data_store () |
| void | set_gan_labelling (bool has_gan_labelling) |
| void | set_gan_label_value (int gan_label_value) |
| void | set_data_store (data_store_conduit *g) |
| support of data store functionality More... | |
| virtual bool | data_store_active () const |
| virtual bool | priming_data_store () const |
| virtual void | post_update () |
| void | set_transform_pipeline (transform::transform_pipeline &&tp) |
| void | print_get_methods (const std::string filename) |
| Print the return values from various get_X methods to file. More... | |
| size_t | get_num_indices_to_use () const |
| void | set_use_data_store (bool s) |
Public Attributes | |
| int | m_mini_batch_size |
| int | m_current_pos |
| int | m_stride_to_next_mini_batch |
| int | m_base_offset |
| int | m_sample_stride |
| int | m_iteration_stride |
| Stride used by parallel data readers within the model. More... | |
| std::vector< int > | m_shuffled_indices |
| unused_index_map_t | m_unused_indices |
| Record of the indices that are not being used for training. More... | |
| int | m_last_mini_batch_size |
| int | m_stride_to_last_mini_batch |
| int | m_reset_mini_batch_index |
| The index at which this data reader starts its epoch. More... | |
| int | m_loaded_mini_batch_idx |
| The index of the current mini-batch that has been loaded. More... | |
| int | m_current_mini_batch_idx |
| int | m_num_iterations_per_epoch |
| How many iterations all readers will execute. More... | |
| int | m_num_parallel_readers |
| How many parallel readers are being used. More... | |
| size_t | m_max_files_to_load |
| std::string | m_file_dir |
| std::string | m_local_file_dir |
| std::string | m_data_sample_list |
| std::string | m_data_fn |
| std::string | m_label_fn |
| bool | m_shuffle |
| size_t | m_absolute_sample_count |
| std::map< execution_mode, double > | m_execution_mode_split_fraction |
| double | m_use_fraction |
| int | m_first_n |
| std::string | m_role |
Protected Member Functions | |
| size_t | get_absolute_sample_count () const |
| double | get_use_fraction () const |
| double | get_execution_mode_split_fraction (execution_mode m) const |
| virtual bool | fetch_data_block (std::map< data_field_type, CPUMat *> &input_buffers, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix< El::Int > &indices_fetched) |
| bool | fetch_data_block_conduit (std::vector< conduit::Node > &samples, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix< El::Int > &indices_fetched) |
| virtual bool | fetch_data_field (data_field_type data_field, CPUMat &Y, int data_id, int mb_idx) |
| Called by fetch_data, fetch_label, fetch_response. More... | |
| virtual bool | fetch_conduit_node (conduit::Node &sample, int data_id) |
| virtual bool | fetch_datum (CPUMat &X, int data_id, int mb_idx) |
| virtual bool | fetch_label (CPUMat &Y, int data_id, int mb_idx) |
| virtual bool | fetch_response (CPUMat &Y, int data_id, int mb_idx) |
| CPUMat | create_datum_view (CPUMat &X, const int mb_idx) |
| virtual void | preprocess_data_source (int tid) |
| virtual void | postprocess_data_source (int tid) |
| virtual void | shuffle_indices () |
| Shuffle indices (uses the data_seq_generator) More... | |
| virtual void | shuffle_indices (rng_gen &gen) |
| Shuffle indices and provide a random number generator. More... | |
| void | error_check_counts () const |
Protected Attributes | |
| bool | m_verbose = false |
| std::unordered_set< int > | m_using_random_node |
| data_store_conduit * | m_data_store |
| lbann_comm * | m_comm |
| bool | m_use_data_store = false |
| std::map< data_field_type, bool > | m_supported_input_types |
| Holds a true value for each input data type that is supported. Use an ordered map so that checkpoints are stable. More... | |
| bool | m_gan_labelling |
| int | m_gan_label_value |
| observer_ptr< thread_pool > | m_io_thread_pool |
| bool | m_keep_sample_order |
| transform::transform_pipeline | m_transform_pipeline |
| bool | m_issue_warning |
Private Member Functions | |
| virtual void | do_preload_data_store () |
Friends | |
| class | data_reader_merge_features |
| class | data_reader_merge_samples |
A data reader manages reading in data in a particular format. This abstract base class manages common functionality. In particular, child classes should implement load and the appropriate subset of fetch_datum, fetch_label, and fetch_response.
Definition at line 75 of file data_reader.hpp.
| using lbann::generic_data_reader::unused_index_map_t = std::map<execution_mode, std::vector<int> > |
Definition at line 78 of file data_reader.hpp.
|
inline |
|
default |
|
virtual |
|
inline |
True if the data reader is at the start of an epoch.
Note that data readers can start at a non-zero index if there are parallel data readers in a model
Definition at line 410 of file data_reader.hpp.
|
pure virtual |
Implemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::generic_compound_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::smiles_data_reader, lbann::cifar10_reader, lbann::csv_reader, lbann::numpy_reader, lbann::data_reader_merge_features, lbann::numpy_npz_reader, lbann::ras_lipid_conduit_data_reader, lbann::numpy_npz_conduit_reader, lbann::data_reader_sample_list< SampleListT >, lbann::data_reader_sample_list< sample_list_hdf5< std::string > >, lbann::data_reader_sample_list< sample_list_ifstream< long long > >, lbann::pilot2_molecular_reader, lbann::data_reader_merge_samples, lbann::mesh_reader, lbann::data_reader_nci, lbann::mnist_reader, and lbann::imagenet_reader.
|
inlineprotected |
Create a matrix view of a single column selected by mini-batch index.
| X | The matrix to load data into. |
| mb_idx | The index within the mini-batch. |
Definition at line 722 of file data_reader.hpp.
|
virtual |
|
privatevirtual |
|
protected |
Throws an exception if get_absolute_sample_count() and get_use_fraction() are incorrect.
| int lbann::generic_data_reader::fetch | ( | std::map< data_field_type, CPUMat *> & | input_buffers, |
| El::Matrix< El::Int > & | indices_fetched, | ||
| size_t | mb_size | ||
| ) |
Fetch a mini-batch worth of data, including samples, labels, responses (as appropriate)
| int lbann::generic_data_reader::fetch | ( | std::vector< conduit::Node > & | samples, |
| El::Matrix< El::Int > & | indices_fetched, | ||
| size_t | mb_size | ||
| ) |
|
inlineprotectedvirtual |
Reimplemented in lbann::hdf5_data_reader.
Definition at line 674 of file data_reader.hpp.
|
protectedvirtual |
|
protected |
|
inlineprotectedvirtual |
Called by fetch_data, fetch_label, fetch_response.
Fetch data from a single data field into a matrix.
| data_field | The name of the data field. May be one of the commonly used (samples, labels, responses) or any data_field that exists within an HDF5 experiment schema, Python DR schema, or synthetic data reader |
| Y | The matrix to load data into. |
| data_id | The index of the datum to fetch. |
| mb_idx | The index within the mini-batch. |
Reimplemented in lbann::hdf5_reader< TensorDataType >, and lbann::data_reader_synthetic.
Definition at line 665 of file data_reader.hpp.
|
inlineprotectedvirtual |
Fetch a single sample into a matrix.
| X | The matrix to load data into. |
| data_id | The index of the datum to fetch. |
| mb_idx | The index within the mini-batch. |
Reimplemented in lbann::data_reader_jag_conduit, lbann::smiles_data_reader, lbann::ras_lipid_conduit_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::pilot2_molecular_reader, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::numpy_npz_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::mesh_reader, lbann::numpy_reader, lbann::cifar10_reader, lbann::mnist_reader, and lbann::imagenet_reader.
Definition at line 686 of file data_reader.hpp.
|
inlineprotectedvirtual |
Fetch a single label into a matrix.
| Y | The matrix to load data into. |
| data_id | The index of the datum to fetch. |
| mb_idx | The index within the mini-batch. |
Reimplemented in lbann::data_reader_jag_conduit, lbann::smiles_data_reader, lbann::ras_lipid_conduit_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::image_data_reader, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::numpy_npz_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::numpy_reader, lbann::cifar10_reader, and lbann::mnist_reader.
Definition at line 698 of file data_reader.hpp.
|
inlineprotectedvirtual |
Fetch a single response into a matrix.
| Y | The matrix to load data into. |
| data_id | The index of the datum to fetch. |
| mb_idx | The index within the mini-batch. |
Reimplemented in lbann::data_reader_jag_conduit, lbann::smiles_data_reader, lbann::ras_lipid_conduit_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::numpy_npz_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::mesh_reader, and lbann::numpy_reader.
Definition at line 710 of file data_reader.hpp.
| void lbann::generic_data_reader::finish_data_store_mini_batch_exchange | ( | ) |
|
protected |
Return the absolute number of data samples that will be used for training or testing.
|
inline |
Return the base offset.
Definition at line 448 of file data_reader.hpp.
|
inline |
returns a (possibly nullptr) pointer to comm
Definition at line 132 of file data_reader.hpp.
|
inline |
Return the current mini-batch index for the epoch.
Definition at line 477 of file data_reader.hpp.
| int lbann::generic_data_reader::get_current_mini_batch_size | ( | ) | const |
Get the current mini-batch size.
|
inline |
Return the index of the current iteration step in the epoch (also the mini-batch index)
Definition at line 517 of file data_reader.hpp.
|
inlinevirtual |
Get the dimensions of the data.
Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::ras_lipid_conduit_data_reader, lbann::image_data_reader, lbann::data_reader_merge_samples, lbann::numpy_npz_conduit_reader, lbann::smiles_data_reader, lbann::numpy_npz_reader, lbann::data_reader_merge_features, lbann::mesh_reader, lbann::numpy_reader, and lbann::pilot2_molecular_reader.
Definition at line 383 of file data_reader.hpp.
| std::string lbann::generic_data_reader::get_data_filename | ( | ) | const |
Returns the complete filepath to your data file. See note for set_file_dir().
| std::string lbann::generic_data_reader::get_data_sample_list | ( | ) | const |
Returns the complete sample list for your data set.
| const data_store_conduit& lbann::generic_data_reader::get_data_store | ( | ) | const |
returns a const ref to the data store
| data_store_conduit& lbann::generic_data_reader::get_data_store | ( | ) |
returns a non-const ref to the data store
|
inline |
Definition at line 565 of file data_reader.hpp.
|
protected |
Return the fraction of the dataset to be used for other execution modes such as validation or tournament.
| std::string lbann::generic_data_reader::get_file_dir | ( | ) | const |
Returns the base directory for your data. If set_file_dir was not called, returns the empty string
|
inline |
Get a pointer to the start of the shuffled indices.
Definition at line 490 of file data_reader.hpp.
|
inline |
Return the iteration stride.
Definition at line 444 of file data_reader.hpp.
| std::string lbann::generic_data_reader::get_label_filename | ( | ) | const |
Returns the complete filepath to your label file. See note for set_file_dir(). Note: some pipelines (autoencoders) will not make use of this method.
|
inline |
Return the last mini batch size.
Definition at line 452 of file data_reader.hpp.
|
inlinevirtual |
Get the linearized size (i.e. number of elements) in a sample.
Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::image_data_reader, lbann::numpy_npz_conduit_reader, lbann::data_reader_merge_features, lbann::numpy_npz_reader, lbann::mesh_reader, lbann::data_reader_merge_samples, lbann::ras_lipid_conduit_data_reader, lbann::smiles_data_reader, lbann::numpy_reader, and lbann::pilot2_molecular_reader.
Definition at line 374 of file data_reader.hpp.
|
inlinevirtual |
Get the linearized size (i.e. number of elements) in a label.
Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::image_data_reader, lbann::numpy_npz_conduit_reader, lbann::data_reader_merge_features, lbann::numpy_npz_reader, lbann::data_reader_merge_samples, lbann::ras_lipid_conduit_data_reader, lbann::smiles_data_reader, and lbann::numpy_reader.
Definition at line 376 of file data_reader.hpp.
|
inlinevirtual |
Get the linearized size (i.e. number of elements) in a response.
Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::ras_lipid_conduit_data_reader, lbann::numpy_npz_conduit_reader, lbann::smiles_data_reader, lbann::numpy_npz_reader, and lbann::mesh_reader.
Definition at line 378 of file data_reader.hpp.
|
virtual |
Get the linearized size of what is identified by data_field.
Reimplemented in lbann::hdf5_data_reader, and lbann::data_reader_synthetic.
|
inline |
Return the index of the current mini-batch that has been loaded.
Definition at line 475 of file data_reader.hpp.
| int lbann::generic_data_reader::get_loaded_mini_batch_size | ( | ) | const |
Get the loaded mini-batch size.
| std::string lbann::generic_data_reader::get_local_file_dir | ( | ) | const |
Returns the base directory for caching files in local SSD. If set_local_file_dir was not called, returns the empty string.
|
inline |
Return the full mini_batch_size.
Definition at line 426 of file data_reader.hpp.
|
inline |
Get the mini batch size.
Definition at line 420 of file data_reader.hpp.
| int lbann::generic_data_reader::get_next_position | ( | ) | const |
Get the next position in the data reader.
|
inlinevirtual |
Get the number of samples in this dataset.
Reimplemented in lbann::data_reader_jag_conduit.
Definition at line 492 of file data_reader.hpp.
| size_t lbann::generic_data_reader::get_num_indices_to_use | ( | ) | const |
Returns the number of the shuffled indices that are to be used. Code in this method was formerly in select_subset_of_data()
|
inline |
Get the number of iterations in each epoch.
Definition at line 509 of file data_reader.hpp.
|
inlinevirtual |
Return the number of labels (classes) in this dataset.
This is called at the end of update; it permits data readers to perform actions that are specific to their data sets, for example, data_reader_jag_conduit_hdf5 has the 'primary' data reader bcast its shuffled indices to the other data readers. In general most data readers will probably not override this method. It may also be called outside of update.
Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::ras_lipid_conduit_data_reader, lbann::smiles_data_reader, lbann::image_data_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::numpy_npz_reader, lbann::numpy_reader, and lbann::data_reader_merge_samples.
Definition at line 370 of file data_reader.hpp.
|
inline |
Return the number of parallel readers per model.
Definition at line 466 of file data_reader.hpp.
|
inlinevirtual |
Return the number of responses in this dataset.
Reimplemented in lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::numpy_npz_conduit_reader, lbann::numpy_npz_reader, and lbann::data_reader_merge_samples.
Definition at line 372 of file data_reader.hpp.
| int lbann::generic_data_reader::get_num_unused_data | ( | execution_mode | m | ) | const |
Get the number of unused samples in this dataset.
|
inline |
Get the current position in the data reader.
Definition at line 486 of file data_reader.hpp.
|
inline |
Return the starting mini-batch index for the epoch.
Definition at line 473 of file data_reader.hpp.
|
inline |
Get the role for this dataset.
Definition at line 279 of file data_reader.hpp.
|
inline |
Return the sample stride.
Definition at line 440 of file data_reader.hpp.
|
inline |
Returns the shuffled indices; primary use is for testing.
Definition at line 236 of file data_reader.hpp.
|
inlinevirtual |
Reimplemented in lbann::data_reader_jag_conduit.
Definition at line 389 of file data_reader.hpp.
|
inline |
Return the last mini batch stride.
Definition at line 459 of file data_reader.hpp.
|
inline |
Return the mini batch stride.
Definition at line 433 of file data_reader.hpp.
|
pure virtual |
Return this data_reader's type
Implemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::smiles_data_reader, lbann::data_reader_merge_features, lbann::numpy_npz_reader, lbann::numpy_npz_conduit_reader, lbann::cifar10_reader, lbann::data_reader_sample_list< SampleListT >, lbann::data_reader_sample_list< sample_list_hdf5< std::string > >, lbann::data_reader_sample_list< sample_list_ifstream< long long > >, lbann::csv_reader, lbann::ras_lipid_conduit_data_reader, lbann::numpy_reader, lbann::pilot2_molecular_reader, lbann::data_reader_merge_samples, lbann::mesh_reader, lbann::data_reader_nci, lbann::mnist_reader, and lbann::imagenet_reader.
| int* lbann::generic_data_reader::get_unused_data | ( | execution_mode | m | ) |
Get a pointer to the start of the unused sample indices.
| const std::vector<int>& lbann::generic_data_reader::get_unused_indices | ( | execution_mode | m | ) |
|
protected |
Returns the fraction of the dataset to be used for training or testing. If training, this is the total for training and validation. Throws if set_use_fraction was not called.
|
inlinevirtual |
Reimplemented in lbann::hdf5_data_reader.
Definition at line 134 of file data_reader.hpp.
|
inlinevirtual |
Check to see if the data reader supports this specific data field.
Definition at line 313 of file data_reader.hpp.
|
inlinevirtual |
Reimplemented in lbann::generic_compound_data_reader.
Definition at line 324 of file data_reader.hpp.
|
inlinevirtual |
Does the data reader have a unique sample list per model.
Reimplemented in lbann::data_reader_jag_conduit.
Definition at line 541 of file data_reader.hpp.
|
inlinevirtual |
Does the data reader have a unique sample list per trainer.
Reimplemented in lbann::data_reader_jag_conduit.
Definition at line 543 of file data_reader.hpp.
|
inlinevirtual |
Reimplemented in lbann::generic_compound_data_reader.
Definition at line 328 of file data_reader.hpp.
| void lbann::generic_data_reader::instantiate_data_store | ( | ) |
|
inline |
Returns true if data samples are shuffled.
Definition at line 222 of file data_reader.hpp.
| void lbann::generic_data_reader::keep_sample_order | ( | bool | same_order = false | ) |
To facilitate testing, maintain the order of loaded samples in the sample list as it is in the list file.
|
pure virtual |
Load the dataset. Each data reader implementation should implement this to initialize its internal data structures, determine the number of samples and their dimensionality (if needed), and set up and shuffle samples.
Implemented in lbann::data_reader_jag_conduit, lbann::csv_reader, lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_sample_list< SampleListT >, lbann::data_reader_sample_list< sample_list_hdf5< std::string > >, lbann::data_reader_sample_list< sample_list_ifstream< long long > >, lbann::data_reader_synthetic, lbann::cifar10_reader, lbann::mesh_reader, lbann::numpy_npz_reader, lbann::numpy_npz_conduit_reader, lbann::smiles_data_reader, lbann::image_data_reader, lbann::data_reader_merge_features, lbann::ras_lipid_conduit_data_reader, lbann::numpy_reader, lbann::data_reader_merge_samples, lbann::pilot2_molecular_reader, and lbann::mnist_reader.
| bool lbann::generic_data_reader::load_from_checkpoint_distributed | ( | persist & | p, |
| execution_mode | mode | ||
| ) |
Given directory to store checkpoint files, read state from file and add to number of bytes read.
| bool lbann::generic_data_reader::load_from_checkpoint_shared | ( | persist & | p, |
| execution_mode | mode | ||
| ) |
Given directory to store checkpoint files, read state from file and add to number of bytes read.
|
default |
|
inlinevirtual |
True if the data reader's current position is not valid but within # ranks per model of the end of the data set (e.g. it is a rank with no valid data on the last iteration)
Definition at line 403 of file data_reader.hpp.
|
inlinevirtual |
True if the data reader's current position is valid.
Definition at line 396 of file data_reader.hpp.
|
inlinevirtual |
experimental; used to ensure all readers for jag_conduit_hdf5 have identical shuffled indices
Definition at line 595 of file data_reader.hpp.
|
inlineprotectedvirtual |
Called after fetch_datum/label/response to allow initialization.
Definition at line 734 of file data_reader.hpp.
|
virtual |
|
inlineprotectedvirtual |
Called before fetch_datum/label/response to allow initialization.
Definition at line 730 of file data_reader.hpp.
|
virtual |
| void lbann::generic_data_reader::print_get_methods | ( | const std::string | filename | ) |
Print the return values from various get_X methods to file.
For use in unit testing. Only the master prints. Currently only prints values from get_X methods that depend only on the data_reader (i.e., not on the trainer, model, etc.).
| void lbann::generic_data_reader::resize_shuffled_indices | ( | ) |
Optionally resizes the shuffled indices based on the data reader prototext settings: absolute_sample_count, fraction_of_data_to_use. (dah - this was formerly part of select_subset_of_data)
| bool lbann::generic_data_reader::save_to_checkpoint_distributed | ( | persist & | p, |
| execution_mode | mode | ||
| ) |
| bool lbann::generic_data_reader::save_to_checkpoint_shared | ( | persist & | p, |
| execution_mode | mode | ||
| ) |
Given directory to store checkpoint files, write state to file and add to number of bytes written.
| void lbann::generic_data_reader::select_subset_of_data | ( | ) |
Select the appropriate subset of data for the additional execution modes such as validation or tournament set based on the data reader prototext setting: validation_fraction or tournament_fraction
| void lbann::generic_data_reader::serialize | ( | Archive & | ar | ) |
Archive for checkpoint and restart
| void lbann::generic_data_reader::set_absolute_sample_count | ( | size_t | s | ) |
Sets the absolute number of data samples that will be used for training or testing.
|
inlinevirtual |
Set the base offset.
Definition at line 446 of file data_reader.hpp.
|
inline |
set the comm object
Definition at line 129 of file data_reader.hpp.
| void lbann::generic_data_reader::set_data_filename | ( | std::string | s | ) |
Set the filename for your data (images, etc). This may either be a complete filepath, or a subdirectory; see note for set_file_dir(). Also, use this method for cases where the file contains a list of files (e.g, imagenet)
| void lbann::generic_data_reader::set_data_sample_list | ( | std::string | s | ) |
Set the sample list for your data (images, etc). The sample list contains an enumeration of all samples in the data set.
| void lbann::generic_data_reader::set_data_store | ( | data_store_conduit * | g | ) |
support of data store functionality
|
virtual |
Sets the fraction of the dataset to be used for validation.
| m | The execution mode. |
| s | The fraction used, in the range [0, 1]. |
Reimplemented in lbann::generic_compound_data_reader.
| void lbann::generic_data_reader::set_file_dir | ( | std::string | s | ) |
Set base directory for your data.
| void lbann::generic_data_reader::set_first_n | ( | int | n | ) |
Read the first 'n' samples. If nonzero, this overrides set_absolute_sample_count and set_use_fraction. The intent is to use this for testing. A problem with set_absolute_sample_count and set_use_fraction is that the entire data set is read in, then a subset is selected.
|
inline |
Definition at line 581 of file data_reader.hpp.
|
inline |
Definition at line 577 of file data_reader.hpp.
|
inline |
Whether or not a data reader has a data field.
Definition at line 334 of file data_reader.hpp.
|
inlinevirtual |
Whether or not a data reader has labels.
Reimplemented in lbann::generic_compound_data_reader.
Definition at line 340 of file data_reader.hpp.
|
inlinevirtual |
Whether or not a data reader has a response field.
Reimplemented in lbann::generic_compound_data_reader.
Definition at line 345 of file data_reader.hpp.
|
inline |
Set the current position based on the base and model offsets.
Definition at line 479 of file data_reader.hpp.
|
inline |
Set the iteration stride.
Definition at line 442 of file data_reader.hpp.
| void lbann::generic_data_reader::set_label_filename | ( | std::string | s | ) |
Set the filename for your labels. This may either be a complete filepath, or a subdirectory; see note for set_file_dir()
|
inline |
Set the last mini batch size.
Definition at line 450 of file data_reader.hpp.
| void lbann::generic_data_reader::set_local_file_dir | ( | std::string | s | ) |
Set base directory for your locally cached (e.g, on ssd) data.
|
inline |
for some data readers (jag_conduit) we load from multiple files; for testing we want to be able to restrict that number
Definition at line 153 of file data_reader.hpp.
| void lbann::generic_data_reader::set_mini_batch_size | ( | const int | s | ) |
Set the mini batch size.
|
inline |
Set the number of iterations in each epoch.
Definition at line 502 of file data_reader.hpp.
|
inline |
Set the number of parallel readers per model.
Definition at line 464 of file data_reader.hpp.
|
inlinevirtual |
Set the starting mini-batch index for the epoch.
Definition at line 468 of file data_reader.hpp.
|
virtual |
Set an identifier for the dataset. The role should be one of "train", "test", or "validate".
Reimplemented in lbann::generic_compound_data_reader.
|
inline |
Set the sample stride.
Definition at line 438 of file data_reader.hpp.
|
inline |
If set to false, indices (data samples) are not shuffled; the default (in ctor) is true.
Definition at line 217 of file data_reader.hpp.
|
inline |
Set shuffled indices; primary use is for testing and reproducibility
Definition at line 228 of file data_reader.hpp.
|
inline |
Set the last mini batch stride.
Definition at line 454 of file data_reader.hpp.
|
inline |
Set the mini batch stride.
Definition at line 428 of file data_reader.hpp.
|
inline |
Set the transform pipeline this data reader will use.
Definition at line 598 of file data_reader.hpp.
|
inline |
Definition at line 804 of file data_reader.hpp.
| void lbann::generic_data_reader::set_use_fraction | ( | double | s | ) |
Set the fraction of the data set to use for training and validation or testing.
| s | The fraction used, in the range [0, 1]. |
|
virtual |
Prepare to start processing an epoch of data. If shuffle is true, then shuffle the indices of the data set. If the base offset is not specified, set it to 0. If the stride is not specified, set it to the batch size.
Reimplemented in lbann::csv_reader, lbann::data_reader_jag_conduit, lbann::image_data_reader, lbann::mesh_reader, and lbann::data_reader_merge_features.
| void lbann::generic_data_reader::setup_data_store | ( | int | mini_batch_size | ) |
Sets up a data_store; this is called from build_model_from_prototext() in utils/lbann_library.cpp. This is a bit awkward: we would like to call it when we instantiate the data_store, but we don't know the mini_batch_size until later.
|
protectedvirtual |
Shuffle indices (uses the data_seq_generator)
|
protectedvirtual |
Shuffle indices and provide a random number generator.
Reimplemented in lbann::data_reader_jag_conduit, lbann::data_reader_sample_list< SampleListT >, lbann::data_reader_sample_list< sample_list_hdf5< std::string > >, and lbann::data_reader_sample_list< sample_list_ifstream< long long > >.
| void lbann::generic_data_reader::start_data_store_mini_batch_exchange | ( | ) |
|
virtual |
During the network's update phase, the data reader will advance the current position pointer. If the pointer wraps around, then reshuffle the data indices.
|
virtual |
Replace the shuffled index set with the unused index set, emptying the unused set.
Reimplemented in lbann::smiles_data_reader.
|
friend |
Definition at line 801 of file data_reader.hpp.
|
friend |
Definition at line 802 of file data_reader.hpp.
| size_t lbann::generic_data_reader::m_absolute_sample_count |
Definition at line 781 of file data_reader.hpp.
| int lbann::generic_data_reader::m_base_offset |
If there are multiple instances of the reader, then it may not reset to zero
Definition at line 749 of file data_reader.hpp.
|
protected |
Definition at line 638 of file data_reader.hpp.
| int lbann::generic_data_reader::m_current_mini_batch_idx |
The index of the current mini-batch that is being processed (train/test/validate)
Definition at line 768 of file data_reader.hpp.
| int lbann::generic_data_reader::m_current_pos |
Definition at line 743 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_data_fn |
Definition at line 778 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_data_sample_list |
Definition at line 777 of file data_reader.hpp.
|
protected |
Definition at line 636 of file data_reader.hpp.
| std::map<execution_mode, double> lbann::generic_data_reader::m_execution_mode_split_fraction |
Definition at line 782 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_file_dir |
Definition at line 775 of file data_reader.hpp.
| int lbann::generic_data_reader::m_first_n |
Definition at line 784 of file data_reader.hpp.
|
protected |
Definition at line 819 of file data_reader.hpp.
|
protected |
Definition at line 817 of file data_reader.hpp.
|
protected |
Definition at line 822 of file data_reader.hpp.
|
protected |
for use with data_store: issue a warning a single time if m_data_store != nullptr, but we're not retrieving a conduit::Node from the store. This typically occurs during the test phase
Definition at line 834 of file data_reader.hpp.
| int lbann::generic_data_reader::m_iteration_stride |
Stride used by parallel data readers within the model.
Definition at line 754 of file data_reader.hpp.
|
protected |
Whether to keep the order of loaded samples the same as it is in the file, to make testing and validation easier.
Definition at line 826 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_label_fn |
Definition at line 779 of file data_reader.hpp.
| int lbann::generic_data_reader::m_last_mini_batch_size |
Definition at line 760 of file data_reader.hpp.
| int lbann::generic_data_reader::m_loaded_mini_batch_idx |
The index of the current mini-batch that has been loaded.
Definition at line 765 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_local_file_dir |
Definition at line 776 of file data_reader.hpp.
| size_t lbann::generic_data_reader::m_max_files_to_load |
How many parallel readers are being used.
Definition at line 774 of file data_reader.hpp.
| int lbann::generic_data_reader::m_mini_batch_size |
Definition at line 742 of file data_reader.hpp.
| int lbann::generic_data_reader::m_num_iterations_per_epoch |
Definition at line 770 of file data_reader.hpp.
| int lbann::generic_data_reader::m_num_parallel_readers |
How many iterations all readers will execute.
Definition at line 772 of file data_reader.hpp.
| int lbann::generic_data_reader::m_reset_mini_batch_index |
The index at which this data reader starts its epoch.
Definition at line 763 of file data_reader.hpp.
| std::string lbann::generic_data_reader::m_role |
Definition at line 785 of file data_reader.hpp.
| int lbann::generic_data_reader::m_sample_stride |
Sample stride is used when a mini-batch is finely interleaved across a DATA_PARALLEL distribution.
Definition at line 752 of file data_reader.hpp.
| bool lbann::generic_data_reader::m_shuffle |
Definition at line 780 of file data_reader.hpp.
| std::vector<int> lbann::generic_data_reader::m_shuffled_indices |
Definition at line 756 of file data_reader.hpp.
| int lbann::generic_data_reader::m_stride_to_last_mini_batch |
Definition at line 761 of file data_reader.hpp.
| int lbann::generic_data_reader::m_stride_to_next_mini_batch |
Batch Stride is typically batch_size, but may be a multiple of batch size if there are multiple readers
Definition at line 746 of file data_reader.hpp.
|
protected |
Holds a true value for each input data type that is supported. Use an ordered map so that checkpoints are stable.
Definition at line 814 of file data_reader.hpp.
|
protected |
Transform pipeline for preprocessing data.
Definition at line 829 of file data_reader.hpp.
| unused_index_map_t lbann::generic_data_reader::m_unused_indices |
Record of the indices that are not being used for training.
Definition at line 758 of file data_reader.hpp.
|
protected |
Definition at line 810 of file data_reader.hpp.
| double lbann::generic_data_reader::m_use_fraction |
Definition at line 783 of file data_reader.hpp.
|
mutableprotected |
Definition at line 615 of file data_reader.hpp.
|
protected |
Definition at line 612 of file data_reader.hpp.