LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
lbann::generic_data_reader Class Reference (abstract)

#include <data_reader.hpp>

Inheritance diagram for lbann::generic_data_reader:
[legend]
Collaboration diagram for lbann::generic_data_reader:
[legend]

Public Types

using unused_index_map_t = std::map< execution_mode, std::vector< int > >
 

Public Member Functions

 generic_data_reader (bool shuffle=true)
 
 generic_data_reader (const generic_data_reader &)=default
 
generic_data_reader & operator= (const generic_data_reader &)=default
 
virtual ~generic_data_reader ()
 
virtual generic_data_reader * copy () const =0
 
template<class Archive >
void serialize (Archive &ar)
 
void set_comm (lbann_comm *comm)
 set the comm object More...
 
lbann_comm * get_comm () const
 returns a (possibly nullptr) pointer to comm More...
 
virtual bool has_conduit_output ()
 
void set_file_dir (std::string s)
 
void set_local_file_dir (std::string s)
 
void set_max_files_to_load (size_t n)
 
std::string get_file_dir () const
 
std::string get_local_file_dir () const
 
void set_data_sample_list (std::string s)
 
std::string get_data_sample_list () const
 
void keep_sample_order (bool same_order=false)
 
void set_data_filename (std::string s)
 
std::string get_data_filename () const
 
void set_label_filename (std::string s)
 
std::string get_label_filename () const
 
void set_shuffle (bool b)
 
bool is_shuffled () const
 
void set_shuffled_indices (const std::vector< int > &indices)
 
const std::vector< int > & get_shuffled_indices () const
 
void set_first_n (int n)
 
void set_absolute_sample_count (size_t s)
 
void set_use_fraction (double s)
 
virtual void set_execution_mode_split_fraction (execution_mode m, double s)
 
virtual void set_role (std::string role)
 
std::string get_role () const
 
virtual void load ()=0
 
virtual void setup (int num_io_threads, observer_ptr< thread_pool > io_thread_pool)
 
virtual std::string get_type () const =0
 
int fetch (std::map< data_field_type, CPUMat *> &input_buffers, El::Matrix< El::Int > &indices_fetched, size_t mb_size)
 Fetch a mini-batch worth of data, including samples, labels, responses (as appropriate) More...
 
int fetch (std::vector< conduit::Node > &samples, El::Matrix< El::Int > &indices_fetched, size_t mb_size)
 
virtual bool has_data_field (data_field_type data_field) const
 Check to see if the data reader supports this specific data field. More...
 
virtual bool has_labels () const
 
virtual bool has_responses () const
 
void set_has_data_field (data_field_type const data_field, const bool b)
 Whether or not a data reader has a data field. More...
 
virtual void set_has_labels (const bool b)
 Whether or not a data reader has labels. More...
 
virtual void set_has_responses (const bool b)
 Whether or not a data reader has a response field. More...
 
void start_data_store_mini_batch_exchange ()
 
void finish_data_store_mini_batch_exchange ()
 
virtual bool update (bool is_active_reader)
 
virtual int get_num_labels () const
 Return the number of labels (classes) in this dataset. More...
 
virtual int get_num_responses () const
 Return the number of responses in this dataset. More...
 
virtual int get_linearized_data_size () const
 Get the linearized size (i.e. number of elements) in a sample. More...
 
virtual int get_linearized_label_size () const
 Get the linearized size (i.e. number of elements) in a label. More...
 
virtual int get_linearized_response_size () const
 Get the linearized size (i.e. number of elements) in a response. More...
 
virtual int get_linearized_size (data_field_type const &data_field) const
 Get the linearized size of the data field identified by data_field. More...
 
virtual const std::vector< El::Int > get_data_dims () const
 Get the dimensions of the data. More...
 
virtual std::vector< El::Int > get_slice_points (const slice_points_mode var_category, bool &is_supported)
 
virtual bool position_valid () const
 True if the data reader's current position is valid. More...
 
virtual bool position_is_overrun () const
 
bool at_new_epoch () const
 True if the data reader is at the start of an epoch. More...
 
void set_mini_batch_size (const int s)
 Set the mini batch size. More...
 
int get_mini_batch_size () const
 Get the mini batch size. More...
 
int get_loaded_mini_batch_size () const
 Get the loaded mini-batch size. More...
 
int get_current_mini_batch_size () const
 Get the current mini-batch size. More...
 
int get_mini_batch_max () const
 Return the full mini_batch_size. More...
 
void set_stride_to_next_mini_batch (const int s)
 Set the mini batch stride. More...
 
int get_stride_to_next_mini_batch () const
 Return the mini batch stride. More...
 
void set_sample_stride (const int s)
 Set the sample stride. More...
 
int get_sample_stride () const
 Return the sample stride. More...
 
void set_iteration_stride (const int s)
 Set the iteration stride. More...
 
int get_iteration_stride () const
 Return the iteration stride. More...
 
virtual void set_base_offset (const int s)
 Set the base offset. More...
 
int get_base_offset () const
 Return the base offset. More...
 
void set_last_mini_batch_size (const int s)
 Set the last mini batch size. More...
 
int get_last_mini_batch_size () const
 Return the last mini batch size. More...
 
void set_stride_to_last_mini_batch (const int s)
 Set the last mini batch stride. More...
 
int get_stride_to_last_mini_batch () const
 Return the last mini batch stride. More...
 
void set_num_parallel_readers (const int s)
 Set the number of parallel readers per model. More...
 
int get_num_parallel_readers () const
 Return the number of parallel readers per model. More...
 
virtual void set_reset_mini_batch_index (const int s)
 Set the starting mini-batch index for the epoch. More...
 
int get_reset_mini_batch_index () const
 Return the starting mini-batch index for the epoch. More...
 
int get_loaded_mini_batch_index () const
 Return the loaded mini-batch index for the epoch. More...
 
int get_current_mini_batch_index () const
 Return the current mini-batch index for the epoch. More...
 
void set_initial_position ()
 Set the current position based on the base and model offsets. More...
 
int get_position () const
 Get the current position in the data reader. More...
 
int get_next_position () const
 Get the next position in the data reader. More...
 
int * get_indices ()
 Get a pointer to the start of the shuffled indices. More...
 
virtual int get_num_data () const
 Get the number of samples in this dataset. More...
 
int get_num_unused_data (execution_mode m) const
 Get the number of unused samples in this dataset. More...
 
int * get_unused_data (execution_mode m)
 Get a pointer to the start of the unused sample indices. More...
 
const std::vector< int > & get_unused_indices (execution_mode m)
 
void set_num_iterations_per_epoch (int num_iterations_per_epoch)
 Set the number of iterations in each epoch. More...
 
int get_num_iterations_per_epoch () const
 Get the number of iterations in each epoch. More...
 
int get_current_step_in_epoch () const
 
void resize_shuffled_indices ()
 
void select_subset_of_data ()
 
virtual void use_unused_index_set (execution_mode m)
 
virtual bool has_list_per_model () const
 Does the data reader have a unique sample list per model. More...
 
virtual bool has_list_per_trainer () const
 Does the data reader have a unique sample list per trainer. More...
 
bool save_to_checkpoint_shared (persist &p, execution_mode mode)
 Given directory to store checkpoint files, write state to file and add to number of bytes written. More...
 
bool load_from_checkpoint_shared (persist &p, execution_mode mode)
 Given directory to store checkpoint files, read state from file and add to number of bytes read. More...
 
bool save_to_checkpoint_distributed (persist &p, execution_mode mode)
 
bool load_from_checkpoint_distributed (persist &p, execution_mode mode)
 Given directory to store checkpoint files, read state from file and add to number of bytes read. More...
 
const data_store_conduit & get_data_store () const
 returns a const ref to the data store More...
 
data_store_conduit & get_data_store ()
 returns a non-const ref to the data store More...
 
data_store_conduit * get_data_store_ptr () const
 
void setup_data_store (int mini_batch_size)
 
void instantiate_data_store ()
 
virtual void preload_data_store ()
 
void set_gan_labelling (bool has_gan_labelling)
 
void set_gan_label_value (int gan_label_value)
 
void set_data_store (data_store_conduit *g)
 support of data store functionality More...
 
virtual bool data_store_active () const
 
virtual bool priming_data_store () const
 
virtual void post_update ()
 
void set_transform_pipeline (transform::transform_pipeline &&tp)
 
void print_get_methods (const std::string filename)
 Print the return values from various get_X methods to file. More...
 
size_t get_num_indices_to_use () const
 
void set_use_data_store (bool s)
 

Public Attributes

int m_mini_batch_size
 
int m_current_pos
 
int m_stride_to_next_mini_batch
 
int m_base_offset
 
int m_sample_stride
 
int m_iteration_stride
 Stride used by parallel data readers within the model. More...
 
std::vector< int > m_shuffled_indices
 
unused_index_map_t m_unused_indices
 Record of the indices that are not being used for training. More...
 
int m_last_mini_batch_size
 
int m_stride_to_last_mini_batch
 
int m_reset_mini_batch_index
 The index at which this data reader starts its epoch. More...
 
int m_loaded_mini_batch_idx
 The index of the current mini-batch that has been loaded. More...
 
int m_current_mini_batch_idx
 
int m_num_iterations_per_epoch
 How many iterations all readers will execute. More...
 
int m_num_parallel_readers
 How many parallel readers are being used. More...
 
size_t m_max_files_to_load
 
std::string m_file_dir
 
std::string m_local_file_dir
 
std::string m_data_sample_list
 
std::string m_data_fn
 
std::string m_label_fn
 
bool m_shuffle
 
size_t m_absolute_sample_count
 
std::map< execution_mode, double > m_execution_mode_split_fraction
 
double m_use_fraction
 
int m_first_n
 
std::string m_role
 

Protected Member Functions

size_t get_absolute_sample_count () const
 
double get_use_fraction () const
 
double get_execution_mode_split_fraction (execution_mode m) const
 
virtual bool fetch_data_block (std::map< data_field_type, CPUMat *> &input_buffers, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix< El::Int > &indices_fetched)
 
bool fetch_data_block_conduit (std::vector< conduit::Node > &samples, El::Int block_offset, El::Int block_stride, El::Int mb_size, El::Matrix< El::Int > &indices_fetched)
 
virtual bool fetch_data_field (data_field_type data_field, CPUMat &Y, int data_id, int mb_idx)
 Called by fetch_data, fetch_label, fetch_response. More...
 
virtual bool fetch_conduit_node (conduit::Node &sample, int data_id)
 
virtual bool fetch_datum (CPUMat &X, int data_id, int mb_idx)
 
virtual bool fetch_label (CPUMat &Y, int data_id, int mb_idx)
 
virtual bool fetch_response (CPUMat &Y, int data_id, int mb_idx)
 
CPUMat create_datum_view (CPUMat &X, const int mb_idx)
 
virtual void preprocess_data_source (int tid)
 
virtual void postprocess_data_source (int tid)
 
virtual void shuffle_indices ()
 Shuffle indices (uses the data_seq_generator) More...
 
virtual void shuffle_indices (rng_gen &gen)
 Shuffle indices and provide a random number generator. More...
 
void error_check_counts () const
 

Protected Attributes

bool m_verbose = false
 
std::unordered_set< int > m_using_random_node
 
data_store_conduit * m_data_store
 
lbann_comm * m_comm
 
bool m_use_data_store = false
 
std::map< data_field_type, bool > m_supported_input_types
 Holds a true value for each input data type that is supported. Use an ordered map so that checkpoints are stable. More...
 
bool m_gan_labelling
 
int m_gan_label_value
 
observer_ptr< thread_pool > m_io_thread_pool
 
bool m_keep_sample_order
 
transform::transform_pipeline m_transform_pipeline
 
bool m_issue_warning
 

Private Member Functions

virtual void do_preload_data_store ()
 

Friends

class data_reader_merge_features
 
class data_reader_merge_samples
 

Detailed Description

A data reader manages reading in data in a particular format. This abstract base class manages common functionality. In particular, child classes should implement load and the appropriate subset of fetch_datum, fetch_label, and fetch_response.

Definition at line 75 of file data_reader.hpp.

Member Typedef Documentation

◆ unused_index_map_t

using lbann::generic_data_reader::unused_index_map_t = std::map<execution_mode, std::vector<int> >

Definition at line 78 of file data_reader.hpp.

Constructor & Destructor Documentation

◆ generic_data_reader() [1/2]

lbann::generic_data_reader::generic_data_reader ( bool  shuffle = true)
inline

ctor

Definition at line 83 of file data_reader.hpp.

Here is the call graph for this function:

◆ generic_data_reader() [2/2]

lbann::generic_data_reader::generic_data_reader ( const generic_data_reader & )
default

◆ ~generic_data_reader()

virtual lbann::generic_data_reader::~generic_data_reader ( )
virtual

Member Function Documentation

◆ at_new_epoch()

bool lbann::generic_data_reader::at_new_epoch ( ) const
inline

True if the data reader is at the start of an epoch.

Note that data readers can start at a non-zero index if there are parallel data readers in a model

Definition at line 410 of file data_reader.hpp.

◆ copy()

◆ create_datum_view()

CPUMat lbann::generic_data_reader::create_datum_view ( CPUMat &  X,
const int  mb_idx 
)
inline protected

Create a matrix view of a single column selected by mini-batch index.

Parameters
X: The matrix to load data into.
mb_idx: The index within the mini-batch.
Returns
Single column view of the input matrix

Definition at line 722 of file data_reader.hpp.

◆ data_store_active()

virtual bool lbann::generic_data_reader::data_store_active ( ) const
virtual

Reimplemented in lbann::data_reader_jag_conduit.

Here is the caller graph for this function:

◆ do_preload_data_store()

virtual void lbann::generic_data_reader::do_preload_data_store ( )
private virtual

◆ error_check_counts()

void lbann::generic_data_reader::error_check_counts ( ) const
protected

throws exception if get_absolute_sample_count() and get_use_fraction() are incorrect

◆ fetch() [1/2]

int lbann::generic_data_reader::fetch ( std::map< data_field_type, CPUMat *> &  input_buffers,
El::Matrix< El::Int > &  indices_fetched,
size_t  mb_size 
)

Fetch a mini-batch worth of data, including samples, labels, responses (as appropriate)

◆ fetch() [2/2]

int lbann::generic_data_reader::fetch ( std::vector< conduit::Node > &  samples,
El::Matrix< El::Int > &  indices_fetched,
size_t  mb_size 
)

◆ fetch_conduit_node()

virtual bool lbann::generic_data_reader::fetch_conduit_node ( conduit::Node &  sample,
int  data_id 
)
inline protected virtual

Reimplemented in lbann::hdf5_data_reader.

Definition at line 674 of file data_reader.hpp.

◆ fetch_data_block()

virtual bool lbann::generic_data_reader::fetch_data_block ( std::map< data_field_type, CPUMat *> &  input_buffers,
El::Int  block_offset,
El::Int  block_stride,
El::Int  mb_size,
El::Matrix< El::Int > &  indices_fetched 
)
protected virtual

◆ fetch_data_block_conduit()

bool lbann::generic_data_reader::fetch_data_block_conduit ( std::vector< conduit::Node > &  samples,
El::Int  block_offset,
El::Int  block_stride,
El::Int  mb_size,
El::Matrix< El::Int > &  indices_fetched 
)
protected

◆ fetch_data_field()

virtual bool lbann::generic_data_reader::fetch_data_field ( data_field_type  data_field,
CPUMat &  Y,
int  data_id,
int  mb_idx 
)
inline protected virtual

Called by fetch_data, fetch_label, fetch_response.

Fetch data from a single data field into a matrix.

Parameters
data_field: The name of the data field. May be one of the commonly used fields (samples, labels, responses) or any data_field that exists within an HDF5 experiment schema, Python DR schema, or synthetic data reader.
Y: The matrix to load data into.
data_id: The index of the datum to fetch.
mb_idx: The index within the mini-batch.

Reimplemented in lbann::hdf5_reader< TensorDataType >, and lbann::data_reader_synthetic.

Definition at line 665 of file data_reader.hpp.

◆ fetch_datum()

virtual bool lbann::generic_data_reader::fetch_datum ( CPUMat &  X,
int  data_id,
int  mb_idx 
)
inline protected virtual

◆ fetch_label()

virtual bool lbann::generic_data_reader::fetch_label ( CPUMat &  Y,
int  data_id,
int  mb_idx 
)
inline protected virtual

◆ fetch_response()

virtual bool lbann::generic_data_reader::fetch_response ( CPUMat &  Y,
int  data_id,
int  mb_idx 
)
inline protected virtual

Fetch a single response into a matrix.

Parameters
Y: The matrix to load data into.
data_id: The index of the datum to fetch.
mb_idx: The index within the mini-batch.

Reimplemented in lbann::data_reader_jag_conduit, lbann::smiles_data_reader, lbann::ras_lipid_conduit_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::numpy_npz_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::mesh_reader, and lbann::numpy_reader.

Definition at line 710 of file data_reader.hpp.

◆ finish_data_store_mini_batch_exchange()

void lbann::generic_data_reader::finish_data_store_mini_batch_exchange ( )

◆ get_absolute_sample_count()

size_t lbann::generic_data_reader::get_absolute_sample_count ( ) const
protected

Return the absolute number of data samples that will be used for training or testing.

◆ get_base_offset()

int lbann::generic_data_reader::get_base_offset ( ) const
inline

Return the base offset.

Definition at line 448 of file data_reader.hpp.

◆ get_comm()

lbann_comm* lbann::generic_data_reader::get_comm ( ) const
inline

returns a (possibly nullptr) pointer to comm

Definition at line 132 of file data_reader.hpp.

◆ get_current_mini_batch_index()

int lbann::generic_data_reader::get_current_mini_batch_index ( ) const
inline

Return the current mini-batch index for the epoch.

Definition at line 477 of file data_reader.hpp.

◆ get_current_mini_batch_size()

int lbann::generic_data_reader::get_current_mini_batch_size ( ) const

Get the current mini-batch size.

◆ get_current_step_in_epoch()

int lbann::generic_data_reader::get_current_step_in_epoch ( ) const
inline

Return the index of the current iteration step in the epoch (also the mini-batch index)

Definition at line 517 of file data_reader.hpp.

◆ get_data_dims()

◆ get_data_filename()

std::string lbann::generic_data_reader::get_data_filename ( ) const

Returns the complete filepath to your data file. See the note for set_file_dir().

◆ get_data_sample_list()

std::string lbann::generic_data_reader::get_data_sample_list ( ) const

Returns the complete sample list for your data set.

◆ get_data_store() [1/2]

const data_store_conduit& lbann::generic_data_reader::get_data_store ( ) const

returns a const ref to the data store

◆ get_data_store() [2/2]

data_store_conduit& lbann::generic_data_reader::get_data_store ( )

returns a non-const ref to the data store

◆ get_data_store_ptr()

data_store_conduit* lbann::generic_data_reader::get_data_store_ptr ( ) const
inline

Definition at line 565 of file data_reader.hpp.

◆ get_execution_mode_split_fraction()

double lbann::generic_data_reader::get_execution_mode_split_fraction ( execution_mode  m) const
protected

Return the fraction of the dataset to be used for other execution modes such as validation or tournament.

◆ get_file_dir()

std::string lbann::generic_data_reader::get_file_dir ( ) const

Returns the base directory for your data. If set_file_dir was not called, returns the empty string.

◆ get_indices()

int* lbann::generic_data_reader::get_indices ( )
inline

Get a pointer to the start of the shuffled indices.

Definition at line 490 of file data_reader.hpp.

◆ get_iteration_stride()

int lbann::generic_data_reader::get_iteration_stride ( ) const
inline

Return the iteration stride.

Definition at line 444 of file data_reader.hpp.

◆ get_label_filename()

std::string lbann::generic_data_reader::get_label_filename ( ) const

Returns the complete filepath to your label file. See the note for set_file_dir(). Note: some pipelines (autoencoders) will not make use of this method.

◆ get_last_mini_batch_size()

int lbann::generic_data_reader::get_last_mini_batch_size ( ) const
inline

Return the last mini batch size.

Definition at line 452 of file data_reader.hpp.

◆ get_linearized_data_size()

◆ get_linearized_label_size()

virtual int lbann::generic_data_reader::get_linearized_label_size ( ) const
inline virtual

Get the linearized size (i.e. number of elements) in a label.

Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::image_data_reader, lbann::numpy_npz_conduit_reader, lbann::data_reader_merge_features, lbann::numpy_npz_reader, lbann::data_reader_merge_samples, lbann::ras_lipid_conduit_data_reader, lbann::smiles_data_reader, and lbann::numpy_reader.

Definition at line 376 of file data_reader.hpp.

Here is the caller graph for this function:

◆ get_linearized_response_size()

virtual int lbann::generic_data_reader::get_linearized_response_size ( ) const
inline virtual

Get the linearized size (i.e. number of elements) in a response.

Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::data_reader_merge_samples, lbann::ras_lipid_conduit_data_reader, lbann::numpy_npz_conduit_reader, lbann::smiles_data_reader, lbann::numpy_npz_reader, and lbann::mesh_reader.

Definition at line 378 of file data_reader.hpp.

Here is the caller graph for this function:

◆ get_linearized_size()

virtual int lbann::generic_data_reader::get_linearized_size ( data_field_type const &  data_field) const
virtual

Get the linearized size of the data field identified by data_field.

Reimplemented in lbann::hdf5_data_reader, and lbann::data_reader_synthetic.

Here is the caller graph for this function:

◆ get_loaded_mini_batch_index()

int lbann::generic_data_reader::get_loaded_mini_batch_index ( ) const
inline

Return the loaded mini-batch index for the epoch.

Definition at line 475 of file data_reader.hpp.

◆ get_loaded_mini_batch_size()

int lbann::generic_data_reader::get_loaded_mini_batch_size ( ) const

Get the loaded mini-batch size.

◆ get_local_file_dir()

std::string lbann::generic_data_reader::get_local_file_dir ( ) const

Returns the base directory for caching files on local SSD. If set_local_file_dir was not called, returns the empty string.

◆ get_mini_batch_max()

int lbann::generic_data_reader::get_mini_batch_max ( ) const
inline

Return the full mini_batch_size.

Definition at line 426 of file data_reader.hpp.

◆ get_mini_batch_size()

int lbann::generic_data_reader::get_mini_batch_size ( ) const
inline

Get the mini batch size.

Definition at line 420 of file data_reader.hpp.

◆ get_next_position()

int lbann::generic_data_reader::get_next_position ( ) const

Get the next position in the data reader.

◆ get_num_data()

virtual int lbann::generic_data_reader::get_num_data ( ) const
inline virtual

Get the number of samples in this dataset.

Reimplemented in lbann::data_reader_jag_conduit.

Definition at line 492 of file data_reader.hpp.

◆ get_num_indices_to_use()

size_t lbann::generic_data_reader::get_num_indices_to_use ( ) const

Returns the number of the shuffled indices that are to be used. Code in this method was formerly in select_subset_of_data()

◆ get_num_iterations_per_epoch()

int lbann::generic_data_reader::get_num_iterations_per_epoch ( ) const
inline

Get the number of iterations in each epoch.

Todo:
BVE FIXME merge this with alternate approach

Definition at line 509 of file data_reader.hpp.

◆ get_num_labels()

virtual int lbann::generic_data_reader::get_num_labels ( ) const
inline virtual

Return the number of labels (classes) in this dataset.

This is called at the end of update; it permits data readers to perform actions that are specific to their data sets; for example, data_reader_jag_conduit_hdf5 has the 'primary' data reader broadcast its shuffled indices to the other data readers. In general, most data readers will probably not override this method. It may also be called outside of update. (Note: this paragraph appears to describe post_update() rather than get_num_labels(); confirm against data_reader.hpp.)

Reimplemented in lbann::data_reader_jag_conduit, lbann::hdf5_data_reader, lbann::csv_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::ras_lipid_conduit_data_reader, lbann::smiles_data_reader, lbann::image_data_reader, lbann::data_reader_merge_features, lbann::numpy_npz_conduit_reader, lbann::numpy_npz_reader, lbann::numpy_reader, and lbann::data_reader_merge_samples.

Definition at line 370 of file data_reader.hpp.

Here is the caller graph for this function:

◆ get_num_parallel_readers()

int lbann::generic_data_reader::get_num_parallel_readers ( ) const
inline

Return the number of parallel readers per model.

Definition at line 466 of file data_reader.hpp.

◆ get_num_responses()

virtual int lbann::generic_data_reader::get_num_responses ( ) const
inline virtual

Return the number of responses in this dataset.

Reimplemented in lbann::hdf5_data_reader, lbann::hdf5_reader< TensorDataType >, lbann::data_reader_synthetic, lbann::numpy_npz_conduit_reader, lbann::numpy_npz_reader, and lbann::data_reader_merge_samples.

Definition at line 372 of file data_reader.hpp.

Here is the caller graph for this function:

◆ get_num_unused_data()

int lbann::generic_data_reader::get_num_unused_data ( execution_mode  m) const

Get the number of unused samples in this dataset.

◆ get_position()

int lbann::generic_data_reader::get_position ( ) const
inline

Get the current position in the data reader.

Definition at line 486 of file data_reader.hpp.

◆ get_reset_mini_batch_index()

int lbann::generic_data_reader::get_reset_mini_batch_index ( ) const
inline

Return the starting mini-batch index for the epoch.

Definition at line 473 of file data_reader.hpp.

◆ get_role()

std::string lbann::generic_data_reader::get_role ( ) const
inline

Get the role for this dataset.

Definition at line 279 of file data_reader.hpp.

Here is the call graph for this function:

◆ get_sample_stride()

int lbann::generic_data_reader::get_sample_stride ( ) const
inline

Return the sample stride.

Definition at line 440 of file data_reader.hpp.

◆ get_shuffled_indices()

const std::vector<int>& lbann::generic_data_reader::get_shuffled_indices ( ) const
inline

Returns the shuffled indices; primary use is for testing.

Definition at line 236 of file data_reader.hpp.

◆ get_slice_points()

virtual std::vector<El::Int> lbann::generic_data_reader::get_slice_points ( const slice_points_mode  var_category,
bool &  is_supported 
)
inline virtual

Reimplemented in lbann::data_reader_jag_conduit.

Definition at line 389 of file data_reader.hpp.

◆ get_stride_to_last_mini_batch()

int lbann::generic_data_reader::get_stride_to_last_mini_batch ( ) const
inline

Return the last mini batch stride.

Definition at line 459 of file data_reader.hpp.

◆ get_stride_to_next_mini_batch()

int lbann::generic_data_reader::get_stride_to_next_mini_batch ( ) const
inline

Return the mini batch stride.

Definition at line 433 of file data_reader.hpp.

◆ get_type()

◆ get_unused_data()

int* lbann::generic_data_reader::get_unused_data ( execution_mode  m)

Get a pointer to the start of the unused sample indices.

◆ get_unused_indices()

const std::vector<int>& lbann::generic_data_reader::get_unused_indices ( execution_mode  m)

◆ get_use_fraction()

double lbann::generic_data_reader::get_use_fraction ( ) const
protected

Returns the fraction of the dataset to be used for training or testing. If training, this is the total for training and validation. Throws if set_use_fraction was not called.

◆ has_conduit_output()

virtual bool lbann::generic_data_reader::has_conduit_output ( )
inline virtual

Reimplemented in lbann::hdf5_data_reader.

Definition at line 134 of file data_reader.hpp.

◆ has_data_field()

virtual bool lbann::generic_data_reader::has_data_field ( data_field_type  data_field) const
inline virtual

Check to see if the data reader supports this specific data field.

Definition at line 313 of file data_reader.hpp.

◆ has_labels()

virtual bool lbann::generic_data_reader::has_labels ( ) const
inline virtual

Reimplemented in lbann::generic_compound_data_reader.

Definition at line 324 of file data_reader.hpp.

Here is the caller graph for this function:

◆ has_list_per_model()

virtual bool lbann::generic_data_reader::has_list_per_model ( ) const
inline virtual

Does the data reader have a unique sample list per model.

Reimplemented in lbann::data_reader_jag_conduit.

Definition at line 541 of file data_reader.hpp.

◆ has_list_per_trainer()

virtual bool lbann::generic_data_reader::has_list_per_trainer ( ) const
inline virtual

Does the data reader have a unique sample list per trainer.

Reimplemented in lbann::data_reader_jag_conduit.

Definition at line 543 of file data_reader.hpp.

◆ has_responses()

virtual bool lbann::generic_data_reader::has_responses ( ) const
inline virtual

Reimplemented in lbann::generic_compound_data_reader.

Definition at line 328 of file data_reader.hpp.

Here is the caller graph for this function:

◆ instantiate_data_store()

void lbann::generic_data_reader::instantiate_data_store ( )

◆ is_shuffled()

bool lbann::generic_data_reader::is_shuffled ( ) const
inline

Returns true if data samples are shuffled.

Definition at line 222 of file data_reader.hpp.

◆ keep_sample_order()

void lbann::generic_data_reader::keep_sample_order ( bool  same_order = false)

To facilitate testing, maintain the order of loaded samples in the sample list as it is in the list file.

◆ load()

◆ load_from_checkpoint_distributed()

bool lbann::generic_data_reader::load_from_checkpoint_distributed ( persist &  p,
execution_mode  mode 
)

Given directory to store checkpoint files, read state from file and add to number of bytes read.

◆ load_from_checkpoint_shared()

bool lbann::generic_data_reader::load_from_checkpoint_shared ( persist &  p,
execution_mode  mode 
)

Given directory to store checkpoint files, read state from file and add to number of bytes read.

◆ operator=()

generic_data_reader& lbann::generic_data_reader::operator= ( const generic_data_reader & )
default
Here is the caller graph for this function:

◆ position_is_overrun()

virtual bool lbann::generic_data_reader::position_is_overrun ( ) const
inline virtual

True if the data reader's current position is not valid but within # ranks per model of the end of the data set (e.g. it is a rank with no valid data on the last iteration)

Definition at line 403 of file data_reader.hpp.

◆ position_valid()

virtual bool lbann::generic_data_reader::position_valid ( ) const
inline virtual

True if the data reader's current position is valid.

Definition at line 396 of file data_reader.hpp.

◆ post_update()

virtual void lbann::generic_data_reader::post_update ( )
inline virtual

experimental; used to ensure all readers for jag_conduit_hdf5 have identical shuffled indices

Definition at line 595 of file data_reader.hpp.

◆ postprocess_data_source()

virtual void lbann::generic_data_reader::postprocess_data_source ( int  tid)
inline protected virtual

Called after fetch_datum/label/response to allow initialization.

Definition at line 734 of file data_reader.hpp.

◆ preload_data_store()

virtual void lbann::generic_data_reader::preload_data_store ( )
virtual

◆ preprocess_data_source()

virtual void lbann::generic_data_reader::preprocess_data_source ( int  tid)
inline protected virtual

Called before fetch_datum/label/response to allow initialization.

Definition at line 730 of file data_reader.hpp.

◆ priming_data_store()

virtual bool lbann::generic_data_reader::priming_data_store ( ) const
virtual

Reimplemented in lbann::data_reader_jag_conduit.

Here is the caller graph for this function:

◆ print_get_methods()

void lbann::generic_data_reader::print_get_methods ( const std::string  filename)

Print the return values from various get_X methods to file.

For use in unit testing. Only the master prints. Currently only prints values from get_X methods that depend solely on the data_reader (i.e., not on the trainer, model, etc.).

◆ resize_shuffled_indices()

void lbann::generic_data_reader::resize_shuffled_indices ( )

Optionally resizes the shuffled indices based on the data reader prototext settings: absolute_sample_count, fraction_of_data_to_use. (dah - this was formerly part of select_subset_of_data)

◆ save_to_checkpoint_distributed()

bool lbann::generic_data_reader::save_to_checkpoint_distributed ( persist p,
execution_mode  mode 
)

◆ save_to_checkpoint_shared()

bool lbann::generic_data_reader::save_to_checkpoint_shared ( persist p,
execution_mode  mode 
)

Given directory to store checkpoint files, write state to file and add to number of bytes written.

◆ select_subset_of_data()

void lbann::generic_data_reader::select_subset_of_data ( )

Select the appropriate subset of data for the additional execution modes such as validation or tournament set based on the data reader prototext setting: validation_fraction or tournament_fraction

◆ serialize()

template<class Archive >
void lbann::generic_data_reader::serialize ( Archive &  ar)

Archive for checkpoint and restart

◆ set_absolute_sample_count()

void lbann::generic_data_reader::set_absolute_sample_count ( size_t  s)

Sets the absolute number of data samples that will be used for training or testing.

◆ set_base_offset()

virtual void lbann::generic_data_reader::set_base_offset ( const int  s)
inlinevirtual

Set the base offset.

Definition at line 446 of file data_reader.hpp.

◆ set_comm()

void lbann::generic_data_reader::set_comm ( lbann_comm * comm)
inline

set the comm object

Definition at line 129 of file data_reader.hpp.

◆ set_data_filename()

void lbann::generic_data_reader::set_data_filename ( std::string  s)

Set the filename for your data (images, etc.). This may either be a complete filepath, or a subdirectory; see note for set_file_dir(). Also, use this method for cases where the file contains a list of files (e.g., imagenet).

◆ set_data_sample_list()

void lbann::generic_data_reader::set_data_sample_list ( std::string  s)

Set the sample list for your data (images, etc.). The sample list contains an enumeration of all samples in the data set.

◆ set_data_store()

void lbann::generic_data_reader::set_data_store ( data_store_conduit g)

support of data store functionality

◆ set_execution_mode_split_fraction()

virtual void lbann::generic_data_reader::set_execution_mode_split_fraction ( execution_mode  m,
double  s 
)
virtual

Sets the fraction of the dataset to be used for the given execution mode (e.g., validation).

Parameters
m: The execution mode.
s: The fraction used, in the range [0, 1].

Reimplemented in lbann::generic_compound_data_reader.

Here is the caller graph for this function:

◆ set_file_dir()

void lbann::generic_data_reader::set_file_dir ( std::string  s)

Set base directory for your data.

◆ set_first_n()

void lbann::generic_data_reader::set_first_n ( int  n)

Read the first 'n' samples. If nonzero, this overrides set_absolute_sample_count and set_use_fraction. The intent is to use this for testing. A problem with set_absolute_sample_count and set_use_fraction is that the entire data set is read in, and only then is a subset selected.

◆ set_gan_label_value()

void lbann::generic_data_reader::set_gan_label_value ( int  gan_label_value)
inline

Definition at line 581 of file data_reader.hpp.

◆ set_gan_labelling()

void lbann::generic_data_reader::set_gan_labelling ( bool  has_gan_labelling)
inline

Definition at line 577 of file data_reader.hpp.

◆ set_has_data_field()

void lbann::generic_data_reader::set_has_data_field ( data_field_type const  data_field,
const bool  b 
)
inline

Whether or not a data reader has a data field.

Definition at line 334 of file data_reader.hpp.

◆ set_has_labels()

virtual void lbann::generic_data_reader::set_has_labels ( const bool  b)
inlinevirtual

Whether or not a data reader has labels.

Reimplemented in lbann::generic_compound_data_reader.

Definition at line 340 of file data_reader.hpp.

◆ set_has_responses()

virtual void lbann::generic_data_reader::set_has_responses ( const bool  b)
inlinevirtual

Whether or not a data reader has a response field.

Reimplemented in lbann::generic_compound_data_reader.

Definition at line 345 of file data_reader.hpp.

◆ set_initial_position()

void lbann::generic_data_reader::set_initial_position ( )
inline

Set the current position based on the base and model offsets.

Definition at line 479 of file data_reader.hpp.

◆ set_iteration_stride()

void lbann::generic_data_reader::set_iteration_stride ( const int  s)
inline

Set the iteration stride.

Definition at line 442 of file data_reader.hpp.

◆ set_label_filename()

void lbann::generic_data_reader::set_label_filename ( std::string  s)

Set the filename for your labels. This may either be a complete filepath, or a subdirectory; see note for set_file_dir().

◆ set_last_mini_batch_size()

void lbann::generic_data_reader::set_last_mini_batch_size ( const int  s)
inline

Set the last mini batch size.

Definition at line 450 of file data_reader.hpp.

◆ set_local_file_dir()

void lbann::generic_data_reader::set_local_file_dir ( std::string  s)

Set base directory for your locally cached (e.g, on ssd) data.

◆ set_max_files_to_load()

void lbann::generic_data_reader::set_max_files_to_load ( size_t  n)
inline

For some data readers (e.g., jag_conduit) we load from multiple files; for testing we want to be able to restrict that number.

Definition at line 153 of file data_reader.hpp.

◆ set_mini_batch_size()

void lbann::generic_data_reader::set_mini_batch_size ( const int  s)

Set the mini batch size.

◆ set_num_iterations_per_epoch()

void lbann::generic_data_reader::set_num_iterations_per_epoch ( int  num_iterations_per_epoch)
inline

Set the number of iterations in each epoch.

Todo:
BVE FIXME merge this with alternate approach

Definition at line 502 of file data_reader.hpp.

◆ set_num_parallel_readers()

void lbann::generic_data_reader::set_num_parallel_readers ( const int  s)
inline

Set the number of parallel readers per model.

Definition at line 464 of file data_reader.hpp.

◆ set_reset_mini_batch_index()

virtual void lbann::generic_data_reader::set_reset_mini_batch_index ( const int  s)
inlinevirtual

Set the starting mini-batch index for the epoch.

Definition at line 468 of file data_reader.hpp.

◆ set_role()

virtual void lbann::generic_data_reader::set_role ( std::string  role)
virtual

Set an identifier for the dataset. The role should be one of "train", "test", or "validate".

Reimplemented in lbann::generic_compound_data_reader.

Here is the caller graph for this function:

◆ set_sample_stride()

void lbann::generic_data_reader::set_sample_stride ( const int  s)
inline

Set the sample stride.

Definition at line 438 of file data_reader.hpp.

◆ set_shuffle()

void lbann::generic_data_reader::set_shuffle ( bool  b)
inline

If set to false, indices (data samples) are not shuffled. The default (set in the constructor) is true.

Definition at line 217 of file data_reader.hpp.

◆ set_shuffled_indices()

void lbann::generic_data_reader::set_shuffled_indices ( const std::vector< int > &  indices)
inline

Set shuffled indices; primary use is for testing and reproducibility

Definition at line 228 of file data_reader.hpp.

◆ set_stride_to_last_mini_batch()

void lbann::generic_data_reader::set_stride_to_last_mini_batch ( const int  s)
inline

Set the last mini batch stride.

Definition at line 454 of file data_reader.hpp.

◆ set_stride_to_next_mini_batch()

void lbann::generic_data_reader::set_stride_to_next_mini_batch ( const int  s)
inline

Set the mini batch stride.

Definition at line 428 of file data_reader.hpp.

◆ set_transform_pipeline()

void lbann::generic_data_reader::set_transform_pipeline ( transform::transform_pipeline &&  tp)
inline

Set the transform pipeline this data reader will use.

Definition at line 598 of file data_reader.hpp.

◆ set_use_data_store()

void lbann::generic_data_reader::set_use_data_store ( bool  s)
inline

Definition at line 804 of file data_reader.hpp.

◆ set_use_fraction()

void lbann::generic_data_reader::set_use_fraction ( double  s)

Set the fraction of the data set to use for training and validation or testing.

Parameters
s: The fraction used, in the range [0, 1].

◆ setup()

virtual void lbann::generic_data_reader::setup ( int  num_io_threads,
observer_ptr< thread_pool > io_thread_pool 
)
virtual

Prepare to start processing an epoch of data. If shuffle is true, then shuffle the indices of the data set. If the base offset is not specified, set it to 0. If the stride is not specified, set it to the batch size.

Reimplemented in lbann::csv_reader, lbann::data_reader_jag_conduit, lbann::image_data_reader, lbann::mesh_reader, and lbann::data_reader_merge_features.

◆ setup_data_store()

void lbann::generic_data_reader::setup_data_store ( int  mini_batch_size)

Sets up a data_store; this is called from build_model_from_prototext() in utils/lbann_library.cpp. This is a bit awkward: we would like to call it when we instantiate the data_store, but we don't know the mini_batch_size until later.

◆ shuffle_indices() [1/2]

virtual void lbann::generic_data_reader::shuffle_indices ( )
protectedvirtual

Shuffle indices (uses the data_seq_generator)

Here is the caller graph for this function:

◆ shuffle_indices() [2/2]

virtual void lbann::generic_data_reader::shuffle_indices ( rng_gen gen)
protectedvirtual

◆ start_data_store_mini_batch_exchange()

void lbann::generic_data_reader::start_data_store_mini_batch_exchange ( )

◆ update()

virtual bool lbann::generic_data_reader::update ( bool  is_active_reader)
virtual

During the network's update phase, the data reader will advance the current position pointer. If the pointer wraps around, then reshuffle the data indices.

◆ use_unused_index_set()

virtual void lbann::generic_data_reader::use_unused_index_set ( execution_mode  m)
virtual

Replace the shuffled index set with the unused index set, emptying the unused set.

Reimplemented in lbann::smiles_data_reader.

Friends And Related Function Documentation

◆ data_reader_merge_features

friend class data_reader_merge_features
friend

Definition at line 801 of file data_reader.hpp.

◆ data_reader_merge_samples

friend class data_reader_merge_samples
friend

Definition at line 802 of file data_reader.hpp.

Member Data Documentation

◆ m_absolute_sample_count

size_t lbann::generic_data_reader::m_absolute_sample_count

Definition at line 781 of file data_reader.hpp.

◆ m_base_offset

int lbann::generic_data_reader::m_base_offset

If there are multiple instances of the reader, then it may not reset to zero

Definition at line 749 of file data_reader.hpp.

◆ m_comm

lbann_comm* lbann::generic_data_reader::m_comm
protected

Definition at line 638 of file data_reader.hpp.

◆ m_current_mini_batch_idx

int lbann::generic_data_reader::m_current_mini_batch_idx

The index of the current mini-batch that is being processed (train/test/validate)

Definition at line 768 of file data_reader.hpp.

◆ m_current_pos

int lbann::generic_data_reader::m_current_pos

Definition at line 743 of file data_reader.hpp.

◆ m_data_fn

std::string lbann::generic_data_reader::m_data_fn

Definition at line 778 of file data_reader.hpp.

◆ m_data_sample_list

std::string lbann::generic_data_reader::m_data_sample_list

Definition at line 777 of file data_reader.hpp.

◆ m_data_store

data_store_conduit* lbann::generic_data_reader::m_data_store
protected

Definition at line 636 of file data_reader.hpp.

◆ m_execution_mode_split_fraction

std::map<execution_mode, double> lbann::generic_data_reader::m_execution_mode_split_fraction

Definition at line 782 of file data_reader.hpp.

◆ m_file_dir

std::string lbann::generic_data_reader::m_file_dir

Definition at line 775 of file data_reader.hpp.

◆ m_first_n

int lbann::generic_data_reader::m_first_n

Definition at line 784 of file data_reader.hpp.

◆ m_gan_label_value

int lbann::generic_data_reader::m_gan_label_value
protected

Definition at line 819 of file data_reader.hpp.

◆ m_gan_labelling

bool lbann::generic_data_reader::m_gan_labelling
protected

Definition at line 817 of file data_reader.hpp.

◆ m_io_thread_pool

observer_ptr<thread_pool> lbann::generic_data_reader::m_io_thread_pool
protected

Definition at line 822 of file data_reader.hpp.

◆ m_issue_warning

bool lbann::generic_data_reader::m_issue_warning
protected

For use with data_store: issue a warning a single time if m_data_store != nullptr, but we're not retrieving a conduit::Node from the store. This typically occurs during the test phase.

Definition at line 834 of file data_reader.hpp.

◆ m_iteration_stride

int lbann::generic_data_reader::m_iteration_stride

Stride used by parallel data readers within the model.

Definition at line 754 of file data_reader.hpp.

◆ m_keep_sample_order

bool lbann::generic_data_reader::m_keep_sample_order
protected

Whether to keep the order of loaded samples the same as it is in the file, to make testing and validation easier.

Definition at line 826 of file data_reader.hpp.

◆ m_label_fn

std::string lbann::generic_data_reader::m_label_fn

Definition at line 779 of file data_reader.hpp.

◆ m_last_mini_batch_size

int lbann::generic_data_reader::m_last_mini_batch_size

Definition at line 760 of file data_reader.hpp.

◆ m_loaded_mini_batch_idx

int lbann::generic_data_reader::m_loaded_mini_batch_idx

The index of the current mini-batch that has been loaded.

Definition at line 765 of file data_reader.hpp.

◆ m_local_file_dir

std::string lbann::generic_data_reader::m_local_file_dir

Definition at line 776 of file data_reader.hpp.

◆ m_max_files_to_load

size_t lbann::generic_data_reader::m_max_files_to_load

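Maximum number of files to load; used to restrict loading, for testing, in data readers (e.g., jag_conduit) that load from multiple files. (The generated description "How many parallel readers are being used." appears to belong to m_num_parallel_readers.)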

Definition at line 774 of file data_reader.hpp.

◆ m_mini_batch_size

int lbann::generic_data_reader::m_mini_batch_size

Definition at line 742 of file data_reader.hpp.

◆ m_num_iterations_per_epoch

int lbann::generic_data_reader::m_num_iterations_per_epoch

Definition at line 770 of file data_reader.hpp.

◆ m_num_parallel_readers

int lbann::generic_data_reader::m_num_parallel_readers

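How many parallel readers are being used. (The generated description "How many iterations all readers will execute." appears to belong to m_num_iterations_per_epoch.)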

Definition at line 772 of file data_reader.hpp.

◆ m_reset_mini_batch_index

int lbann::generic_data_reader::m_reset_mini_batch_index

The index at which this data reader starts its epoch.

Definition at line 763 of file data_reader.hpp.

◆ m_role

std::string lbann::generic_data_reader::m_role

Definition at line 785 of file data_reader.hpp.

◆ m_sample_stride

int lbann::generic_data_reader::m_sample_stride

Sample stride is used when a mini-batch is finely interleaved across a DATA_PARALLEL distribution.

Definition at line 752 of file data_reader.hpp.

◆ m_shuffle

bool lbann::generic_data_reader::m_shuffle

Definition at line 780 of file data_reader.hpp.

◆ m_shuffled_indices

std::vector<int> lbann::generic_data_reader::m_shuffled_indices

Definition at line 756 of file data_reader.hpp.

◆ m_stride_to_last_mini_batch

int lbann::generic_data_reader::m_stride_to_last_mini_batch

Definition at line 761 of file data_reader.hpp.

◆ m_stride_to_next_mini_batch

int lbann::generic_data_reader::m_stride_to_next_mini_batch

Batch Stride is typically batch_size, but may be a multiple of batch size if there are multiple readers

Definition at line 746 of file data_reader.hpp.

◆ m_supported_input_types

std::map<data_field_type, bool> lbann::generic_data_reader::m_supported_input_types
protected

Holds a true value for each input data type that is supported. Use an ordered map so that checkpoints are stable.

Definition at line 814 of file data_reader.hpp.

◆ m_transform_pipeline

transform::transform_pipeline lbann::generic_data_reader::m_transform_pipeline
protected

Transform pipeline for preprocessing data.

Definition at line 829 of file data_reader.hpp.

◆ m_unused_indices

unused_index_map_t lbann::generic_data_reader::m_unused_indices

Record of the indices that are not being used for training.

Definition at line 758 of file data_reader.hpp.

◆ m_use_data_store

bool lbann::generic_data_reader::m_use_data_store = false
protected

Definition at line 810 of file data_reader.hpp.

◆ m_use_fraction

double lbann::generic_data_reader::m_use_fraction

Definition at line 783 of file data_reader.hpp.

◆ m_using_random_node

std::unordered_set<int> lbann::generic_data_reader::m_using_random_node
mutableprotected

Definition at line 615 of file data_reader.hpp.

◆ m_verbose

bool lbann::generic_data_reader::m_verbose = false
protected

Definition at line 612 of file data_reader.hpp.


The documentation for this class was generated from the following file: