LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
lbann::data_store_conduit Class Reference

#include <data_store_conduit.hpp>

Collaboration diagram for lbann::data_store_conduit:
[legend]

Public Types

using map_ii_t = std::unordered_map< int, int >
 
using map_is_t = std::unordered_map< int, size_t >
 
using map_pssi_t = std::unordered_map< std::pair< size_t, size_t >, int, size_t_pair_hash >
 
using map_ss_t = std::unordered_map< size_t, size_t >
 

Public Member Functions

 data_store_conduit (generic_data_reader *reader)
 ctor More...
 
 data_store_conduit (const data_store_conduit &)
 copy ctor More...
 
 data_store_conduit (const data_store_conduit &, const std::vector< int > &)
 copy / split ctor More...
 
data_store_conduit & operator= (const data_store_conduit &)
 operator= More...
 
data_store_conduit * copy () const
 
 ~data_store_conduit ()
 dtor More...
 
void set_data_reader_ptr (generic_data_reader *reader)
 
void set_shuffled_indices (const std::vector< int > *indices)
 convenience handle More...
 
size_t get_num_global_indices () const
 Returns the number of samples summed over all ranks. More...
 
void setup (int mini_batch_size)
 
void check_mem_capacity (lbann_comm *comm, const std::string sample_list_file, size_t stride, size_t offset)
 
const conduit::Node & get_conduit_node (int data_id) const
 Returns the conduit Node associated with the data_id. More...
 
void set_conduit_node (int data_id, const conduit::Node &node, bool already_have=false)
 Set a conduit node in the data store. More...
 
void set_preloaded_conduit_node (int data_id, const conduit::Node &node)
 
void spill_preloaded_conduit_node (int data_id, const conduit::Node &node)
 
const conduit::Node & get_random_node () const
 
const conduit::Node & get_random_node (const std::string &field) const
 
conduit::Node & get_empty_node (int data_id)
 returns an empty node More...
 
bool is_preloading () const
 Returns true if preloading is turned on. More...
 
bool is_explicitly_loading () const
 Returns true if explicitly loading is turned on. More...
 
bool is_fully_loaded () const
 Returns true if all loading has been completed. More...
 
bool is_local_cache () const
 Returns "true" if running in local cache mode. More...
 
void set_is_preloading (bool flag)
 Turn preloading on or off. More...
 
void set_is_explicitly_loading (bool flag)
 Turn on explicit loading. More...
 
void set_loading_is_complete ()
 Marks the data_store as fully loaded. More...
 
void set_is_local_cache (bool flag=true)
 turns local cache mode on or off More...
 
void check_query_flags () const
 Check that explicit loading, preloading, and fully loaded flags are consistent. More...
 
void exchange_owner_maps ()
 fills in m_owner, which maps index -> owning processor More...
 
void build_preloaded_owner_map (const std::vector< int > &per_rank_list_sizes)
 fills in m_owner, which maps index -> owning processor More...
 
void set_preloaded_owner_map (const std::unordered_map< int, int > &owner)
 fills in m_owner, which maps index -> owning processor More...
 
void clear_owner_map ()
 Special handling for ras_lipid_conduit_data_reader; may go away in the future. More...
 
void set_owner_map (const std::unordered_map< int, int > &m)
 
void add_owner (int data_id, int owner)
 Special handling for ras_lipid_conduit_data_reader; may go away in the future. More...
 
void set_finished_building_map ()
 Special handling for ras_lipid_conduit_data_reader; may go away in the future. More...
 
void compact_nodes ()
 
int get_index_owner (int idx)
 
void preload_local_cache ()
 Read the data set into memory. More...
 
void start_exchange_mini_batch_data (size_t current_pos, size_t mb_size)
 
void finish_exchange_mini_batch_data ()
 
void set_node_sizes_vary ()
 
bool has_conduit_node (int data_id) const
 
int get_data_size ()
 for use during development and debugging More...
 
void copy_members (const data_store_conduit &rhs)
 made public for debugging during development More...
 
void flush_debug_file ()
 Closes then reopens the debug logging file. More...
 
void flush_profile_file () const
 Closes then reopens the profile logging file. More...
 
void write_checkpoint (std::string dir_name)
 Writes object's state to file. More...
 
void load_checkpoint (std::string dir_name, generic_data_reader *reader=nullptr)
 Loads object's state from file. More...
 
void set_profile_msg (std::string)
 Add text to the profiling file, if it's opened. More...
 
bool test_local_cache_imagenet (int n)
 Runs an internal test to ensure the locally cached conduit data is correct. More...
 
void test_imagenet_node (int sample_id, bool dereference=true)
 
size_t get_mem_usage ()
 

Public Attributes

std::ofstream * m_debug = nullptr
 
std::ofstream * m_profile = nullptr
 

Private Member Functions

void start_exchange_data_by_sample (size_t current_pos, size_t mb_size)
 
void finish_exchange_data_by_sample ()
 
void setup_data_store_buffers ()
 
void build_node_for_sending (const conduit::Node &node_in, conduit::Node &node_out)
 called by exchange_data More...
 
void exchange_sample_sizes ()
 for use when conduit Nodes have non-uniform size, e.g., imagenet More...
 
int build_indices_i_will_send (int current_pos, int mb_size)
 
int build_indices_i_will_recv (int current_pos, int mb_size)
 
void error_check_compacted_node (const conduit::Node &nd, int data_id)
 
void exchange_local_caches ()
 All ranks exchange their cached data. More...
 
void get_image_sizes (map_is_t &sizes, std::vector< std::vector< int >> &indices)
 
void allocate_shared_segment (map_is_t &sizes, std::vector< std::vector< int >> &indices)
 for use in local cache mode More...
 
void read_files (std::vector< char > &work, map_is_t &sizes, std::vector< int > &indices)
 for use in local cache mode More...
 
void compute_image_offsets (map_is_t &image_sizes, std::vector< std::vector< int >> &indices)
 fills in m_image_offsets for use in local cache mode More...
 
void exchange_images (std::vector< char > &work, map_is_t &image_sizes, std::vector< std::vector< int >> &indices)
 for use in local cache mode More...
 
void build_conduit_nodes (map_is_t &sizes)
 
void fillin_shared_images (char *images, size_t size, size_t offset)
 for use in local cache mode More...
 
void test_checkpoint (const std::string &)
 For testing during development. More...
 
void print_variables ()
 Called by test_checkpoint. More...
 
void print_partial_owner_map (int n)
 Called by test_checkpoint. More...
 
std::string get_conduit_dir () const
 
std::string get_cereal_fn () const
 
std::string get_metadata_fn () const
 
void make_dir_if_it_doesnt_exist (const std::string &dir)
 Creates the directory if it does not already exist. More...
 
void spill_conduit_node (const conduit::Node &node, int data_id)
 Writes conduit node to file. More...
 
void load_spilled_conduit_nodes ()
 Loads conduit nodes from file into m_data. More...
 
void setup_spill (std::string dir)
 Creates directory structure, opens metadata file for output, etc. More...
 
void save_state ()
 Saves this object's state to file. More...
 
void open_informational_files ()
 Optionally open debug and profiling files. More...
 
void open_next_conduit_spill_directory ()
 Creates a directory for spilling conduit nodes. More...
 
void profile_timing ()
 Write timing data for data exchange to the profile file, if it's opened. More...
 
void setup_checkpoint_test ()
 
std::string get_lassen_spill_dir ()
 
void verify_sample_size ()
 
void PROFILE () const
 
template<typename T , typename... Types>
void PROFILE (T var1, Types... var2) const
 
void DEBUG_DS ()
 
template<typename T , typename... Types>
void DEBUG_DS (T var1, Types... var2)
 

Private Attributes

bool m_bcast_sample_size = true
 
data_store_conduit * m_other = nullptr
 
bool m_owner_maps_were_exchanged = false
 
bool m_run_checkpoint_test = false
 
size_t m_my_num_indices = 0
 The number of samples that this processor owns. More...
 
bool m_spill = false
 if true, then we are spilling (offloading) samples to disk More...
 
bool m_is_spilled = false
 if true, then all samples have been spilled More...
 
std::ofstream m_metadata
 
std::string m_spill_dir_base
 Base directory for spilling (offloading) conduit nodes. More...
 
int m_cur_spill_dir_integer = -1
 Used to form the directory path for spilling conduit nodes. More...
 
std::string m_cur_spill_dir
 Current directory for spilling (writing to file) conduit nodes. More...
 
std::string m_test_dir
 The directory to use for testing checkpointing. More...
 
int m_num_files_in_cur_spill_dir
 Contains the number of conduit nodes that have been written to m_cur_spill_dir. More...
 
map_ii_t m_spilled_nodes
 maps data_id to m_cur_spill_dir_integer. More...
 
std::mutex m_mutex
 used in set_conduit_node(...) More...
 
std::mutex m_mutex_2
 
char * m_mem_seg = 0
 for use in local cache mode More...
 
size_t m_mem_seg_length = 0
 
std::string m_seg_name
 
const std::string m_debug_filename_base = "debug"
 
std::string m_debug_filename
 
const std::string m_profile_filename_base = "data_store_profile"
 
std::string m_profile_filename
 
bool m_was_loaded_from_file = false
 
const std::string m_cereal_fn = "data_store_cereal"
 
const int m_max_files_per_directory = 500
 
double m_exchange_sample_sizes_time = 0
 
double m_start_snd_rcv_time = 0
 
double m_wait_all_time = 0
 
double m_rebuild_time = 0
 
double m_exchange_time = 0
 
bool m_is_setup = false
 
bool m_loading_is_complete = false
 set to true if data_store is preloaded More...
 
bool m_preloading = false
 True, if we are in preload mode. More...
 
bool m_explicitly_loading = false
 True, if we are in explicit loading mode. More...
 
int m_owner_map_mb_size = 0
 
int m_compacted_sample_size = 0
 size of a compacted conduit::Node that contains a single sample More...
 
bool m_is_local_cache = false
 
bool m_node_sizes_vary = false
 
bool m_have_sample_sizes = false
 used in exchange_data_by_sample, when sample sizes are non-uniform More...
 
generic_data_reader * m_reader
 
lbann_comm * m_comm = nullptr
 
bool m_world_master
 convenience handles More...
 
bool m_trainer_master
 
int m_rank_in_trainer
 
int m_rank_in_world = -1
 
int m_partition_in_trainer
 
int m_offset_in_partition
 
int m_np_in_trainer
 number of procs in the trainer; convenience handle More...
 
int m_num_partitions_in_trainer
 
bool m_mini_batch_data_exchange_started = false
 Flag to indicate if a data exchange has started. More...
 
map_pssi_t m_owner
 Maps an index to the processor that owns the associated data. The first value of the index is the sample ID and the second value is the partition ID. More...
 
const std::vector< int > * m_shuffled_indices
 convenience handle More...
 
std::unordered_map< int, conduit::Node > m_data
 Contains the conduit nodes that are "owned" by this rank. More...
 
std::unordered_map< int, conduit::Node > m_data_cache
 Contains a cache of the conduit nodes that are "owned" by this rank. More...
 
std::vector< int > m_recv_data_ids
 Contains the list of data IDs that will be received. More...
 
map_ii_t m_recv_sample_sizes
 
std::unordered_map< int, conduit::Node > m_minibatch_data
 
std::vector< conduit::Node > m_send_buffer
 work space; used in exchange_data More...
 
std::vector< conduit::Node > m_send_buffer_2
 
std::vector< El::mpi::Request< El::byte > > m_send_requests
 
std::vector< El::mpi::Request< El::byte > > m_recv_requests
 
std::vector< conduit::Node > m_recv_buffer
 
std::vector< size_t > m_outgoing_msg_sizes
 
std::vector< size_t > m_incoming_msg_sizes
 
map_is_t m_sample_sizes
 Maps a data_id to its image size. More...
 
map_is_t m_image_offsets
 Maps a data_id to the image location in a shared memory segment. More...
 
std::vector< std::unordered_set< int > > m_indices_to_send
 
std::vector< std::unordered_set< int > > m_indices_to_recv
 

Detailed Description

Definition at line 61 of file data_store_conduit.hpp.

Member Typedef Documentation

◆ map_ii_t

using lbann::data_store_conduit::map_ii_t = std::unordered_map<int, int>

Definition at line 66 of file data_store_conduit.hpp.

◆ map_is_t

using lbann::data_store_conduit::map_is_t = std::unordered_map<int, size_t>

Definition at line 67 of file data_store_conduit.hpp.

◆ map_pssi_t

using lbann::data_store_conduit::map_pssi_t = std::unordered_map<std::pair<size_t, size_t>, int, size_t_pair_hash>

Definition at line 71 of file data_store_conduit.hpp.

◆ map_ss_t

using lbann::data_store_conduit::map_ss_t = std::unordered_map<size_t, size_t>

Definition at line 74 of file data_store_conduit.hpp.

Constructor & Destructor Documentation

◆ data_store_conduit() [1/3]

lbann::data_store_conduit::data_store_conduit ( generic_data_reader *  reader)

ctor

◆ data_store_conduit() [2/3]

lbann::data_store_conduit::data_store_conduit ( const data_store_conduit &  )

copy ctor

◆ data_store_conduit() [3/3]

lbann::data_store_conduit::data_store_conduit ( const data_store_conduit &  ,
const std::vector< int > &   
)

copy / split ctor

◆ ~data_store_conduit()

lbann::data_store_conduit::~data_store_conduit ( )

dtor

Member Function Documentation

◆ add_owner()

void lbann::data_store_conduit::add_owner ( int  data_id,
int  owner 
)
inline

Special handling for ras_lipid_conduit_data_reader; may go away in the future.

Definition at line 226 of file data_store_conduit.hpp.

◆ allocate_shared_segment()

void lbann::data_store_conduit::allocate_shared_segment ( map_is_t &  sizes,
std::vector< std::vector< int >> &  indices 
)
private

for use in local cache mode

◆ build_conduit_nodes()

void lbann::data_store_conduit::build_conduit_nodes ( map_is_t &  sizes)
private

◆ build_indices_i_will_recv()

int lbann::data_store_conduit::build_indices_i_will_recv ( int  current_pos,
int  mb_size 
)
private

fills in m_indices_to_recv and returns the number of samples that will be received

◆ build_indices_i_will_send()

int lbann::data_store_conduit::build_indices_i_will_send ( int  current_pos,
int  mb_size 
)
private

fills in m_indices_to_send and returns the number of samples that will be sent

◆ build_node_for_sending()

void lbann::data_store_conduit::build_node_for_sending ( const conduit::Node &  node_in,
conduit::Node &  node_out 
)
private

called by exchange_data

◆ build_preloaded_owner_map()

void lbann::data_store_conduit::build_preloaded_owner_map ( const std::vector< int > &  per_rank_list_sizes)

fills in m_owner, which maps index -> owning processor

◆ check_mem_capacity()

void lbann::data_store_conduit::check_mem_capacity ( lbann_comm *  comm,
const std::string  sample_list_file,
size_t  stride,
size_t  offset 
)

◆ check_query_flags()

void lbann::data_store_conduit::check_query_flags ( ) const

Check that explicit loading, preloading, and fully loaded flags are consistent.

◆ clear_owner_map()

void lbann::data_store_conduit::clear_owner_map ( )

Special handling for ras_lipid_conduit_data_reader; may go away in the future.

◆ compact_nodes()

void lbann::data_store_conduit::compact_nodes ( )

Recompact the nodes because they are not copied properly when instantiating using the copy constructor

◆ compute_image_offsets()

void lbann::data_store_conduit::compute_image_offsets ( map_is_t &  image_sizes,
std::vector< std::vector< int >> &  indices 
)
private

fills in m_image_offsets for use in local cache mode

◆ copy()

data_store_conduit* lbann::data_store_conduit::copy ( ) const
inline

Definition at line 88 of file data_store_conduit.hpp.

◆ copy_members()

void lbann::data_store_conduit::copy_members ( const data_store_conduit &  rhs)

made public for debugging during development

◆ DEBUG_DS() [1/2]

void lbann::data_store_conduit::DEBUG_DS ( )
inlineprivate

Definition at line 688 of file data_store_conduit.hpp.

◆ DEBUG_DS() [2/2]

template<typename T , typename... Types>
void lbann::data_store_conduit::DEBUG_DS ( T  var1,
Types...  var2 
)
inlineprivate

Definition at line 698 of file data_store_conduit.hpp.

◆ error_check_compacted_node()

void lbann::data_store_conduit::error_check_compacted_node ( const conduit::Node &  nd,
int  data_id 
)
private

◆ exchange_images()

void lbann::data_store_conduit::exchange_images ( std::vector< char > &  work,
map_is_t &  image_sizes,
std::vector< std::vector< int >> &  indices 
)
private

for use in local cache mode

◆ exchange_local_caches()

void lbann::data_store_conduit::exchange_local_caches ( )
private

All ranks exchange their cached data.

◆ exchange_owner_maps()

void lbann::data_store_conduit::exchange_owner_maps ( )

fills in m_owner, which maps index -> owning processor

◆ exchange_sample_sizes()

void lbann::data_store_conduit::exchange_sample_sizes ( )
private

for use when conduit Nodes have non-uniform size, e.g., imagenet

◆ fillin_shared_images()

void lbann::data_store_conduit::fillin_shared_images ( char *  images,
size_t  size,
size_t  offset 
)
private

for use in local cache mode

◆ finish_exchange_data_by_sample()

void lbann::data_store_conduit::finish_exchange_data_by_sample ( )
private

◆ finish_exchange_mini_batch_data()

void lbann::data_store_conduit::finish_exchange_mini_batch_data ( )

◆ flush_debug_file()

void lbann::data_store_conduit::flush_debug_file ( )

Closes then reopens the debug logging file.

Debug logging is enabled on all ranks via the cmd line flag: --data_store_debug

◆ flush_profile_file()

void lbann::data_store_conduit::flush_profile_file ( ) const

Closes then reopens the profile logging file.

Profile logging is enabled on P_0 via the cmd line flag: --data_store_profile

◆ get_cereal_fn()

std::string lbann::data_store_conduit::get_cereal_fn ( ) const
private

◆ get_conduit_dir()

std::string lbann::data_store_conduit::get_conduit_dir ( ) const
private

◆ get_conduit_node()

const conduit::Node& lbann::data_store_conduit::get_conduit_node ( int  data_id) const

Returns the conduit Node associated with the data_id.

◆ get_data_size()

int lbann::data_store_conduit::get_data_size ( )
inline

for use during development and debugging

Definition at line 264 of file data_store_conduit.hpp.

◆ get_empty_node()

conduit::Node& lbann::data_store_conduit::get_empty_node ( int  data_id)

returns an empty node

◆ get_image_sizes()

void lbann::data_store_conduit::get_image_sizes ( map_is_t &  sizes,
std::vector< std::vector< int >> &  indices 
)
private

Currently only used for imagenet. On return, 'sizes' maps a sample_id to image size, and indices[p] contains the sample_ids that P_p owns. For use in local cache mode.

◆ get_index_owner()

int lbann::data_store_conduit::get_index_owner ( int  idx)

returns the processor that owns the data associated with the index

◆ get_lassen_spill_dir()

std::string lbann::data_store_conduit::get_lassen_spill_dir ( )
private

◆ get_mem_usage()

size_t lbann::data_store_conduit::get_mem_usage ( )

◆ get_metadata_fn()

std::string lbann::data_store_conduit::get_metadata_fn ( ) const
private

◆ get_num_global_indices()

size_t lbann::data_store_conduit::get_num_global_indices ( ) const

Returns the number of samples summed over all ranks.

◆ get_random_node() [1/2]

const conduit::Node& lbann::data_store_conduit::get_random_node ( ) const

◆ get_random_node() [2/2]

const conduit::Node& lbann::data_store_conduit::get_random_node ( const std::string &  field) const

◆ has_conduit_node()

bool lbann::data_store_conduit::has_conduit_node ( int  data_id) const

◆ is_explicitly_loading()

bool lbann::data_store_conduit::is_explicitly_loading ( ) const
inline

Returns true if explicitly loading is turned on.

'explicitly loading' means that the data that will be owned by each rank is passed into the data store during the first epoch. This is in contrast to preloading, in which the data is passed into the data store prior to the first epoch. Explicit loading and preloading are mutually exclusive: at most one may be true; however, both will be set to false when all loading is complete.

Definition at line 150 of file data_store_conduit.hpp.

◆ is_fully_loaded()

bool lbann::data_store_conduit::is_fully_loaded ( ) const

Returns true if all loading has been completed.

See notes in: set_loading_is_complete()

◆ is_local_cache()

bool lbann::data_store_conduit::is_local_cache ( ) const
inline

Returns "true" if running in local cache mode.

In local cache mode, each node contains a complete copy of the data set. This is stored in a shared memory segment, but part of the set may be spilled to disk if memory is insufficient. Local cache mode is activated via the cmd line flag: --data_store_cache

Definition at line 166 of file data_store_conduit.hpp.

◆ is_preloading()

bool lbann::data_store_conduit::is_preloading ( ) const
inline

Returns true if preloading is turned on.

See notes in: is_explicitly_loading()

Definition at line 139 of file data_store_conduit.hpp.

◆ load_checkpoint()

void lbann::data_store_conduit::load_checkpoint ( std::string  dir_name,
generic_data_reader *  reader = nullptr 
)

Loads object's state from file.

◆ load_spilled_conduit_nodes()

void lbann::data_store_conduit::load_spilled_conduit_nodes ( )
private

Loads conduit nodes from file into m_data.

◆ make_dir_if_it_doesnt_exist()

void lbann::data_store_conduit::make_dir_if_it_doesnt_exist ( const std::string &  dir)
private

Creates the directory if it does not already exist.

◆ open_informational_files()

void lbann::data_store_conduit::open_informational_files ( )
private

Optionally open debug and profiling files.

A debug file is opened for every <rank, data reader role> pair; files are opened if the cmd flag --data_store_debug is passed. A profiling file is opened only by <world_master, data reader role> pairs; files are opened if the cmd flag --data_store_profile is passed.

◆ open_next_conduit_spill_directory()

void lbann::data_store_conduit::open_next_conduit_spill_directory ( )
private

Creates a directory for spilling conduit nodes.

◆ operator=()

data_store_conduit& lbann::data_store_conduit::operator= ( const data_store_conduit &  )

operator=

◆ preload_local_cache()

void lbann::data_store_conduit::preload_local_cache ( )

Read the data set into memory.

Each rank reads a portion of the data set, then bcasts to all other ranks.

◆ print_partial_owner_map()

void lbann::data_store_conduit::print_partial_owner_map ( int  n)
private

Called by test_checkpoint.

For testing and development. Prints the first 'n' entries from the owner map (which maps sample_id -> owning rank) to std::cout.

◆ print_variables()

void lbann::data_store_conduit::print_variables ( )
private

Called by test_checkpoint.

◆ PROFILE() [1/2]

void lbann::data_store_conduit::PROFILE ( ) const
inlineprivate

Definition at line 665 of file data_store_conduit.hpp.

◆ PROFILE() [2/2]

template<typename T , typename... Types>
void lbann::data_store_conduit::PROFILE ( T  var1,
Types...  var2 
) const
inlineprivate

Definition at line 675 of file data_store_conduit.hpp.

◆ profile_timing()

void lbann::data_store_conduit::profile_timing ( )
private

Write timing data for data exchange to the profile file, if it's opened.

◆ read_files()

void lbann::data_store_conduit::read_files ( std::vector< char > &  work,
map_is_t &  sizes,
std::vector< int > &  indices 
)
private

for use in local cache mode

◆ save_state()

void lbann::data_store_conduit::save_state ( )
private

Saves this object's state to file.

Here, "state" is all data, except for conduit nodes, that is needed to reload from checkpoint

◆ set_conduit_node()

void lbann::data_store_conduit::set_conduit_node ( int  data_id,
const conduit::Node &  node,
bool  already_have = false 
)

Set a conduit node in the data store.

if 'already_have = true' then the passed 'node' was obtained by a call to get_empty_node(); note, we do this to prevent copying the node

◆ set_data_reader_ptr()

void lbann::data_store_conduit::set_data_reader_ptr ( generic_data_reader *  reader)

◆ set_finished_building_map()

void lbann::data_store_conduit::set_finished_building_map ( )
inline

Special handling for ras_lipid_conduit_data_reader; may go away in the future.

Definition at line 233 of file data_store_conduit.hpp.

◆ set_is_explicitly_loading()

void lbann::data_store_conduit::set_is_explicitly_loading ( bool  flag)

Turn on explicit loading.

◆ set_is_local_cache()

void lbann::data_store_conduit::set_is_local_cache ( bool  flag = true)
inline

turns local cache mode on or off

Definition at line 188 of file data_store_conduit.hpp.

◆ set_is_preloading()

void lbann::data_store_conduit::set_is_preloading ( bool  flag)

Turn preloading on or off.

◆ set_loading_is_complete()

void lbann::data_store_conduit::set_loading_is_complete ( )

Marks the data_store as fully loaded.

Fully loaded means that each rank has all the data that it is intended to own. When not running in local cache mode, this occurs (1) at the conclusion of preloading, prior to the beginning of the first epoch, or (2) at the conclusion of the first epoch, if explicitly loading. When running in local cache mode, this occurs (1) at the conclusion of preload_local_cache(), which is called prior to the first epoch, or (2) at the conclusion of exchange_local_caches(), at the conclusion of the first epoch, if explicitly loading.

◆ set_node_sizes_vary()

void lbann::data_store_conduit::set_node_sizes_vary ( )
inline

Definition at line 253 of file data_store_conduit.hpp.

◆ set_owner_map()

void lbann::data_store_conduit::set_owner_map ( const std::unordered_map< int, int > &  m)
inline

Definition at line 217 of file data_store_conduit.hpp.

◆ set_preloaded_conduit_node()

void lbann::data_store_conduit::set_preloaded_conduit_node ( int  data_id,
const conduit::Node &  node 
)

◆ set_preloaded_owner_map()

void lbann::data_store_conduit::set_preloaded_owner_map ( const std::unordered_map< int, int > &  owner)
inline

fills in m_owner, which maps index -> owning processor

Definition at line 206 of file data_store_conduit.hpp.

◆ set_profile_msg()

void lbann::data_store_conduit::set_profile_msg ( std::string  )

Add text to the profiling file, if it's opened.

◆ set_shuffled_indices()

void lbann::data_store_conduit::set_shuffled_indices ( const std::vector< int > *  indices)

convenience handle

◆ setup()

void lbann::data_store_conduit::setup ( int  mini_batch_size)

◆ setup_checkpoint_test()

void lbann::data_store_conduit::setup_checkpoint_test ( )
private

◆ setup_data_store_buffers()

void lbann::data_store_conduit::setup_data_store_buffers ( )
private

◆ setup_spill()

void lbann::data_store_conduit::setup_spill ( std::string  dir)
private

Creates directory structure, opens metadata file for output, etc.

This method is called for both --data_store_spill and --data_store_test_checkpoint

◆ spill_conduit_node()

void lbann::data_store_conduit::spill_conduit_node ( const conduit::Node &  node,
int  data_id 
)
private

Writes conduit node to file.

◆ spill_preloaded_conduit_node()

void lbann::data_store_conduit::spill_preloaded_conduit_node ( int  data_id,
const conduit::Node &  node 
)

◆ start_exchange_data_by_sample()

void lbann::data_store_conduit::start_exchange_data_by_sample ( size_t  current_pos,
size_t  mb_size 
)
private

◆ start_exchange_mini_batch_data()

void lbann::data_store_conduit::start_exchange_mini_batch_data ( size_t  current_pos,
size_t  mb_size 
)

◆ test_checkpoint()

void lbann::data_store_conduit::test_checkpoint ( const std::string &  )
private

For testing during development.

At the beginning of the 2nd epoch, calls write_checkpoint(), clears some variables, calls load_checkpoint(), then continues. To activate this test use cmd flag: --data_store_test_checkpoint=

◆ test_imagenet_node()

void lbann::data_store_conduit::test_imagenet_node ( int  sample_id,
bool  dereference = true 
)

◆ test_local_cache_imagenet()

bool lbann::data_store_conduit::test_local_cache_imagenet ( int  n)

Runs an internal test to ensure the locally cached conduit data is correct.

For use during development and testing. This test is activated via the cmd line flag: --data_store_test_cache. Output may be written to cout, and the profile and debug files (if they are opened).

Parameters
n  is the maximum number of samples to test; set to -1 to test all
Returns
true, if all samples read from file match those constructed from the local shared memory segment (aka, cache)

◆ verify_sample_size()

void lbann::data_store_conduit::verify_sample_size ( )
private

◆ write_checkpoint()

void lbann::data_store_conduit::write_checkpoint ( std::string  dir_name)

Writes object's state to file.

Member Data Documentation

◆ m_bcast_sample_size

bool lbann::data_store_conduit::m_bcast_sample_size = true
private

Definition at line 314 of file data_store_conduit.hpp.

◆ m_cereal_fn

const std::string lbann::data_store_conduit::m_cereal_fn = "data_store_cereal"
private

Definition at line 386 of file data_store_conduit.hpp.

◆ m_comm

lbann_comm* lbann::data_store_conduit::m_comm = nullptr
private

Definition at line 456 of file data_store_conduit.hpp.

◆ m_compacted_sample_size

int lbann::data_store_conduit::m_compacted_sample_size = 0
private

size of a compacted conduit::Node that contains a single sample

Definition at line 445 of file data_store_conduit.hpp.

◆ m_cur_spill_dir

std::string lbann::data_store_conduit::m_cur_spill_dir
private

Current directory for spilling (writing to file) conduit nodes.

m_cur_spill_dir = m_spill_dir_base/m_cur_spill_dir_integer

Definition at line 348 of file data_store_conduit.hpp.

◆ m_cur_spill_dir_integer

int lbann::data_store_conduit::m_cur_spill_dir_integer = -1
private

Used to form the directory path for spilling conduit nodes.

Definition at line 341 of file data_store_conduit.hpp.

◆ m_data

std::unordered_map<int, conduit::Node> lbann::data_store_conduit::m_data
mutableprivate

Contains the conduit nodes that are "owned" by this rank.

Map data_id -> conduit::Node. Must be mutable since rhs.m_owner may be modified in copy_members, in which rhs is const.

Definition at line 490 of file data_store_conduit.hpp.

◆ m_data_cache

std::unordered_map<int, conduit::Node> lbann::data_store_conduit::m_data_cache
private

Contains a cache of the conduit nodes that are "owned" by this rank.

This differs from m_data in that it holds the nodes only temporarily, during the first epoch, if we're running in local cache mode and explicitly loading

Definition at line 499 of file data_store_conduit.hpp.

◆ m_debug

std::ofstream* lbann::data_store_conduit::m_debug = nullptr

only used for debugging; pass --debug on cmd line to get each data store to print to a different file. This is made public so data readers can also print to the file

Definition at line 260 of file data_store_conduit.hpp.

◆ m_debug_filename

std::string lbann::data_store_conduit::m_debug_filename
private

Definition at line 380 of file data_store_conduit.hpp.

◆ m_debug_filename_base

const std::string lbann::data_store_conduit::m_debug_filename_base = "debug"
private

Definition at line 379 of file data_store_conduit.hpp.

◆ m_exchange_sample_sizes_time

double lbann::data_store_conduit::m_exchange_sample_sizes_time = 0
private

Definition at line 398 of file data_store_conduit.hpp.

◆ m_exchange_time

double lbann::data_store_conduit::m_exchange_time = 0
private

Definition at line 410 of file data_store_conduit.hpp.

◆ m_explicitly_loading

bool lbann::data_store_conduit::m_explicitly_loading = false
private

True, if we are in explicit loading mode.

There is some redundancy here: m_preloading and m_explicitly_loading can not both be true, but both may be false. When m_loading_is_complete is true, both m_preloading and m_explicitly_loading should be false.

Definition at line 436 of file data_store_conduit.hpp.

◆ m_have_sample_sizes

bool lbann::data_store_conduit::m_have_sample_sizes = false
private

used in exchange_data_by_sample, when sample sizes are non-uniform

Definition at line 452 of file data_store_conduit.hpp.

◆ m_image_offsets

map_is_t lbann::data_store_conduit::m_image_offsets
private

Maps a data_id to the image location in a shared memory segment.

Definition at line 526 of file data_store_conduit.hpp.

◆ m_incoming_msg_sizes

std::vector<size_t> lbann::data_store_conduit::m_incoming_msg_sizes
private

Definition at line 516 of file data_store_conduit.hpp.

◆ m_indices_to_recv

std::vector<std::unordered_set<int> > lbann::data_store_conduit::m_indices_to_recv
private

Maps processor id -> set of indices whose associated samples this proc needs to recv from others (formerly called "needed").

Definition at line 535 of file data_store_conduit.hpp.

◆ m_indices_to_send

std::vector<std::unordered_set<int> > lbann::data_store_conduit::m_indices_to_send
private

Maps processor id -> set of indices whose associated samples this proc needs to send (formerly called "proc_to_indices"); this is filled in by build_indices_i_will_send().

Definition at line 531 of file data_store_conduit.hpp.

◆ m_is_local_cache

bool lbann::data_store_conduit::m_is_local_cache = false
private

Definition at line 447 of file data_store_conduit.hpp.

◆ m_is_setup

bool lbann::data_store_conduit::m_is_setup = false
private

Definition at line 422 of file data_store_conduit.hpp.

◆ m_is_spilled

bool lbann::data_store_conduit::m_is_spilled = false
private

if true, then all samples have been spilled

Definition at line 332 of file data_store_conduit.hpp.

◆ m_loading_is_complete

bool lbann::data_store_conduit::m_loading_is_complete = false
private

set to true if data_store is preloaded

Definition at line 425 of file data_store_conduit.hpp.

◆ m_max_files_per_directory

const int lbann::data_store_conduit::m_max_files_per_directory = 500
private

Used in spill_to_file (actually, conduit::Node.save() writes both a json file and a binary file, so double this number).

Definition at line 391 of file data_store_conduit.hpp.

◆ m_mem_seg

char* lbann::data_store_conduit::m_mem_seg = 0
private

for use in local cache mode

Definition at line 375 of file data_store_conduit.hpp.

◆ m_mem_seg_length

size_t lbann::data_store_conduit::m_mem_seg_length = 0
private

Definition at line 376 of file data_store_conduit.hpp.

◆ m_metadata

std::ofstream lbann::data_store_conduit::m_metadata
private

During spilling, the conduit file pathnames are written to this file

Definition at line 335 of file data_store_conduit.hpp.

◆ m_mini_batch_data_exchange_started

bool lbann::data_store_conduit::m_mini_batch_data_exchange_started = false
private

Flag to indicate if a data exchange has started.

Definition at line 471 of file data_store_conduit.hpp.

◆ m_minibatch_data

std::unordered_map<int, conduit::Node> lbann::data_store_conduit::m_minibatch_data
private

This map contains the Nodes that this processor needs for the current minibatch; this is filled in by exchange_data()

Definition at line 507 of file data_store_conduit.hpp.

◆ m_mutex

std::mutex lbann::data_store_conduit::m_mutex
mutable private

used in set_conduit_node(...)

Definition at line 370 of file data_store_conduit.hpp.

◆ m_mutex_2

std::mutex lbann::data_store_conduit::m_mutex_2
private

Definition at line 372 of file data_store_conduit.hpp.

◆ m_my_num_indices

size_t lbann::data_store_conduit::m_my_num_indices = 0
private

The number of samples that this processor owns.

Definition at line 326 of file data_store_conduit.hpp.

◆ m_node_sizes_vary

bool lbann::data_store_conduit::m_node_sizes_vary = false
private

Definition at line 449 of file data_store_conduit.hpp.

◆ m_np_in_trainer

int lbann::data_store_conduit::m_np_in_trainer
private

number of procs in the trainer; convenience handle

Definition at line 467 of file data_store_conduit.hpp.

◆ m_num_files_in_cur_spill_dir

int lbann::data_store_conduit::m_num_files_in_cur_spill_dir
private

Contains the number of conduit nodes that have been written to m_cur_dir.

When m_num_files_in_cur_spill_dir == m_max_files_per_directory, m_cur_spill_dir_integer is incremented and a new m_cur_dir is created

Definition at line 363 of file data_store_conduit.hpp.

◆ m_num_partitions_in_trainer

int lbann::data_store_conduit::m_num_partitions_in_trainer
private

Definition at line 468 of file data_store_conduit.hpp.

◆ m_offset_in_partition

int lbann::data_store_conduit::m_offset_in_partition
private

Definition at line 464 of file data_store_conduit.hpp.

◆ m_other

data_store_conduit* lbann::data_store_conduit::m_other = nullptr
private

Definition at line 319 of file data_store_conduit.hpp.

◆ m_outgoing_msg_sizes

std::vector<size_t> lbann::data_store_conduit::m_outgoing_msg_sizes
private

Definition at line 515 of file data_store_conduit.hpp.

◆ m_owner

map_pssi_t lbann::data_store_conduit::m_owner
mutable private

Maps an index to the processor that owns the associated data. The first value of the index is the sample ID and the second value is the partition ID.

Must be mutable since rhs.m_owner may be modified in copy_members, in which rhs is const.

Definition at line 479 of file data_store_conduit.hpp.

◆ m_owner_map_mb_size

int lbann::data_store_conduit::m_owner_map_mb_size = 0
private

The size of the mini-batch that was used to calculate ownership of samples when building the owner map. This size has to be used consistently when computing the indices that will be sent and received.

Definition at line 442 of file data_store_conduit.hpp.

◆ m_owner_maps_were_exchanged

bool lbann::data_store_conduit::m_owner_maps_were_exchanged = false
private

Definition at line 321 of file data_store_conduit.hpp.

◆ m_partition_in_trainer

int lbann::data_store_conduit::m_partition_in_trainer
private

Definition at line 463 of file data_store_conduit.hpp.

◆ m_preloading

bool lbann::data_store_conduit::m_preloading = false
private

True, if we are in preload mode.

Definition at line 428 of file data_store_conduit.hpp.

◆ m_profile

std::ofstream* lbann::data_store_conduit::m_profile = nullptr

Definition at line 261 of file data_store_conduit.hpp.

◆ m_profile_filename

std::string lbann::data_store_conduit::m_profile_filename
private

Definition at line 383 of file data_store_conduit.hpp.

◆ m_profile_filename_base

const std::string lbann::data_store_conduit::m_profile_filename_base = "data_store_profile"
private

Definition at line 382 of file data_store_conduit.hpp.

◆ m_rank_in_trainer

int lbann::data_store_conduit::m_rank_in_trainer
private

Definition at line 461 of file data_store_conduit.hpp.

◆ m_rank_in_world

int lbann::data_store_conduit::m_rank_in_world = -1
private

Definition at line 462 of file data_store_conduit.hpp.

◆ m_reader

generic_data_reader* lbann::data_store_conduit::m_reader
private

Definition at line 454 of file data_store_conduit.hpp.

◆ m_rebuild_time

double lbann::data_store_conduit::m_rebuild_time = 0
private

Definition at line 407 of file data_store_conduit.hpp.

◆ m_recv_buffer

std::vector<conduit::Node> lbann::data_store_conduit::m_recv_buffer
private

Definition at line 514 of file data_store_conduit.hpp.

◆ m_recv_data_ids

std::vector<int> lbann::data_store_conduit::m_recv_data_ids
private

Contains the list of data IDs that will be received.

Definition at line 502 of file data_store_conduit.hpp.

◆ m_recv_requests

std::vector<El::mpi::Request<El::byte> > lbann::data_store_conduit::m_recv_requests
private

Definition at line 513 of file data_store_conduit.hpp.

◆ m_recv_sample_sizes

map_ii_t lbann::data_store_conduit::m_recv_sample_sizes
private

Definition at line 503 of file data_store_conduit.hpp.

◆ m_run_checkpoint_test

bool lbann::data_store_conduit::m_run_checkpoint_test = false
private

Definition at line 323 of file data_store_conduit.hpp.

◆ m_sample_sizes

map_is_t lbann::data_store_conduit::m_sample_sizes
private

Maps a data_id to its image size.

Used when conduit Nodes have non-uniform size, e.g., imagenet; see: set_node_sizes_vary()

Definition at line 523 of file data_store_conduit.hpp.

◆ m_seg_name

std::string lbann::data_store_conduit::m_seg_name
private

Definition at line 377 of file data_store_conduit.hpp.

◆ m_send_buffer

std::vector<conduit::Node> lbann::data_store_conduit::m_send_buffer
private

work space; used in exchange_data

Definition at line 510 of file data_store_conduit.hpp.

◆ m_send_buffer_2

std::vector<conduit::Node> lbann::data_store_conduit::m_send_buffer_2
private

Definition at line 511 of file data_store_conduit.hpp.

◆ m_send_requests

std::vector<El::mpi::Request<El::byte> > lbann::data_store_conduit::m_send_requests
private

Definition at line 512 of file data_store_conduit.hpp.

◆ m_shuffled_indices

const std::vector<int>* lbann::data_store_conduit::m_shuffled_indices
private

convenience handle

Definition at line 482 of file data_store_conduit.hpp.

◆ m_spill

bool lbann::data_store_conduit::m_spill = false
private

if true, then we are spilling (offloading) samples to disk

Definition at line 329 of file data_store_conduit.hpp.

◆ m_spill_dir_base

std::string lbann::data_store_conduit::m_spill_dir_base
private

Base directory for spilling (offloading) conduit nodes.

Definition at line 338 of file data_store_conduit.hpp.

◆ m_spilled_nodes

map_ii_t lbann::data_store_conduit::m_spilled_nodes
private

Maps data_id to m_cur_spill_dir_integer.

Definition at line 366 of file data_store_conduit.hpp.

◆ m_start_snd_rcv_time

double lbann::data_store_conduit::m_start_snd_rcv_time = 0
private

Definition at line 401 of file data_store_conduit.hpp.

◆ m_test_dir

std::string lbann::data_store_conduit::m_test_dir
private

The directory to use for testing checkpointing.

Testing is activated by passing the cmd flag: --data_store_test_checkpoint=<dir>

Definition at line 355 of file data_store_conduit.hpp.

◆ m_trainer_master

bool lbann::data_store_conduit::m_trainer_master
private

Definition at line 460 of file data_store_conduit.hpp.

◆ m_wait_all_time

double lbann::data_store_conduit::m_wait_all_time = 0
private

Definition at line 404 of file data_store_conduit.hpp.

◆ m_was_loaded_from_file

bool lbann::data_store_conduit::m_was_loaded_from_file = false
private

Definition at line 385 of file data_store_conduit.hpp.

◆ m_world_master

bool lbann::data_store_conduit::m_world_master
private

convenience handles

Definition at line 459 of file data_store_conduit.hpp.


The documentation for this class was generated from the following file: