28 #ifndef LBANN_DATA_READER_SMILES_HPP 29 #define LBANN_DATA_READER_SMILES_HPP 49 using offset_t = std::pair<long long, unsigned short>;
/// Returns the type-name string of this reader, the literal "smiles_data_reader".
/// Declared `const override`; the base-class declaration being overridden is
/// not visible in this excerpt.
62 std::string
get_type()
const override {
return "smiles_data_reader"; }
95 std::string& filename_out,
97 unsigned short& length_out)
const;
110 std::vector<unsigned short>&
data);
115 std::vector<unsigned short>& data);
// Declaration only; the definition is not part of this excerpt.
// Takes `data` by const reference (read-only) and `out` by non-const
// reference, so `out` is presumably the result string.
// NOTE(review): presumably the inverse of encode_smiles, turning encoded
// values back into text — confirm against the definition.
119 void decode_smiles(
const std::vector<unsigned short>& data, std::string& out);
130 get_raw_sample(std::istream* istrm,
size_t index,
size_t buf_offset = 0);
// Declaration only; the definition is not part of this excerpt.
// The (offset, length) parameter types match offset_t
// (std::pair<long long, unsigned short>) declared earlier in this header.
// NOTE(review): presumably records the offset/length pair for sample `index`
// in the per-sample offset map — confirm against the definition.
135 void set_offset(
size_t index,
long long offset,
unsigned short length);
163 : index(idx), offset(off), length(len)
237 std::istream* istream,
239 size_t buf_offset = 0);
244 std::vector<unsigned short>& output,
245 size_t buf_offset = 0);
251 std::vector<std::string>& data_filenames,
252 std::vector<std::string>& offsets_filenames);
// True when `c` is whitespace (per isspace), a newline, a tab, or a comma.
// NOTE(review): assuming this is the <cctype>/<ctype.h> isspace, passing a
// plain `char` that holds a negative value is undefined behavior; the usual
// fix is isspace(static_cast<unsigned char>(c)). Confirm the type of `c`.
// NOTE(review): in the default "C" locale isspace already accepts '\n' and
// '\t', so those two explicit comparisons look redundant — confirm before
// removing them.
256 return (isspace(c) || c ==
'\n' || c ==
'\t' || c ==
',');
262 #endif // LBANN_DATA_READER_SMILES_HPP
bool encode_smiles(const char *smiles, unsigned short size, std::vector< unsigned short > &data)
void load_offsets_and_lengths()
void decode_smiles(const std::vector< unsigned short > &data, std::string &out)
std::unordered_map< std::string, std::set< size_t > > m_filename_to_local_id_set
std::unordered_map< std::string, std::map< size_t, size_t > > m_local_to_index
size_t m_missing_char_in_vocab_count
const std::vector< El::Int > get_data_dims() const override
Get the dimensions of the data.
void set_linearized_data_size(size_t s)
std::unordered_map< size_t, size_t > m_index_to_local_id
int get_linearized_response_size() const override
Get the linearized size (i.e. number of elements) in a response.
~smiles_data_reader() override
std::string m_metadata_filename
int get_sequence_length()
void set_offset(size_t index, long long offset, unsigned short length)
T & data(const cnpy::NpyArray &na, const std::vector< size_t > indices)
std::unordered_map< size_t, std::string > m_index_to_filename
void read_offset_data(std::vector< SampleData > &data)
std::unordered_set< char > m_missing_chars
std::set< int > get_my_indices() const
void get_sample_origin(const size_t index_in, std::string &filename_out, size_t &offset_out, unsigned short &length_out) const
int get_linearized_data_size() const override
Get the linearized size (i.e. number of elements) in a sample.
std::unordered_map< char, short > m_vocab
smiles_data_reader & operator=(const smiles_data_reader &)
int m_linearized_response_size
int get_num_labels() const override
Return the number of labels (classes) in this dataset.
std::string get_type() const override
std::pair< long long, unsigned short > offset_t
const size_t OffsetBinarySize
int get_linearized_label_size() const override
Get the linearized size (i.e. number of elements) in a label.
bool fetch_label(CPUMat &Y, int data_id, int mb_idx) override
El::Matrix< DataType, El::Device::CPU > CPUMat
execution_mode
Neural network execution mode.
int m_linearized_data_size
void load_list_of_samples(const std::string sample_list_file)
void set_sequence_length(int n)
void print_statistics() const
void set_metadata_filename(std::string fn)
Sets the name of the metadata file.
void construct_conduit_node(conduit::Node &node, std::istream *istream, size_t sample_id, size_t buf_offset=0)
void use_unused_index_set(execution_mode m) override
bool fetch_datum(CPUMat &X, int data_id, int mb_idx) override
const std::string & get_metadata_filename()
Returns the name of the metadata file.
std::unordered_map< short, std::string > m_vocab_inv
void load_sample(std::istream *istrm, size_t index, std::vector< unsigned short > &output, size_t buf_offset=0)
void load_vocab(std::string filename)
std::unordered_map< size_t, offset_t > offset_map_t
bool is_delimiter(const char c)
SampleData(int idx, long long off, unsigned short len)
bool fetch_response(CPUMat &Y, int data_id, int mb_idx) override
void read_metadata_file(std::vector< size_t > &samples_per_file, std::vector< std::string > &data_filenames, std::vector< std::string > &offsets_filenames)
smiles_data_reader(const bool shuffle)
const size_t LengthBinarySize
smiles_data_reader * copy() const override
const std::streamsize OffsetAndLengthBinarySize
void do_preload_data_store() override
int m_linearized_label_size
std::string get_raw_sample(std::istream *istrm, size_t index, size_t buf_offset=0)
void copy_members(const smiles_data_reader &rhs)
Contains common code for operator= and copy ctor.
offset_map_t m_sample_offsets