LBANN  0.103.0
Livermore Big Artificial Neural Network Toolkit
data_reader_smiles.hpp
Go to the documentation of this file.
1 // Copyright (c) 2014-2023, Lawrence Livermore National Security, LLC.
3 // Produced at the Lawrence Livermore National Laboratory.
4 // Written by the LBANN Research Team (B. Van Essen, et al.) listed in
5 // the CONTRIBUTORS file. <lbann-dev@llnl.gov>
6 //
7 // LLNL-CODE-697807.
8 // All rights reserved.
9 //
10 // This file is part of LBANN: Livermore Big Artificial Neural Network
11 // Toolkit. For details, see http://software.llnl.gov/LBANN or
12 // https://github.com/LLNL/LBANN.
13 //
14 // Licensed under the Apache License, Version 2.0 (the "License"); you
15 // may not use this file except in compliance with the License. You may
16 // obtain a copy of the License at:
17 //
18 // http://www.apache.org/licenses/LICENSE-2.0
19 //
20 // Unless required by applicable law or agreed to in writing, software
21 // distributed under the License is distributed on an "AS IS" BASIS,
22 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
23 // implied. See the License for the specific language governing
24 // permissions and limitations under the license.
25 //
27 
28 #ifndef LBANN_DATA_READER_SMILES_HPP
29 #define LBANN_DATA_READER_SMILES_HPP
30 
33 
34 namespace lbann {
45  : public data_reader_sample_list<sample_list_ifstream<long long>>
46 {
47 public:
48  // Types for mapping a sample id to an <offset,length> locator
49  using offset_t = std::pair<long long, unsigned short>;
50  using offset_map_t = std::unordered_map<size_t, offset_t>;
51 
52  smiles_data_reader(const bool shuffle);
55  ~smiles_data_reader() override;
56 
// Polymorphic copy: returns a pointer to a new smiles_data_reader that is
// copy-constructed from *this and allocated with `new`.
// NOTE(review): nothing in this block releases the object, so presumably the
// caller takes ownership of the returned pointer -- confirm against callers.
57  smiles_data_reader* copy() const override
58  {
59  return new smiles_data_reader(*this);
60  }
61 
// Returns the fixed type name of this reader: "smiles_data_reader".
62  std::string get_type() const override { return "smiles_data_reader"; }
63 
64  void load() override;
65 
// Get the linearized size (i.e. number of elements) in a sample.
// NOTE(review): listing line 68 (the function body) is missing from this
// extract; see the original header for the returned expression.
66  int get_linearized_data_size() const override
67  {
69  }
// Get the linearized size (i.e. number of elements) in a label.
// NOTE(review): listing line 72 (the function body) is missing from this
// extract; see the original header for the returned expression.
70  int get_linearized_label_size() const override
71  {
73  }
// Get the linearized size (i.e. number of elements) in a response.
// NOTE(review): listing line 76 (the function body) is missing from this
// extract; see the original header for the returned expression.
74  int get_linearized_response_size() const override
75  {
77  }
// Get the dimensions of the data.
// Returns a vector holding a single entry: the value of
// get_linearized_data_size().
78  const std::vector<El::Int> get_data_dims() const override
79  {
80  return {get_linearized_data_size()};
81  }
// Return the number of labels (classes) in this dataset, as stored in
// m_num_labels.
82  int get_num_labels() const override { return m_num_labels; }
83 
// Sets the linearized data size to n + 2. Per the note in the private
// section of this class, the +2 accounts for the <bos> and <eos> characters
// that get tacked on to each sequence.
// NOTE(review): listing line 86 is missing from this extract, so this block
// may perform an additional assignment that is not visible here.
84  void set_sequence_length(int n)
85  {
87  m_linearized_data_size = n + 2;
88  }
90 
91  void use_unused_index_set(execution_mode m) override;
92 
94  void get_sample_origin(const size_t index_in,
95  std::string& filename_out,
96  size_t& offset_out,
97  unsigned short& length_out) const;
98 
103  std::set<int> get_my_indices() const;
104 
108  bool encode_smiles(const char* smiles,
109  unsigned short size,
110  std::vector<unsigned short>& data);
114  bool encode_smiles(const std::string& smiles,
115  std::vector<unsigned short>& data);
119  void decode_smiles(const std::vector<unsigned short>& data, std::string& out);
120 
122  void load_vocab(std::string filename);
124  void load_vocab(std::stringstream& s);
128  // reads and returns the smiles string from the input stream
129  std::string
130  get_raw_sample(std::istream* istrm, size_t index, size_t buf_offset = 0);
135  void set_offset(size_t index, long long offset, unsigned short length);
136 
138  void load_list_of_samples(const std::string sample_list_file);
139 
// Sets the name of the metadata file.
// fn is taken by value and moved into m_metadata_filename.
141  void set_metadata_filename(std::string fn)
142  {
143  m_metadata_filename = std::move(fn);
144  }
145 
// Returns the name of the metadata file as a const reference to
// m_metadata_filename.
// NOTE(review): this getter is not const-qualified, so it cannot be called
// on a const reader.
147  const std::string& get_metadata_filename() { return m_metadata_filename; }
148 
149 private:
150  // note: linearized_size is m_sequence_length+2; the +2 is for the
151  // <bos> and <eos> characters that get tacked on
153 
// Sizes, in bytes, of a `long long` and an `unsigned short`, and their sum.
// These are the same types used for SampleData::offset and
// SampleData::length.
// NOTE(review): presumably these describe one (offset, length) record in the
// binary offsets file -- confirm in the implementation file.
154  const size_t OffsetBinarySize = sizeof(long long);
155  const size_t LengthBinarySize = sizeof(unsigned short);
156  const std::streamsize OffsetAndLengthBinarySize =
157  OffsetBinarySize + LengthBinarySize;
158 
// Plain record that groups a sample index with the (offset, length) pair
// stored for it.
// NOTE(review): listing line 161 is missing from this extract, so a member
// (possibly another constructor) may be absent here.
// NOTE(review): the constructor takes idx as int but stores it in a size_t
// member, so the value is converted on initialization.
159  struct SampleData
160  {
162  SampleData(int idx, long long off, unsigned short len)
163  : index(idx), offset(off), length(len)
164  {}
165  size_t index;
166  long long offset;
167  unsigned short length;
168  };
169 
173  int m_num_labels = 0;
174 
175  // these may be changed when the vocab file is read
176  short m_pad = 420;
177  short m_unk = 421;
178  short m_bos = 422;
179  short m_eos = 423;
180 
181  std::string m_metadata_filename;
182 
183  std::unordered_map<char, short> m_vocab;
184  std::unordered_map<short, std::string> m_vocab_inv;
185 
186  std::mutex m_mutex;
187 
189  std::unordered_set<char> m_missing_chars;
190 
191  // maps: sample id -> offset within a file
193 
201  std::unordered_map<size_t, size_t> m_index_to_local_id;
202 
204  std::unordered_map<std::string, std::map<size_t, size_t>> m_local_to_index;
205 
207  std::unordered_map<std::string, std::set<size_t>> m_filename_to_local_id_set;
208 
210  std::unordered_map<size_t, std::string> m_index_to_filename;
211 
212  //=====================================================================
213  // private methods follow
214  //=====================================================================
215 
217  void copy_members(const smiles_data_reader& rhs);
218 
219  void do_preload_data_store() override;
220 
221  bool fetch_datum(CPUMat& X, int data_id, int mb_idx) override;
222  bool fetch_label(CPUMat& Y, int data_id, int mb_idx) override;
223  bool fetch_response(CPUMat& Y, int data_id, int mb_idx) override;
224 
225  void print_statistics() const;
226 
227  // load "offset" and "length" for samples from a binary file;
228  // the (offset, length) specify the location of a sample within
229  // the data file
231 
232  // called by load_offsets_and_lengths
233  void read_offset_data(std::vector<SampleData>& data);
234 
235  // calls load_sample
236  void construct_conduit_node(conduit::Node& node,
237  std::istream* istream,
238  size_t sample_id,
239  size_t buf_offset = 0);
240 
241  // calls get_raw_sample; returns in 'output' an encoded version of the sample
242  void load_sample(std::istream* istrm,
243  size_t index,
244  std::vector<unsigned short>& output,
245  size_t buf_offset = 0);
246 
247  void build_some_maps();
248 
249  // called by read_offset_data()
250  void read_metadata_file(std::vector<size_t>& samples_per_file,
251  std::vector<std::string>& data_filenames,
252  std::vector<std::string>& offsets_filenames);
253 
/// Returns true when @p c is a delimiter: any character that isspace()
/// classifies as whitespace (this includes '\n' and '\t'), or a comma.
bool is_delimiter(const char c)
{
  // The <cctype> classifiers have undefined behavior when given a negative
  // value other than EOF. A plain char may be signed, so bytes >= 0x80 must
  // be converted to unsigned char before the call.
  const bool whitespace = isspace(static_cast<unsigned char>(c)) != 0;
  return whitespace || c == ',';
}
258 };
259 
260 } // namespace lbann
261 
262 #endif // LBANN_DATA_READER_SMILES_HPP
bool encode_smiles(const char *smiles, unsigned short size, std::vector< unsigned short > &data)
void decode_smiles(const std::vector< unsigned short > &data, std::string &out)
std::unordered_map< std::string, std::set< size_t > > m_filename_to_local_id_set
std::unordered_map< std::string, std::map< size_t, size_t > > m_local_to_index
const std::vector< El::Int > get_data_dims() const override
Get the dimensions of the data.
std::unordered_map< size_t, size_t > m_index_to_local_id
int get_linearized_response_size() const override
Get the linearized size (i.e. number of elements) in a response.
void set_offset(size_t index, long long offset, unsigned short length)
T & data(const cnpy::NpyArray &na, const std::vector< size_t > indices)
Definition: cnpy_utils.hpp:75
std::unordered_map< size_t, std::string > m_index_to_filename
void read_offset_data(std::vector< SampleData > &data)
std::unordered_set< char > m_missing_chars
std::set< int > get_my_indices() const
void get_sample_origin(const size_t index_in, std::string &filename_out, size_t &offset_out, unsigned short &length_out) const
int get_linearized_data_size() const override
Get the linearized size (i.e. number of elements) in a sample.
std::unordered_map< char, short > m_vocab
smiles_data_reader & operator=(const smiles_data_reader &)
int get_num_labels() const override
Return the number of labels (classes) in this dataset.
std::string get_type() const override
std::pair< long long, unsigned short > offset_t
int get_linearized_label_size() const override
Get the linearized size (i.e. number of elements) in a label.
bool fetch_label(CPUMat &Y, int data_id, int mb_idx) override
El::Matrix< DataType, El::Device::CPU > CPUMat
Definition: base.hpp:116
execution_mode
Neural network execution mode.
Definition: base.hpp:229
void load_list_of_samples(const std::string sample_list_file)
void print_statistics() const
void set_metadata_filename(std::string fn)
Sets the name of the metadata file.
void construct_conduit_node(conduit::Node &node, std::istream *istream, size_t sample_id, size_t buf_offset=0)
void use_unused_index_set(execution_mode m) override
bool fetch_datum(CPUMat &X, int data_id, int mb_idx) override
const std::string & get_metadata_filename()
Returns the name of the metadata file.
std::unordered_map< short, std::string > m_vocab_inv
void load_sample(std::istream *istrm, size_t index, std::vector< unsigned short > &output, size_t buf_offset=0)
void load_vocab(std::string filename)
std::unordered_map< size_t, offset_t > offset_map_t
SampleData(int idx, long long off, unsigned short len)
bool fetch_response(CPUMat &Y, int data_id, int mb_idx) override
void read_metadata_file(std::vector< size_t > &samples_per_file, std::vector< std::string > &data_filenames, std::vector< std::string > &offsets_filenames)
smiles_data_reader(const bool shuffle)
smiles_data_reader * copy() const override
const std::streamsize OffsetAndLengthBinarySize
void do_preload_data_store() override
std::string get_raw_sample(std::istream *istrm, size_t index, size_t buf_offset=0)
void copy_members(const smiles_data_reader &rhs)
Contains common code for operator= and copy ctor.