27 #ifndef LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED 28 #define LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED 31 #ifdef LBANN_HAS_DNN_LIB 33 #endif // LBANN_HAS_DNN_LIB 34 #ifdef LBANN_HAS_ONEDNN 36 #endif // LBANN_HAS_ONEDNN 62 template <
typename TensorDataType, data_layout Layout, El::Device Device>
67 "GRU layer only supports data parallel layout");
70 gru_layer(
size_t hidden_size,
size_t num_layers);
77 std::string
get_type()
const override;
91 template <
typename ArchiveT>
108 void setup_data(
size_t max_mini_batch_size)
override;
119 #ifdef LBANN_GRU_LAYER_ONEDNN_CPU_SUPPORTED 124 struct OnednnCpuObjects
128 using Backend = onednn_backend<El::Device::CPU>;
129 using TensorDesc = Backend::TensorDescriptor;
132 ::dnnl::lbr_gru_forward::primitive_desc gru_forward_primitive_desc;
133 ::dnnl::lbr_gru_forward::primitive gru_forward_primitive;
134 ::dnnl::lbr_gru_backward::primitive_desc gru_backward_primitive_desc;
135 ::dnnl::lbr_gru_backward::primitive gru_backward_primitive;
136 TensorDesc input_sequence_desc;
137 TensorDesc init_hidden_desc;
138 TensorDesc output_sequence_desc;
139 TensorDesc final_hidden_desc;
140 TensorDesc input_sequence_grad_desc;
141 TensorDesc init_hidden_grad_desc;
142 TensorDesc output_sequence_grad_desc;
143 TensorDesc final_hidden_grad_desc;
146 TensorDesc forward_ih_matrix_weights;
147 TensorDesc forward_hh_matrix_weights;
148 TensorDesc backward_ih_matrix_weights;
149 TensorDesc backward_hh_matrix_weights;
150 TensorDesc bias_weights;
151 TensorDesc ih_matrix_weights_grad;
152 TensorDesc hh_matrix_weights_grad;
153 TensorDesc bias_weights_grad;
154 TensorDesc workspace;
158 std::unique_ptr<OnednnCpuObjects> m_onednn_cpu_objects;
161 void setup_onednn_cpu();
164 #endif // LBANN_GRU_LAYER_ONEDNN_CPU_SUPPORTED 166 #ifdef LBANN_GRU_LAYER_CUDNN_SUPPORTED 175 using ByteBuffer = hydrogen::simple_buffer<El::byte, El::Device::GPU>;
176 using IntBuffer = hydrogen::simple_buffer<int32_t, El::Device::GPU>;
177 using LocalMat = El::Matrix<TensorDataType, El::Device::GPU>;
179 std::unordered_map<size_t, std::pair<size_t, cuda::ExecutableGraph>>;
182 dnn_lib::RNNDescriptor rnn_desc;
183 dnn_lib::RNNDataDescriptor input_desc;
184 dnn_lib::RNNDataDescriptor output_desc;
185 dnn_lib::TensorDescriptor hidden_desc;
188 LocalMat input_sequence_workspace;
189 LocalMat output_sequence_workspace;
190 LocalMat input_sequence_grad_workspace;
191 LocalMat output_sequence_grad_workspace;
192 LocalMat init_hidden_workspace;
193 LocalMat init_hidden_grad_workspace;
194 ByteBuffer weights_workspace;
195 ByteBuffer weights_grad_workspace;
196 ByteBuffer workspace;
197 ByteBuffer reserve_space;
198 IntBuffer gpu_sequence_lengths;
205 GraphCache forward_prop_graph_cache;
211 GraphCache backward_prop_graph_cache;
215 std::unique_ptr<CudnnObjects> m_cudnn_objects;
221 #endif // LBANN_GRU_LAYER_CUDNN_SUPPORTED 223 template <
typename T>
225 template <
typename T>
233 #ifndef LBANN_GRU_LAYER_INSTANTIATE 235 #ifdef LBANN_GRU_LAYER_ONEDNN_CPU_SUPPORTED 237 extern template class gru_layer<T, \ 238 data_layout::DATA_PARALLEL, \ 240 #define LBANN_INSTANTIATE_CPU_HALF 243 #endif // LBANN_GRU_LAYER_ONEDNN_CPU_SUPPORTED 245 #ifdef LBANN_GRU_LAYER_CUDNN_SUPPORTED 247 extern template class gru_layer<T, \ 248 data_layout::DATA_PARALLEL, \ 250 #define LBANN_INSTANTIATE_GPU_HALF 253 #endif // LBANN_GRU_LAYER_CUDNN_SUPPORTED 255 #endif // LBANN_GRU_LAYER_INSTANTIATE 259 #endif // LBANN_LAYERS_LEARNING_GRU_HPP_INCLUDED void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
void write_specific_proto(lbann_data::Layer &proto) const final
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
gru_layer & operator=(const gru_layer &other)
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
friend void bp_compute_impl(gru_layer< T, Layout, Device > &)
Generates nicely formatted description messages.
std::string get_type() const override
Get the layer type's name.
gru_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
size_t m_num_layers
Number of stacked GRU cells.
constexpr El::Device Device
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
void serialize(ArchiveT &ar)
friend void fp_compute_impl(gru_layer< T, Layout, Device > &)
friend class cereal::access
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
Stacked gated recurrent unit.
const hydrogen::simple_buffer< El::byte, Device > & get_reserve_space() const
data_layout
Data layout that is optimized for different modes of parallelism.
size_t get_hidden_size() const
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
void setup_data(size_t max_mini_batch_size) override
Setup layer data. Called by the 'setup' function. Memory is allocated for distributed matrices...
LBANN_DEFINE_LAYER_BUILDER(elu)
size_t get_num_layers() const
size_t m_hidden_size
Size of each hidden state and output vector.
description get_description() const override
Human-readable description.