27 #ifndef LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED 28 #define LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED 34 #include "lbann/proto/layers.pb.h" 46 template <
typename TensorDataType,
61 template <
typename ArchiveT>
66 std::string
get_type()
const override;
81 bool set_slice_points_from_data_reader,
123 std::shared_ptr<hydrogen::simple_buffer<unsigned char, El::Device::CPU>>
130 gpu_lib::event_wrapper m_workspace_event;
131 #endif // LBANN_HAS_GPU 133 template <
typename U, El::Device D>
135 template <
typename U>
137 template <
typename U>
145 template <
typename T, data_layout L, El::Device D>
148 proto.set_datatype(proto::ProtoDataType<T>);
149 auto* msg = proto.mutable_slice();
154 template <
typename TensorDataType, data_layout Layout, El::Device Device>
162 std::make_shared<hydrogen::simple_buffer<unsigned char, El::Device::CPU>>(
164 hydrogen::SyncInfo<El::Device::CPU>{},
171 template <
typename TensorDataType, data_layout Layout, El::Device Device>
178 template <
typename TensorDataType, data_layout Layout, El::Device Device>
184 template <
typename TensorDataType, data_layout Layout, El::Device Device>
190 template <
typename TensorDataType, data_layout Layout, El::Device Device>
197 template <
typename TensorDataType, data_layout Layout, El::Device Device>
202 std::ostringstream ss;
206 desc.add(
"Slice points", ss.str());
210 template <
typename TensorDataType, El::Device Device>
220 for (
size_t j = 0; j < num_outputs; ++j) {
223 El::LockedView(output,
225 El::IR(offset, offset + output_size),
227 offset += output_size;
231 template <
typename TensorDataType, El::Device Device>
238 for (
size_t j = 0; j < num_outputs; ++j) {
245 template <
typename TensorDataType, data_layout Layout, El::Device Device>
251 template <
typename TensorDataType, data_layout Layout, El::Device Device>
255 const size_t num_dims = input_dims.size();
264 "but only 3-D tensors are currently supported");
267 const int split_dim = input_dims[this->
m_slice_dim];
277 "Subgrpah parallelism is supported when split axis is the last " 282 auto const* ptr_input =
dynamic_cast< 284 DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Device> const*
>(
288 El::copy::TranslateBetweenGridsScatter<TensorDataType, Device, Device>(
297 El::copy::TranslateBetweenGridsScatter<TensorDataType, Device, Device>(
306 El::copy::TranslateBetweenGridsScatter<TensorDataType, Device, Device>(
316 template <
typename TensorDataType, data_layout Layout, El::Device Device>
320 const size_t num_dims = input_dims.size();
331 template <
typename TensorDataType, data_layout Layout, El::Device Device>
336 const int split_dim =
341 auto* ptr_input_grad =
dynamic_cast< 342 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Device>*
>(
345 El::copy::TranslateBetweenGridsGather<TensorDataType, Device, Device>(
353 template <
typename TensorDataType, data_layout Layout, El::Device Device>
358 input_grad.Empty(
false);
360 El::Zeros(input_grad, this->
get_input_size(), output0_grad.Width());
363 template <
typename TensorDataType, data_layout Layout, El::Device Device>
368 const size_t num_dims = input_dims.size();
379 #ifndef LBANN_SLICE_LAYER_INSTANTIATE 380 #define PROTO_DEVICE(T, Device) \ 381 extern template class slice_layer<T, data_layout::DATA_PARALLEL, Device>; \ 382 extern template class slice_layer<T, data_layout::MODEL_PARALLEL, Device> 386 #endif // LBANN_SLICE_LAYER_INSTANTIATE 390 #endif // LBANN_LAYERS_TRANSFORM_SLICE_HPP_INCLUDED
void serialize(ArchiveT &ar)
El::SyncInfo< Device > syncSubGridCommunication
bool m_set_slice_points_from_data_reader
void setup_dims() override
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
int get_output_size(size_t output_index=0) const
Get output tensor size.
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.
Generates nicely formatted description messages.
void setup_slice_points(size_t slice_dim, bool set_slice_points_from_data_reader, const slice_points_mode var_category)
void bp_compute() override
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
virtual description get_description() const
Human-readable description.
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
constexpr El::Device Device
OutputAbsDistMatrixType & get_prev_error_signals(int child_index=0)
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)
Slice tensor along a specified dimension.
const OutputAbsDistMatrixType & get_activations(const Layer &child) const override
int m_expected_num_child_layers
Expected number of child layers. A negative value indicates no limit.
int get_input_size(size_t input_index=0) const
Get input tensor size.
void assign_to_repeated(google::protobuf::RepeatedField< T > &field, ContainerT const &values)
Assign a range of values to a repeated protobuf field.
void bp_compute_subgrid()
slice_points_mode m_var_category
std::vector< std::unique_ptr< OutputAbsDistMatrixType > > & get_all_activations()
std::vector< size_t > m_slice_points
int get_num_children() const noexcept
Get number of child layers.
slice_layer & operator=(const slice_layer &other)=default
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
void write_specific_proto(lbann_data::Layer &proto) const final
std::string get_type() const override
Get the layer type's name.
void setup_slice_points(size_t slice_dim, std::vector< size_t > slice_points)
std::string get_name() const
Get the layer instance's name.
slice_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
void fp_compute_subgrid()
friend void fp_setup_outputs_impl(slice_layer< U, Layout, D > &)
data_layout
Data layout that is optimized for different modes of parallelism.
void bp_setup_gradient_wrt_inputs() override
Setup gradient w.r.t. input tensors. Called by the 'back_prop' function. Each gradient w...
friend class cereal::access
SubGraphCommunication get_communication_flag()
El::mpi::Comm & get_subgrid_comm()
friend void fp_compute_impl(slice_layer< U, Layout, Device > &)
description get_description() const override
Human-readable description.
bool subgraph_parallelism_execution() const noexcept
void fp_setup_outputs() override
Setup output tensors. Called by the 'forward_prop' function. Each output tensor is resized to match t...
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
std::vector< std::unique_ptr< OutputAbsDistMatrixType > > & get_all_prev_error_signals()
const InputAbsDistMatrixType & get_error_signals(const Layer &parent) const override
friend void bp_compute_impl(slice_layer< U, Layout, Device > &)