27 #ifndef LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED 28 #define LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED 35 template <
typename TensorDataType, El::Device Dev>
50 std::string
get_type()
const override {
return "cross_grid_sum_slice"; }
72 "\" has no parent layers");
83 std::vector<int> output_dims_slice(input_dims);
84 output_dims_slice.back() = int(output_dims_slice.back() / subgridCommSize);
99 auto& output_cast =
dynamic_cast< 100 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>&
>(
103 auto const sync_info_output =
104 El::SyncInfoFromMatrix(output_cast.LockedMatrix());
106 const El::Int mloc = input.LocalHeight();
107 const El::Int nloc = input.LocalWidth();
109 El::Matrix<TensorDataType, Dev> prev_allreduce(mloc, nloc),
110 after_allreduce(mloc, nloc);
112 El::Copy(input.LockedMatrix(), prev_allreduce);
114 El::mpi::AllReduce(prev_allreduce.Buffer(),
115 after_allreduce.Buffer(),
122 int last_dim = input_dims.back();
124 int last_dim_start_point = 1;
125 for (
int i = 0; i < int(input_dims.size()) - 1; ++i) {
126 last_dim_start_point = last_dim_start_point * input_dims[i];
129 if (last_dim % subgrid_comm_size != 0)
130 LBANN_ERROR(
"cross_grid_sum_slice layer: last dimension should be " 131 "divided by the number of branches in subgraph");
133 int const last_dim_index =
134 int(last_dim / subgrid_comm_size) * subgrid_comm_rank;
136 El::copy::util::InterleaveMatrix(
137 (last_dim / subgrid_comm_size),
138 input.LocalWidth() * last_dim_start_point,
139 after_allreduce.LockedBuffer(last_dim_index, 0),
142 output_cast.Buffer(),
144 (last_dim / subgrid_comm_size),
156 auto mini_batch_size =
173 int last_dim = input_dims.back();
176 const auto& gradient_wrt_output =
180 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>;
181 auto& gradient_wrt_output_cast =
182 dynamic_cast<const MatrixType&
>(gradient_wrt_output);
184 auto& gradient_wrt_input_cast =
dynamic_cast<MatrixType&
>(input_grad);
186 int mloc = gradient_wrt_output_cast.LocalHeight();
187 int nloc = gradient_wrt_output_cast.LocalWidth();
188 int per_grid_last_dim = last_dim / subgrid_comm_size;
190 El::Matrix<TensorDataType, Dev> temp_input(nloc, mloc),
191 temp_output(nloc * (mloc / per_grid_last_dim), last_dim),
192 transposed(nloc * (mloc / per_grid_last_dim), per_grid_last_dim),
193 transposed_output(last_dim, nloc * (mloc / per_grid_last_dim));
195 El::Copy(gradient_wrt_output_cast.LockedMatrix(), temp_input);
196 temp_input.Resize(per_grid_last_dim, nloc * (mloc / per_grid_last_dim));
198 El::Transpose(temp_input, transposed);
200 El::mpi::AllGather(transposed.Buffer(),
202 temp_output.Buffer(),
207 El::Transpose(temp_output, transposed_output);
208 transposed_output.Resize(mloc * subgrid_comm_size, nloc);
210 auto mini_batch_size =
213 for (
int i = 0; i < El::To<int>(children.size()); i++) {
214 auto& gradient_wrt_input_cast_layer =
219 El::Copy(transposed_output, gradient_wrt_input_cast.Matrix());
225 #ifndef LBANN_CROSS_GRID_SUM_SLICE_LAYER_INSTANTIATE 226 #define PROTO_DEVICE(T, Device) \ 227 extern template class cross_grid_sum_slice_layer<T, Device> 232 #endif // LBANN_CROSS_GRID_SUM_SLICE_LAYER_INSTANTIATE 236 #endif // LBANN_LAYER_CROSS_GRID_SUM_SLICE_HPP_INCLUDED virtual void setup_dims()
Set up tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
int get_output_size(size_t output_index=0) const
Get output tensor size.
int get_num_parents() const noexcept
Get number of parent layers.
void bp_compute() final
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.
constexpr El::Device Device
OutputAbsDistMatrixType & get_prev_error_signals(int child_index=0)
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)
El::Int infer_mini_batch_size_from_parents_or_default_to_current() const override
const OutputAbsDistMatrixType & get_activations(const Layer &child) const override
int m_expected_num_child_layers
Expected number of child layers. A negative value indicates no limit.
int get_input_size(size_t input_index=0) const
Get input tensor size.
void set_output_dims(std::vector< int > dims, size_t output_index=0)
Set output tensor dimensions.
std::string get_type() const override
Get the layer type's name.
int get_num_children() const noexcept
Get number of child layers.
void write_specific_proto(lbann_data::Layer &proto) const final
data_layout get_data_layout() const override
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
void setup_pointers() override
Set up layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
std::string get_name() const
Get the layer instance's name.
std::vector< const Layer * > get_child_layers() const
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
void setup_dims() override
Set up tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
cross_grid_sum_slice_layer(lbann_comm *comm)
El::SyncInfo< Dev > syncSubGridCommunication
data_layout
Data layout that is optimized for different modes of parallelism.
void bp_setup_gradient_wrt_inputs() override
Set up gradient w.r.t. input tensors. Called by the 'back_prop' function. Each gradient w...
El::Device get_device_allocation() const override
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
cross_grid_sum_slice_layer * copy() const override
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
El::mpi::Comm & get_subgrid_comm()
void fp_setup_outputs() override
Set up output tensors. Called by the 'forward_prop' function. Each output tensor is resized to match t...
virtual void setup_pointers()
Set up layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
int m_expected_num_parent_layers
const InputAbsDistMatrixType & get_error_signals(const Layer &parent) const override
void fp_compute() override
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.