27 #ifndef LBANN_LAYER_CROSS_GRID_SUM_HPP_INCLUDED 28 #define LBANN_LAYER_CROSS_GRID_SUM_HPP_INCLUDED 35 template <
typename TensorDataType, El::Device Dev>
50 std::string
get_type() const final {
return "cross_grid_sum"; }
71 "\" has no parent layers");
84 auto const dims_size = dims_print.size();
85 for (
auto ii = 0UL; ii < dims_size; ++ii) {
86 std::cout <<
"Index:" << ii <<
" dim" << dims_print[ii] <<
"\n";
96 std::stringstream err;
98 <<
"has input tensors with incompatible dimensions (";
101 err << (j > 0 ?
", " :
"") <<
"layer \"" << parents[j]->
get_name()
103 for (
size_t k = 0; k < dims.size(); ++k) {
104 err << (k > 0 ?
" x " :
"") << dims[k];
119 for (
int i = 0; i < El::To<int>(parents.size()); i++) {
126 El::Copy(input, output);
128 auto& output_cast =
dynamic_cast< 129 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>&
>(
132 auto const syncInfoOutput =
133 El::SyncInfoFromMatrix(output_cast.LockedMatrix());
135 const El::Int mloc = output_cast.LocalHeight();
136 const El::Int nloc = output_cast.LocalWidth();
138 El::Matrix<TensorDataType, Dev> temp_output(mloc, nloc);
140 El::Copy(output_cast.LockedMatrix(), temp_output);
142 El::mpi::AllReduce(temp_output.Buffer(),
143 output_cast.Buffer(),
156 auto mini_batch_size =
174 for (
int i = 0; i < El::To<int>(parents.size()); i++) {
178 int const tag = tag_parent - 1;
183 int gradient_wrt_output_Height = gradient_wrt_output.Height();
184 int gradient_wrt_output_Width = gradient_wrt_output.Width();
185 for (
int i = 0; i < El::To<int>(children.size()); i++) {
186 auto& gradient_wrt_input_cast =
dynamic_cast< 187 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>&
>(
189 gradient_wrt_input_cast.Resize(gradient_wrt_output_Height,
190 gradient_wrt_output_Width);
193 El::Copy(gradient_wrt_output, gradient_wrt_input);
195 auto& gradient_wrt_input_cast =
dynamic_cast< 196 El::DistMatrix<TensorDataType, El::STAR, El::VC, El::ELEMENT, Dev>&
>(
199 const El::Int mloc = gradient_wrt_input_cast.LocalHeight();
200 const El::Int nloc = gradient_wrt_input_cast.LocalWidth();
202 El::Matrix<TensorDataType, Dev> temp_output(mloc, nloc);
204 El::Copy(gradient_wrt_input_cast.LockedMatrix(), temp_output);
212 #ifndef LBANN_CROSS_GRID_SUM_LAYER_INSTANTIATE 213 #define PROTO_DEVICE(T, Device) \ 214 extern template class cross_grid_sum_layer<T, Device> 219 #endif // LBANN_CROSS_GRID_SUM_LAYER_INSTANTIATE 223 #endif // LBANN_LAYER_CROSS_GRID_SUM_HPP_INCLUDED virtual void setup_dims()
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
void write_specific_proto(lbann_data::Layer &proto) const final
El::Device get_device_allocation() const final
Get the device allocation for the data tensors. We assume that the device allocation of the previous ...
cross_grid_sum_layer * copy() const final
Copy function. This function dynamically allocates memory for a layer instance and instantiates a cop...
void setup_pointers() final
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
int get_grid_tag() const noexcept
Identifying tag for process grid.
int get_output_size(size_t output_index=0) const
Get output tensor size.
int get_num_parents() const noexcept
Get number of parent layers.
std::vector< int > get_input_dims(size_t input_index=0) const
Get input tensor dimensions.
void setup_dims() final
Setup tensor dimensions. Called by the 'setup' function. If there are any input tensors, the base method sets all uninitialized output tensor dimensions equal to the first input tensor dimensions.
constexpr El::Device Device
int get_backprop_requirements() const override
Returns the necessary tensors for computing backpropagation.
OutputAbsDistMatrixType & get_prev_error_signals(int child_index=0)
InputAbsDistMatrixType & get_prev_activations(int parent_index=0)
El::Int infer_mini_batch_size_from_parents_or_default_to_current() const override
const OutputAbsDistMatrixType & get_activations(const Layer &child) const override
int m_expected_num_child_layers
Expected number of child layers. A negative value indicates no limit.
void fp_compute() final
Apply layer operation. Called by the 'forward_prop' function. Given the input tensors, the output tensors are populated with computed values.
void set_output_dims(std::vector< int > dims, size_t output_index=0)
Set output tensor dimensions.
int get_num_children() const noexcept
Get number of child layers.
std::vector< const Layer * > get_parent_layers() const
void set_subgraph_parallelism_execution()
bool can_run_inplace() const override
If True, the computation can run in-place (feeding each input activations tensor as the corresponding...
std::string get_name() const
Get the layer instance's name.
std::vector< const Layer * > get_child_layers() const
std::string get_type() const final
Get the layer type's name.
cross_grid_sum_layer(lbann_comm *comm)
void bp_compute() final
Compute objective function gradients. Called by the 'back_prop' function. Given the input...
data_layout
Data layout that is optimized for different modes of parallelism.
El::mpi::Comm & get_subgrid_comm()
void fp_setup_outputs() final
Setup output tensors. Called by the 'forward_prop' function. Each output tensor is resized to match t...
virtual void setup_pointers()
Setup layer pointers. Called by the 'setup' function. Pointers to parent/child layers are assumed to ...
std::vector< int > get_output_dims(size_t output_index=0) const
Get output tensor dimensions.
data_layout get_data_layout() const final
Get data layout of the data tensors. We assume that the data layouts of the previous activations...
int m_expected_num_parent_layers
const InputAbsDistMatrixType & get_error_signals(const Layer &parent) const override
void bp_setup_gradient_wrt_inputs() final
Setup gradient w.r.t. input tensors. Called by the 'back_prop' function. Each gradient w...