Heat equation successive over-relaxation

Solves the two-dimensional time-independent heat equation with fixed temperatures at the boundary via successive over-relaxation. The implementation uses the class mpl::distributed_grid to partition the grid among the processes of a two-dimensional Cartesian communicator and to manage the overlap data exchanged between neighboring processes.
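
In each sweep, the code below updates every interior grid point according to

\[
u_{i,j} \leftarrow u_{i,j} + \omega \left( \frac{\Delta y^2 \left( u_{i-1,j} + u_{i+1,j} \right) + \Delta x^2 \left( u_{i,j-1} + u_{i,j+1} \right)}{2 \left( \Delta x^2 + \Delta y^2 \right)} - u_{i,j} \right)
\]

with the over-relaxation parameter \(\omega = 1.875\) (the variable w in the code). The iteration is considered converged when the global relative change per sweep, \(\sum |\delta u| / \sum |u|\), falls below \(10^{-6}\); these sums are accumulated over all processes via an all-reduce operation.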

#include <cstdlib>
#include <iostream>
#include <cmath>
#include <tuple>
#include <random>
#include <mpl/mpl.hpp>

using double_2 = std::tuple<double, double>;


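// exchange the overlap (mirror) regions of a distributed grid with the
// neighboring processes along each dimension of the Cartesian communicator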
template<std::size_t dim, typename T, typename A>
void update_overlap(const mpl::cartesian_communicator &communicator,
                    mpl::distributed_grid<dim, T, A> &distributed_grid,
                    mpl::tag_t tag = mpl::tag_t()) {
  mpl::shift_ranks ranks;
  mpl::irequest_pool r;
  for (std::size_t i{0}; i < dim; ++i) {
    // send to left
    ranks = communicator.shift(i, -1);
    r.push(communicator.isend(distributed_grid.data(), distributed_grid.left_border_layout(i),
                              ranks.destination, tag));
    r.push(communicator.irecv(distributed_grid.data(), distributed_grid.right_mirror_layout(i),
                              ranks.source, tag));
    // send to right
    ranks = communicator.shift(i, +1);
    r.push(communicator.isend(distributed_grid.data(), distributed_grid.right_border_layout(i),
                              ranks.destination, tag));
    r.push(communicator.irecv(distributed_grid.data(), distributed_grid.left_mirror_layout(i),
                              ranks.source, tag));
  }
  r.waitall();
}


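// scatter a local grid held by the root process into the interior regions of
// the distributed grids of all processes; the overload below is called by the
// non-root processes, which provide no source data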
template<std::size_t dim, typename T, typename A>
void scatter(const mpl::cartesian_communicator &communicator, int root,
             const mpl::local_grid<dim, T, A> &local_grid,
             mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.scatterv(root, local_grid.data(), local_grid.sub_layouts(),
                        distributed_grid.data(), distributed_grid.interior_layout());
}

template<std::size_t dim, typename T, typename A>
void scatter(const mpl::cartesian_communicator &communicator, int root,
             mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.scatterv(root, distributed_grid.data(), distributed_grid.interior_layout());
}


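// gather the interior regions of the distributed grids of all processes into
// a local grid held by the root process; the overload below is called by the
// non-root processes, which only send their data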
template<std::size_t dim, typename T, typename A>
void gather(const mpl::cartesian_communicator &communicator, int root,
            const mpl::distributed_grid<dim, T, A> &distributed_grid,
            mpl::local_grid<dim, T, A> &local_grid) {
  communicator.gatherv(root, distributed_grid.data(), distributed_grid.interior_layout(),
                       local_grid.data(), local_grid.sub_layouts());
}

template<std::size_t dim, typename T, typename A>
void gather(const mpl::cartesian_communicator &communicator, int root,
            const mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.gatherv(root, distributed_grid.data(), distributed_grid.interior_layout());
}


int main() {
  // world communicator
  const mpl::communicator &comm_world{mpl::environment::comm_world()};
  // construct a two-dimensional Cartesian communicator with no periodic boundary conditions
  mpl::cartesian_communicator::dimensions size{mpl::cartesian_communicator::non_periodic,
                                               mpl::cartesian_communicator::non_periodic};
  mpl::cartesian_communicator comm_c(comm_world, mpl::dims_create(comm_world.size(), size));
  // total number of inner grid points
  const int n_x{768}, n_y{512};
  // grid points with the extremal indices -1, n_x and n_y hold the fixed boundary data
  // grid lengths and grid spacings
  const double l_x{1.5}, l_y{1};
  const double dx{l_x / (n_x + 1)}, dy{l_y / (n_y + 1)};
  // distributed grid that holds each process's subgrid plus an overlap of one
  // grid point on each side holding copies of neighboring data
  mpl::distributed_grid<2, double> u_d{comm_c, {{n_x, 1}, {n_y, 1}}};
  // rank 0 initializes with some random data
  if (comm_c.rank() == 0) {
    std::default_random_engine engine;
    std::uniform_real_distribution<double> uniform{0, 1};
    // local grid to store the whole set of inner grid points
    mpl::local_grid<2, double> u{comm_c, {n_x, n_y}};
    for (auto j{u.begin(1)}, j_end{u.end(1)}; j < j_end; ++j)
      for (auto i{u.begin(0)}, i_end{u.end(0)}; i < i_end; ++i)
        u(i, j) = uniform(engine);
    // scatter the data to each process's subgrid
    scatter(comm_c, 0, u, u_d);
  } else
    scatter(comm_c, 0, u_d);
  // initialize boundary data, loop with obegin and oend over all
  // data including the overlap
  for (auto j : {u_d.obegin(1), u_d.oend(1) - 1})
    for (auto i{u_d.obegin(0)}, i_end{u_d.oend(0)}; i < i_end; ++i) {
      if (u_d.gindex(0, i) < 0 or u_d.gindex(1, j) < 0)
        u_d(i, j) = 1;  // fixed temperature u = 1 at the left and lower borders
      if (u_d.gindex(0, i) >= n_x or u_d.gindex(1, j) >= n_y)
        u_d(i, j) = 0;  // fixed temperature u = 0 at the right and upper borders
    }
  for (auto i : {u_d.obegin(0), u_d.oend(0) - 1})
    for (auto j{u_d.obegin(1)}, j_end{u_d.oend(1)}; j < j_end; ++j) {
      if (u_d.gindex(0, i) < 0 or u_d.gindex(1, j) < 0)
        u_d(i, j) = 1;  // fixed temperature u = 1 at the left and lower borders
      if (u_d.gindex(0, i) >= n_x or u_d.gindex(1, j) >= n_y)
        u_d(i, j) = 0;  // fixed temperature u = 0 at the right and upper borders
    }
  double w{1.875};  // the over-relaxation parameter
  double dx_2{dx * dx}, dy_2{dy * dy};
  // loop until converged
  bool converged{false};
  while (not converged) {
    // exchange overlap data
    update_overlap(comm_c, u_d);
    // apply one successive over-relaxation step
    double delta_u{0}, sum_u{0};
    for (auto j{u_d.begin(1)}, j_end{u_d.end(1)}; j < j_end; ++j)
      for (auto i{u_d.begin(0)}, i_end{u_d.end(0)}; i < i_end; ++i) {
        double du = -w * u_d(i, j) + w *
                                         (dy_2 * (u_d(i - 1, j) + u_d(i + 1, j)) +
                                          dx_2 * (u_d(i, j - 1) + u_d(i, j + 1))) /
                                         (2 * (dx_2 + dy_2));
        u_d(i, j) += du;
        delta_u += std::abs(du);
        sum_u += std::abs(u_d(i, j));
      }
    // determine the global sums of delta_u and sum_u and distribute them to all processes
    double_2 delta_sum_u{delta_u, sum_u};  // pack into a tuple
    // use a lambda function for global reduction
    comm_c.allreduce(
        [](double_2 a, double_2 b) {
          // reduction adds component-by-component
          return double_2{std::get<0>(a) + std::get<0>(b), std::get<1>(a) + std::get<1>(b)};
        },
        delta_sum_u);
    std::tie(delta_u, sum_u) = delta_sum_u;  // unpack from the tuple
    converged = delta_u / sum_u < 1e-6;      // check for convergence
  }
  if (comm_c.rank() == 0) {
    // local grid to store the whole set of inner grid points
    mpl::local_grid<2, double> u{comm_c, {n_x, n_y}};
    // gather data and print result
    gather(comm_c, 0, u_d, u);
    for (auto j{u.begin(1)}, j_end{u.end(1)}; j < j_end; ++j) {
      for (auto i{u.begin(0)}, i_end{u.end(0)}; i < i_end; ++i)
        std::cout << u(i, j) << '\t';
      std::cout << '\n';
    }
  } else
    gather(comm_c, 0, u_d);
  return EXIT_SUCCESS;
}