Heat equation Jacobi solver

This example solves the two-dimensional stationary (time-independent) heat equation with fixed temperatures at the boundary via Jacobi iterations. The implementation distributes the grid over the processes of a Cartesian communicator using mpl::distributed_grid.
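
Each Jacobi sweep replaces every interior grid value by the neighbour average that satisfies the five-point finite-difference discretization of the Laplace equation. With grid spacings \(\Delta x\) and \(\Delta y\) the update evaluated in the iteration loops below reads

\[
u^{\text{new}}_{i,j} = \frac{\Delta y^2\,(u_{i-1,j} + u_{i+1,j}) + \Delta x^2\,(u_{i,j-1} + u_{i,j+1})}{2\,(\Delta x^2 + \Delta y^2)} ,
\]

and the iteration stops once the relative change per sweep, delta_u / sum_u, drops below 1e-3.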

#include <cstdlib>
#include <iostream>
#include <cmath>
#include <tuple>
#include <random>
#include <mpl/mpl.hpp>

using double_2 = std::tuple<double, double>;


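// Initiate a non-blocking exchange of the overlap (ghost) layers of a
// distributed grid with the neighboring processes along every dimension of the
// Cartesian communicator.  The returned request pool must be completed via
// waitall before the received mirror data is used.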
template<std::size_t dim, typename T, typename A>
mpl::irequest_pool update_overlap(const mpl::cartesian_communicator &communicator,
                                  mpl::distributed_grid<dim, T, A> &distributed_grid,
                                  mpl::tag_t tag = mpl::tag_t()) {
  mpl::shift_ranks ranks;
  mpl::irequest_pool r;
  for (std::size_t i{0}; i < dim; ++i) {
    // send to left
    ranks = communicator.shift(i, -1);
    r.push(communicator.isend(distributed_grid.data(), distributed_grid.left_border_layout(i),
                              ranks.destination, tag));
    r.push(communicator.irecv(distributed_grid.data(), distributed_grid.right_mirror_layout(i),
                              ranks.source, tag));
    // send to right
    ranks = communicator.shift(i, +1);
    r.push(communicator.isend(distributed_grid.data(), distributed_grid.right_border_layout(i),
                              ranks.destination, tag));
    r.push(communicator.irecv(distributed_grid.data(), distributed_grid.left_mirror_layout(i),
                              ranks.source, tag));
  }
  return r;
}


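// Scatter the inner grid points of a local grid held by the root process to
// the distributed subgrids of all processes.  The first overload is called by
// the root process, the second one by all non-root processes.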
template<std::size_t dim, typename T, typename A>
void scatter(const mpl::cartesian_communicator &communicator, int root,
             const mpl::local_grid<dim, T, A> &local_grid,
             mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.scatterv(root, local_grid.data(), local_grid.sub_layouts(),
                        distributed_grid.data(), distributed_grid.interior_layout());
}

template<std::size_t dim, typename T, typename A>
void scatter(const mpl::cartesian_communicator &communicator, int root,
             mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.scatterv(root, distributed_grid.data(), distributed_grid.interior_layout());
}


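// Gather the inner grid points of the distributed subgrids of all processes
// into a local grid held by the root process.  The first overload is called by
// the root process, the second one by all non-root processes.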
template<std::size_t dim, typename T, typename A>
void gather(const mpl::cartesian_communicator &communicator, int root,
            const mpl::distributed_grid<dim, T, A> &distributed_grid,
            mpl::local_grid<dim, T, A> &local_grid) {
  communicator.gatherv(root, distributed_grid.data(), distributed_grid.interior_layout(),
                       local_grid.data(), local_grid.sub_layouts());
}

template<std::size_t dim, typename T, typename A>
void gather(const mpl::cartesian_communicator &communicator, int root,
            const mpl::distributed_grid<dim, T, A> &distributed_grid) {
  communicator.gatherv(root, distributed_grid.data(), distributed_grid.interior_layout());
}


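// Set up a two-dimensional Cartesian communicator, scatter randomly
// initialized grid data from rank 0, perform Jacobi sweeps that overlap the
// halo exchange with the update of the interior, stop once the relative change
// per sweep drops below 1e-3, and finally gather and print the solution on
// rank 0.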
int main() {
  // world communicator
  const mpl::communicator &comm_world{mpl::environment::comm_world()};
  // construct a two-dimensional Cartesian communicator with no periodic boundary conditions
  mpl::cartesian_communicator::dimensions size{mpl::cartesian_communicator::non_periodic,
                                               mpl::cartesian_communicator::non_periodic};
  mpl::cartesian_communicator comm_c{comm_world, mpl::dims_create(comm_world.size(), size)};
  // number of inner grid points in each dimension
  const int n_x{768}, n_y{512};
  // grid points with extremal indices (-1, n_x or n_y) hold fixed boundary data
  // grid lengths and grid spacings
  double l_x{1.5}, l_y{1};
  double dx{l_x / (n_x + 1)}, dy{l_y / (n_y + 1)};
  // distributed grids that hold each processor's subgrid plus a one-point-wide
  // overlap of neighboring data in each direction
  mpl::distributed_grid<2, double> u_d_1(comm_c, {{n_x, 1}, {n_y, 1}});
  mpl::distributed_grid<2, double> u_d_2(comm_c, {{n_x, 1}, {n_y, 1}});
  // rank 0 initializes with some random data
  if (comm_c.rank() == 0) {
    std::default_random_engine engine;
    std::uniform_real_distribution<double> uniform{0, 1};
    // local grid to store the whole set of inner grid points
    mpl::local_grid<2, double> u(comm_c, {n_x, n_y});
    for (auto j{u.begin(1)}, j_end{u.end(1)}; j < j_end; ++j)
      for (auto i{u.begin(0)}, i_end{u.end(0)}; i < i_end; ++i)
        u(i, j) = uniform(engine);
    // scatter data to each processor's subgrid
    scatter(comm_c, 0, u, u_d_1);
  } else
    scatter(comm_c, 0, u_d_1);
  // initialize the boundary data; loop with obegin and oend over all data
  // including the overlap and initialize where the local border is the global border
  for (auto j : {u_d_1.obegin(1), u_d_1.oend(1) - 1})
    for (auto i{u_d_1.obegin(0)}, i_end{u_d_1.oend(0)}; i < i_end; ++i) {
      if (u_d_1.gindex(0, i) < 0 or u_d_1.gindex(1, j) < 0)
        u_d_1(i, j) = u_d_2(i, j) = 1;  // value 1 on the left and lower boundaries
      if (u_d_1.gindex(0, i) >= n_x or u_d_1.gindex(1, j) >= n_y)
        u_d_1(i, j) = u_d_2(i, j) = 0;  // value 0 on the right and upper boundaries
    }
  for (auto i : {u_d_1.obegin(0), u_d_1.oend(0) - 1})
    for (auto j{u_d_1.obegin(1)}, j_end{u_d_1.oend(1)}; j < j_end; ++j) {
      if (u_d_1.gindex(0, i) < 0 or u_d_1.gindex(1, j) < 0)
        u_d_1(i, j) = u_d_2(i, j) = 1;  // value 1 on the left and lower boundaries
      if (u_d_1.gindex(0, i) >= n_x or u_d_1.gindex(1, j) >= n_y)
        u_d_1(i, j) = u_d_2(i, j) = 0;  // value 0 on the right and upper boundaries
    }
  const double dx_2{dx * dx}, dy_2{dy * dy};
  // loop until converged
  bool converged{false};
  int iterations{0};
  while (not converged) {
    iterations++;
    // start the asynchronous exchange of the overlapping boundary data
    mpl::irequest_pool r(update_overlap(comm_c, u_d_1));
    // apply one Jacobi iteration step to the interior region
    double delta_u{0}, sum_u{0};
    for (auto j{u_d_1.begin(1) + 1}, j_end{u_d_1.end(1) - 1}; j < j_end; ++j)
      for (auto i{u_d_1.begin(0) + 1}, i_end{u_d_1.end(0) - 1}; i < i_end; ++i) {
        u_d_2(i, j) = (dy_2 * (u_d_1(i - 1, j) + u_d_1(i + 1, j)) +
                       dx_2 * (u_d_1(i, j - 1) + u_d_1(i, j + 1))) /
                      (2 * (dx_2 + dy_2));
        delta_u += std::abs(u_d_2(i, j) - u_d_1(i, j));
        sum_u += std::abs(u_d_2(i, j));
      }
    r.waitall();
    // apply one Jacobi iteration step to the edge region, which requires the overlapping boundary data
    for (auto j : {u_d_1.begin(1), u_d_1.end(1) - 1})
      for (auto i{u_d_1.begin(0)}, i_end{u_d_1.end(0)}; i < i_end; ++i) {
        u_d_2(i, j) = (dy_2 * (u_d_1(i - 1, j) + u_d_1(i + 1, j)) +
                       dx_2 * (u_d_1(i, j - 1) + u_d_1(i, j + 1))) /
                      (2 * (dx_2 + dy_2));
        delta_u += std::abs(u_d_2(i, j) - u_d_1(i, j));
        sum_u += std::abs(u_d_2(i, j));
      }
    for (auto j{u_d_1.begin(1)}, j_end{u_d_1.end(1)}; j < j_end; ++j)
      for (auto i : {u_d_1.begin(0), u_d_1.end(0) - 1}) {
        u_d_2(i, j) = (dy_2 * (u_d_1(i - 1, j) + u_d_1(i + 1, j)) +
                       dx_2 * (u_d_1(i, j - 1) + u_d_1(i, j + 1))) /
                      (2 * (dx_2 + dy_2));
        delta_u += std::abs(u_d_2(i, j) - u_d_1(i, j));
        sum_u += std::abs(u_d_2(i, j));
      }
    // determine global sum of delta_u and sum_u and distribute to all processors
    double_2 delta_sum_u{delta_u, sum_u};  // pack into a tuple
    comm_c.allreduce(
        [](double_2 a, double_2 b) {
          // reduction adds component-by-component
          return double_2{std::get<0>(a) + std::get<0>(b), std::get<1>(a) + std::get<1>(b)};
        },
        delta_sum_u);
    std::tie(delta_u, sum_u) = delta_sum_u;  // unpack from the tuple
    converged = delta_u / sum_u < 1e-3;      // check for convergence
    u_d_2.swap(u_d_1);
  }
  // gather data and print result
  if (comm_c.rank() == 0) {
    std::cerr << iterations << " iterations\n";
    // local grid to store the whole set of inner grid points
    mpl::local_grid<2, double> u{comm_c, {n_x, n_y}};
    gather(comm_c, 0, u_d_1, u);
    for (auto j{u.begin(1)}, j_end{u.end(1)}; j < j_end; ++j) {
      for (auto i{u.begin(0)}, i_end{u.end(0)}; i < i_end; ++i)
        std::cout << u(i, j) << '\t';
      std::cout << '\n';
    }
  } else
    gather(comm_c, 0, u_d_1);
  return EXIT_SUCCESS;
}
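
To try the example, save the listing (the file name heat_equation_jacobi.cc below is only a placeholder) and build it with a C++17-capable MPI compiler wrapper, pointing the include path at the header-only mpl library; the paths and process count here are assumptions to adjust for your setup:

mpic++ -std=c++17 -I<path-to-mpl-headers> heat_equation_jacobi.cc -o heat_equation_jacobi
mpirun -np 4 ./heat_equation_jacobi > solution.dat

The program prints the number of Jacobi iterations to standard error and writes the tab-separated temperature field to standard output.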