time_integration.f90

Source Code

!> @file time_integration.f90
!> @brief Time integration schemes for method-of-lines ODE systems.
!!
!! SSPRK = Strong Stability Preserving Runge-Kutta (Shu & Osher, 1988).
!!
!! Supported schemes:
!!   'euler'   — Explicit Euler (1st order)
!!   'ssprk22' — SSPRK(2,2) (Shu & Osher, 1988)
!!   'rk3'     — TVD-RK3 (Shu & Osher, 1988) [default]
!!   'rk4'     — Classic RK4
!!   'ssprk54' — SSPRK(5,4) (Spiteri & Ruuth, 2002)
!!   'beuler'  — Backward Euler with Newton-Raphson (implicit, 1st order)
!!   'bdf2'    — BDF2 with Newton-Raphson (implicit, 2nd order)
!!
!! Implicit banded solver selection is controlled per-instance via
!!   state%cfg%lapack_solver = .true.  (default) — LAPACK dgbsv (pivoted, faster)
!!   state%cfg%lapack_solver = .false.           — built-in Gaussian elimination (no pivoting)

module time_integration
  use precision, only: wp
  use solver_state, only: solver_state_t, neq, allocate_work_arrays, release_work_arrays
  use logger, only: log_warn
  use option_registry, only: time_euler, time_ssprk22, time_rk3, time_rk4, &
                             time_ssprk54, time_beuler, time_bdf2, &
                             time_scheme_names, join_token_list
  use parallel_reductions, only: par_sum_real
  use domain_decomposition, only: decomp_t, decompose
  use mpi_runtime, only: my_rank, n_ranks, parallel_fatal
  use implicit_gather_scatter, only: gather_field_to_root, scatter_field_from_root
  use initial_conditions, only: apply_initial_condition
  use stepper_kernels, only: rhs_fn, kernel_euler, kernel_ssprk22, kernel_tvd_rk3, &
                             kernel_rk4, kernel_ssprk54
  implicit none
  private

  !> Abstract interface satisfied by every single-step stepper.
  !! A stepper receives the solver state as its only argument and updates it
  !! in place.  Both the module-global `step` pointer and the pointer filled
  !! by resolve_time_scheme are declared against this interface.
  public :: stepper_iface
  abstract interface
    subroutine stepper_iface(state)
      import :: solver_state_t
      ! `target` is part of the contract: the explicit steppers in this module
      ! point flat rank-1 views at the state's arrays.
      type(solver_state_t), intent(inout), target :: state
    end subroutine stepper_iface
  end interface

  !> Procedure pointer to the active time-stepping scheme.
  !! Initialised to null; set once by init_time_scheme() before the time loop.
  !! This is module-global state shared by every caller that goes through
  !! `step`; resolve_time_scheme() fills a caller-owned pointer instead.
  procedure(stepper_iface), pointer, public :: step => null()

  !> Maximum Newton-Raphson iterations per time step (beuler and bdf2).
  !! 3 iterations is sufficient for BDF2 at CFL ≤ 10; increase for stiff problems.
  !! Exhausting the budget is not fatal: run_newton logs a warning and the
  !! last iterate is kept.
  integer, parameter :: n_newton = 3

  !> Newton convergence tolerance: max-norm of correction ΔQ must fall below this.
  !! The test is absolute (it is not scaled by the magnitude of Q).
  !! 1e-10 gives full convergence well below solver tolerance for double precision.
  real(wp), parameter :: tol_newton = 1.0e-10_wp

  !> Finite-difference step size for Jacobian column approximation.
  !! The classic optimum for a one-sided FD balances truncation error O(h)
  !! against floating-point cancellation O(ε/h) at h ≈ √ε ≈ √(2.2e-16) ≈ 1.5e-8.
  !! We use 1e-7 (~6.7× larger): somewhat above the theoretical √ε optimum,
  !! chosen for robustness against cancellation in the column FD of the
  !! residual where the per-component magnitudes vary widely.
  !! The step actually applied to a component u is eps_jac * max(1, |u|), so
  !! it is absolute for small components and relative for large ones.
  real(wp), parameter :: eps_jac = 1.0e-7_wp

  !> Snapshot of a solver_state_t's per-rank local layout, used by the
  !! implicit gather-solve-scatter wrappers when rank 0 temporarily
  !! swaps into a single-rank decomp.
  !!
  !! Filled by save_local_layout and consumed by restore_local_layout.  Only
  !! interior cells are captured; halo cells are not part of the snapshot.
  type :: local_layout_save_t
    ! Copy of state%decomp at save time (the per-rank decomposition).
    type(decomp_t) :: decomp
    ! Copy of state%ub(:, 1:n_pt) at save time.
    real(wp), allocatable :: ub_interior(:, :)         ! (neq, n_local)
    ! Copy of state%bdf2_ub_prev; left unallocated when the state's array was
    ! not allocated at save time.
    real(wp), allocatable :: bdf2_ub_prev(:, :)        ! (neq, n_local)
    ! Copy of state%bdf2_initialized at save time.
    logical :: bdf2_was_initialized = .false.
  end type local_layout_save_t

  public :: init_time_scheme, resolve_time_scheme
  public :: beuler_step, bdf2_step

contains

  ! ---------------------------------------------------------------------------
  !> Take a snapshot of the per-rank view of @p state.
  !!
  !! Records the decomposition descriptor, the BDF2 bootstrap flag, a copy of
  !! the interior columns of ub (1:n_pt), and, when the state carries one, a
  !! copy of the BDF2 history array.  Halo columns are not captured.
  !!
  !! @param[in]  state  Solver state in its local (per-rank) layout.
  !! @param[out] snap   Snapshot; any previous contents are discarded.
  ! ---------------------------------------------------------------------------
  subroutine save_local_layout(state, snap)
    type(solver_state_t), intent(in) :: state
    type(local_layout_save_t), intent(out) :: snap
    integer :: alloc_stat

    ! Scalar bookkeeping first.
    snap % bdf2_was_initialized = state % bdf2_initialized
    snap % decomp = state % decomp

    associate (n_interior => state % n_pt)
      allocate (snap % ub_interior(neq, n_interior), stat=alloc_stat)
      if (alloc_stat /= 0) error stop 'save_local_layout: allocation failed (ub_interior)'
      snap % ub_interior = state % ub(:, 1:n_interior)

      ! The history array is optional: it only exists once a BDF2 step has
      ! allocated it on this state.
      if (allocated(state % bdf2_ub_prev)) then
        allocate (snap % bdf2_ub_prev(neq, n_interior), stat=alloc_stat)
        if (alloc_stat /= 0) error stop 'save_local_layout: allocation failed (bdf2_ub_prev)'
        snap % bdf2_ub_prev = state % bdf2_ub_prev
      end if
    end associate
  end subroutine save_local_layout

  ! ---------------------------------------------------------------------------
  !> Put @p state back into the per-rank layout recorded in @p snap.
  !!
  !! Sequence: release the work arrays, reinstate the saved decomp, allocate
  !! the work arrays again for that decomp, then copy back the saved interior
  !! of ub, the BDF2 bootstrap flag and (when the snapshot holds one) the BDF2
  !! history array.
  !!
  !! Only interior columns 1:n_pt of ub are written; halo columns keep
  !! whatever allocate_work_arrays leaves in them.
  !! NOTE(review): confirm that callers relying on preset (Dirichlet / inflow)
  !! halos re-seed them after this call.
  !!
  !! @param[inout] state  Solver state to return to its local layout.
  !! @param[in]    snap   Snapshot produced by save_local_layout.
  ! ---------------------------------------------------------------------------
  subroutine restore_local_layout(state, snap)
    type(solver_state_t), intent(inout) :: state
    type(local_layout_save_t), intent(in) :: snap
    integer :: alloc_stat

    ! Tear down the current view and rebuild the work arrays for the saved one.
    call release_work_arrays(state)
    state % decomp = snap % decomp
    call allocate_work_arrays(state)

    state % ub(:, 1:state % n_pt) = snap % ub_interior
    state % bdf2_initialized = snap % bdf2_was_initialized

    ! No history in the snapshot: state%bdf2_ub_prev is left exactly as it is.
    if (.not. allocated(snap % bdf2_ub_prev)) return

    if (.not. allocated(state % bdf2_ub_prev)) then
      allocate (state % bdf2_ub_prev(neq, state % n_pt), stat=alloc_stat)
      if (alloc_stat /= 0) error stop 'restore_local_layout: allocation failed (bdf2_ub_prev)'
    end if
    ! Whole-array assignment: an already-allocated array of a different shape
    ! is reshaped to the snapshot's shape by allocatable assignment.
    state % bdf2_ub_prev = snap % bdf2_ub_prev
  end subroutine restore_local_layout

  ! ---------------------------------------------------------------------------
  !> Map a scheme name onto the matching stepper procedure.
  !!
  !! Unlike init_time_scheme, this routine does not touch the module-global
  !! `step` pointer: the result goes into the caller-supplied @p stepper.  It
  !! is the session-safe path used by `solver_runtime`; the legacy global
  !! pointer stays available for unit tests and older call sites.
  !!
  !! Trailing blanks in @p scheme are ignored.  An unrecognised name stops
  !! the program with a message listing the valid scheme names.
  !!
  !! @param[out] stepper  Procedure pointer bound to the selected stepper.
  !! @param[in]  scheme   Scheme name (one of the option_registry time_* tokens).
  ! ---------------------------------------------------------------------------
  subroutine resolve_time_scheme(stepper, scheme)
    procedure(stepper_iface), pointer, intent(out) :: stepper
    character(len=*), intent(in) :: scheme

    ! Start disassociated so that "no match" is detectable after the dispatch.
    stepper => null()

    select case (trim(scheme))
    case (time_euler)
      stepper => euler_step
    case (time_ssprk22)
      stepper => ssprk22_step
    case (time_rk3)
      stepper => tvd_rk3_step
    case (time_rk4)
      stepper => rk4_step
    case (time_ssprk54)
      stepper => ssprk54_step
    case (time_beuler)
      stepper => beuler_step
    case (time_bdf2)
      stepper => bdf2_step
    end select

    if (.not. associated(stepper)) then
      error stop 'time_integration: unknown scheme "'//trim(scheme)// &
        '"; valid: '//trim(join_token_list(time_scheme_names))
    end if
  end subroutine resolve_time_scheme

  ! ---------------------------------------------------------------------------
  !> Point the module-global @p step pointer at the requested scheme.
  !!
  !! Thin wrapper over resolve_time_scheme; an unknown name is handled there
  !! (the program stops with the list of valid names).
  !!
  !! Accepted names:
  !!   'euler'   — Explicit Euler (1st order, 1 stage)
  !!   'ssprk22' — SSPRK(2,2) / Heun (2nd order, 2 stages; Shu & Osher, 1988)
  !!   'rk3'     — TVD-RK3 (Shu & Osher, 1988)  [default]
  !!   'rk4'     — Classic RK4 (4th order, 4 stages; not SSP)
  !!   'ssprk54' — SSPRK(5,4) (Spiteri & Ruuth, 2002)
  !!   'beuler'  — Backward Euler (implicit, 1st order)
  !!   'bdf2'    — BDF2 (implicit, 2nd order; Gear 1971)
  !!
  !! @param[in] scheme  Scheme name.
  ! ---------------------------------------------------------------------------
  subroutine init_time_scheme(scheme)
    character(len=*), intent(in) :: scheme

    call resolve_time_scheme(stepper=step, scheme=scheme)
  end subroutine init_time_scheme

  ! ---------------------------------------------------------------------------
  !> Residual callback handed to the stepper_kernels routines.
  !!
  !! The kernels only know an opaque context; here it must be the 1D
  !! solver_state_t, on which compute_resid is invoked (halo exchange, BCs and
  !! the interior residual all happen inside compute_resid).  Any other
  !! dynamic type stops the program.
  !!
  !! @param[inout] ctx  Opaque context; must be a solver_state_t.
  ! ---------------------------------------------------------------------------
  subroutine rhs_1d(ctx)
    use spatial_discretization, only: compute_resid
    class(*), intent(inout), target :: ctx

    select type (solver => ctx)
    type is (solver_state_t)
      call compute_resid(solver)
    class default
      error stop 'time_integration: rhs_1d received unexpected context type'
    end select
  end subroutine rhs_1d

  ! ---------------------------------------------------------------------------
  !> TVD-RK3 adapter: one third-order SSP Runge-Kutta step (Shu & Osher, 1988).
  !!
  !! The stage arithmetic is delegated to stepper_kernels::kernel_tvd_rk3.
  !! This routine exposes the state's ub, resid and scratch1 arrays to the
  !! kernel as flat rank-1 views, supplies rhs_1d as the residual callback
  !! (with the state itself as its context), and then calls compute_resid_glob
  !! for the global L2 residual norm used in convergence monitoring.
  ! ---------------------------------------------------------------------------
  subroutine tvd_rk3_step(state)
    type(solver_state_t), intent(inout), target :: state
    real(wp), pointer :: q_flat(:), res_flat(:), stage_flat(:)
    integer :: len_q, len_res, len_stage

    len_q = size(state % ub)
    len_res = size(state % resid)
    len_stage = size(state % scratch1)

    ! Halo-zero invariant (shared by all five explicit adapters): the flat
    ! views cover halo cells as well as interior cells, so every whole-array
    ! stage update in stepper_kernels also advances the halos as
    ! ub_halo + dt*resid_halo.  That is only correct because the halo entries
    ! of `resid` are zero: allocate_work_arrays zeroes them once and
    ! compute_resid fills interior cells only.  Stale halo values left in `ub`
    ! are harmless, since halo exchange + BC application refresh them before
    ! the next compute_resid.  See the stepper_kernels module header.
    q_flat(1:len_q) => state % ub
    res_flat(1:len_res) => state % resid
    stage_flat(1:len_stage) => state % scratch1

    call kernel_tvd_rk3(q_flat, res_flat, stage_flat, state % dt, rhs_1d, state)

    call compute_resid_glob(state)
  end subroutine tvd_rk3_step

  ! ---------------------------------------------------------------------------
  !> SSPRK(5,4) adapter: one five-stage, fourth-order SSP Runge-Kutta step
  !! (Spiteri & Ruuth, 2002, Table 1).
  !!
  !! Coefficients and stage arithmetic are in stepper_kernels::kernel_ssprk54.
  !! This routine builds flat rank-1 views of ub, resid and the three scratch
  !! arrays, passes rhs_1d as the residual callback, and then calls
  !! compute_resid_glob for the global L2 residual norm.
  ! ---------------------------------------------------------------------------
  subroutine ssprk54_step(state)
    type(solver_state_t), intent(inout), target :: state
    real(wp), pointer :: q_flat(:), res_flat(:)
    real(wp), pointer :: work_a(:), work_b(:), work_c(:)
    integer :: len_q, len_res, len_a, len_b, len_c

    len_q = size(state % ub)
    len_res = size(state % resid)
    len_a = size(state % scratch1)
    len_b = size(state % scratch2)
    len_c = size(state % scratch3)

    ! Bounds-remapped views: each one spans the whole array, halos included.
    q_flat(1:len_q) => state % ub
    res_flat(1:len_res) => state % resid
    work_a(1:len_a) => state % scratch1
    work_b(1:len_b) => state % scratch2
    work_c(1:len_c) => state % scratch3

    call kernel_ssprk54(q_flat, res_flat, work_a, work_b, work_c, state % dt, rhs_1d, state)

    call compute_resid_glob(state)
  end subroutine ssprk54_step

  ! ---------------------------------------------------------------------------
  !> Explicit Euler adapter: one first-order, single-stage step.
  !!
  !! The update itself is in stepper_kernels::kernel_euler; no scratch arrays
  !! are involved.  This routine builds flat rank-1 views of ub and resid,
  !! passes rhs_1d as the residual callback, and then calls compute_resid_glob
  !! for the global L2 residual norm.
  ! ---------------------------------------------------------------------------
  subroutine euler_step(state)
    type(solver_state_t), intent(inout), target :: state
    real(wp), pointer :: q_flat(:), res_flat(:)
    integer :: len_q, len_res

    len_q = size(state % ub)
    len_res = size(state % resid)

    ! Bounds-remapped views: each one spans the whole array, halos included.
    q_flat(1:len_q) => state % ub
    res_flat(1:len_res) => state % resid

    call kernel_euler(q_flat, res_flat, state % dt, rhs_1d, state)

    call compute_resid_glob(state)
  end subroutine euler_step

  ! ---------------------------------------------------------------------------
  !> SSPRK(2,2) adapter: one two-stage, second-order SSP Runge-Kutta step
  !! (Shu & Osher, 1988; Heun's method in SSP form).  CFL stability limit: 1.
  !!
  !! Stage arithmetic is in stepper_kernels::kernel_ssprk22.  This routine
  !! builds flat rank-1 views of ub, resid and scratch1, passes rhs_1d as the
  !! residual callback, and then calls compute_resid_glob for the global L2
  !! residual norm.
  ! ---------------------------------------------------------------------------
  subroutine ssprk22_step(state)
    type(solver_state_t), intent(inout), target :: state
    real(wp), pointer :: q_flat(:), res_flat(:), stage_flat(:)
    integer :: len_q, len_res, len_stage

    len_q = size(state % ub)
    len_res = size(state % resid)
    len_stage = size(state % scratch1)

    ! Bounds-remapped views: each one spans the whole array, halos included.
    q_flat(1:len_q) => state % ub
    res_flat(1:len_res) => state % resid
    stage_flat(1:len_stage) => state % scratch1

    call kernel_ssprk22(q_flat, res_flat, stage_flat, state % dt, rhs_1d, state)

    call compute_resid_glob(state)
  end subroutine ssprk22_step

  ! ---------------------------------------------------------------------------
  !> Classic RK4 adapter: one four-stage, fourth-order Runge-Kutta step.
  !!
  !! Stage arithmetic is in stepper_kernels::kernel_rk4, which uses the first
  !! scratch array for Q^n and the second to accumulate the weighted stages.
  !! This routine builds the flat rank-1 views, passes rhs_1d as the residual
  !! callback, and then calls compute_resid_glob for the global L2 residual
  !! norm.
  !!
  !! WARNING: classic RK4 is not strong-stability preserving.  Near shocks it
  !! may amplify oscillations at large CFL numbers; keep CFL <= 1 with WENO
  !! spatial discretisation to avoid spurious artefacts.
  ! ---------------------------------------------------------------------------
  subroutine rk4_step(state)
    type(solver_state_t), intent(inout), target :: state
    real(wp), pointer :: q_flat(:), res_flat(:), q_start(:), stage_sum(:)
    integer :: len_q, len_res, len_start, len_sum

    len_q = size(state % ub)
    len_res = size(state % resid)
    len_start = size(state % scratch1)
    len_sum = size(state % scratch2)

    ! Bounds-remapped views: each one spans the whole array, halos included.
    q_flat(1:len_q) => state % ub
    res_flat(1:len_res) => state % resid
    q_start(1:len_start) => state % scratch1
    stage_sum(1:len_sum) => state % scratch2

    call kernel_rk4(q_flat, res_flat, q_start, stage_sum, state % dt, rhs_1d, state)

    call compute_resid_glob(state)
  end subroutine rk4_step

  ! ---------------------------------------------------------------------------
  !> Compute banded-storage dimensions for the Newton-step Jacobian.
  !!
  !! n_dof = neq * n_pt.  Unknowns are numbered cell by cell with the equation
  !! index running fastest, dof = (cell - 1)*neq + eq, and resid at cell i
  !! depends on ub at cell j only when |i - j| <= halo_width.  The largest
  !! row/column offset is therefore reached between the first equation of one
  !! cell and the last equation of a cell halo_width away:
  !!
  !!   halo_width*neq + (neq - 1) = neq*(halo_width + 1) - 1
  !!
  !! so kl = ku = neq*(halo_width + 1) - 1.  For neq = 1 this is simply
  !! halo_width.  For neq > 1 a band of only neq*halo_width is too narrow: it
  !! cannot hold the cross-equation couplings of the outermost stencil cells.
  !!
  !! LAPACK dgbsv needs kl extra pivot rows: ldab = 2*kl+ku+1, diag_row = kl+ku+1.
  !! Built-in solver uses compact storage:   ldab = kl+ku+1,   diag_row = ku+1.
  !!
  !! @param[in]  state     Solver state (reads n_pt, halo_width and cfg%lapack_solver).
  !! @param[out] n_dof     Total degrees of freedom (neq * n_pt)
  !! @param[out] kl        Lower bandwidth
  !! @param[out] ku        Upper bandwidth
  !! @param[out] ldab      Leading dimension of band matrix
  !! @param[out] diag_row  Band-storage row that holds the diagonal
  ! ---------------------------------------------------------------------------
  subroutine setup_band_storage(state, n_dof, kl, ku, ldab, diag_row)
    type(solver_state_t), intent(in) :: state
    integer, intent(out) :: n_dof, kl, ku, ldab, diag_row

    n_dof = neq * state % n_pt

    ! Symmetric band: halo_width full cells of neq unknowns each, plus the
    ! (neq - 1) within-cell offset between different equations.
    kl = neq * (state % halo_width + 1) - 1
    ku = kl

    if (state % cfg % lapack_solver) then
      ! dgbsv layout: kl extra rows on top receive fill-in from pivoting.
      ldab = 2 * kl + ku + 1
      diag_row = kl + ku + 1
    else
      ! Compact layout for the built-in (non-pivoting) elimination.
      ldab = kl + ku + 1
      diag_row = ku + 1
    end if
  end subroutine setup_band_storage

  ! ---------------------------------------------------------------------------
  !> Banded linear solver using LAPACK dgbsv.
  !!
  !! AB must be in LAPACK band storage: AB(kl+ku+1+i-j, j) = A(i,j),
  !! leading dimension ldab = 2*kl + ku + 1 (the extra kl rows are used as
  !! pivoting workspace by LAPACK).  On exit b contains the solution x = A^{-1} b.
  !!
  !! The leading dimension of the right-hand side is passed as max(1, n):
  !! LAPACK requires LDB >= max(1, N), so passing n itself would be rejected
  !! as an illegal argument for an empty system (n = 0).
  !!
  !! @param[inout] ab    Banded matrix in LAPACK storage (overwritten with LU)
  !! @param[in]    kl    Lower bandwidth
  !! @param[in]    ku    Upper bandwidth
  !! @param[in]    n     Matrix order (number of unknowns)
  !! @param[inout] b     RHS on entry; solution on exit
  !! @param[out]   info  LAPACK return code (0 = success, < 0 = illegal
  !!                     argument, > 0 = exactly singular factor)
  ! ---------------------------------------------------------------------------
  subroutine band_lapack_solve(ab, kl, ku, n, b, info)
    integer, intent(in) :: kl, ku, n
    real(wp), intent(inout) :: ab(2 * kl + ku + 1, n)
    real(wp), intent(inout) :: b(n)
    integer, intent(out) :: info
    integer :: ipiv(n)
    integer :: ldab, ldb

    ! Explicit interface for LAPACK dgbsv (double-precision banded solver).
    ! wp = real64 = double precision, so the kinds match.  Declaring the
    ! dummies as `double precision` makes the compiler reject the call below
    ! if wp is ever changed to a kind dgbsv cannot accept.
    interface
      subroutine dgbsv(n_in, kl_in, ku_in, nrhs, ab_in, ldab, ipiv_in, b_in, ldb, info_out)
        integer, intent(in) :: n_in, kl_in, ku_in, nrhs, ldab, ldb
        double precision, intent(inout) :: ab_in(ldab, *)
        double precision, intent(inout) :: b_in(ldb, *)
        integer, intent(out) :: ipiv_in(*)
        integer, intent(out) :: info_out
      end subroutine dgbsv
    end interface

    ldab = 2 * kl + ku + 1
    ! LDB must be at least 1 even when the system is empty.
    ldb = max(1, n)

    call dgbsv(n, kl, ku, 1, ab, ldab, ipiv, b, ldb, info)
  end subroutine band_lapack_solve

  ! ---------------------------------------------------------------------------
  !> Shared Newton + banded-solve inner loop for the implicit steppers.
  !!
  !! Solves  A·ΔQ = c_resid·dt·R(Q^k) - ((Q^k - c1·Q_n) + c2·Q_{n-1}),
  !! A = I - c_resid·dt·J, iterating Q^{k+1}=Q^k+ΔQ up to n_newton times with a
  !! column-wise FD Jacobian.  The history coefficients select the scheme:
  !!   * backward Euler : c_resid=1,   c1=1,   c2=0    (Q_{n-1} unused)
  !!   * BDF2           : c_resid=2/3, c1=4/3, c2=1/3
  !! The fixed assembly order `(Q^k - c1·Q_n) + c2·Q_{n-1}` reproduces both
  !! callers' previous arithmetic bit-for-bit (·1.0 and +0.0 are exact in IEEE).
  !! Operates entirely on st%ub / st%resid; the caller owns the band geometry
  !! and the ab/rhs/dq/resid scratch.  Does NOT call compute_resid_glob.
  !!
  !! Cost: every Newton iteration performs one base residual evaluation plus
  !! one perturbed evaluation per unknown (n_dof_loc + 1 calls to compute_resid).
  !!
  !! Failure handling: a non-zero dgbsv return code is fatal (parallel_fatal);
  !! running out of iterations only logs a warning, and the last iterate stays
  !! in st%ub.
  !!
  !! NOTE(review): the last compute_resid call in an iteration is the final
  !! column perturbation, and no residual is evaluated after the Newton
  !! update, so on return st%resid does not correspond to the updated st%ub.
  !! Confirm that downstream residual-norm reporting accounts for this.
  !!
  !! @param[inout] st              Solver state; st%ub holds Q^k and is updated in place.
  !! @param[in]    ub_n_loc        Q_n, interior cells only, shape (neq, n_pt).
  !! @param[in]    ub_nm1_loc      Q_{n-1}, same shape (multiplied by hist_c2).
  !! @param[in]    c_resid         Weight of dt·R in the implicit formula.
  !! @param[in]    hist_c1         History coefficient applied to Q_n.
  !! @param[in]    hist_c2         History coefficient applied to Q_{n-1}.
  !! @param[in]    scheme_label    Prefix used in warning / fatal messages.
  !! @param[in]    n_dof_loc       Number of unknowns.
  !! @param[in]    kl_loc          Lower bandwidth.
  !! @param[in]    ku_loc          Upper bandwidth.
  !! @param[in]    ldab_loc        Leading dimension of the band matrix.
  !! @param[in]    diag_row_loc    Band-storage row holding the diagonal.
  !! @param[inout] ab_loc          Band-matrix scratch (rebuilt every iteration).
  !! @param[inout] rhs_loc         Right-hand-side scratch.
  !! @param[inout] dq_loc          Newton correction scratch.
  !! @param[inout] resid_base_loc  Scratch for the packed R(Q^k).
  !! @param[inout] resid_pert_loc  Scratch for the packed perturbed residual.
  ! ---------------------------------------------------------------------------
  subroutine run_newton(st, ub_n_loc, ub_nm1_loc, c_resid, hist_c1, hist_c2, &
                        scheme_label, n_dof_loc, kl_loc, ku_loc, ldab_loc, &
                        diag_row_loc, ab_loc, rhs_loc, dq_loc, &
                        resid_base_loc, resid_pert_loc)
    use spatial_discretization, only: compute_resid
    type(solver_state_t), intent(inout) :: st
    real(wp), intent(in) :: ub_n_loc(neq, st % n_pt), ub_nm1_loc(neq, st % n_pt)
    real(wp), intent(in) :: c_resid, hist_c1, hist_c2
    character(len=*), intent(in) :: scheme_label
    integer, intent(in) :: n_dof_loc, kl_loc, ku_loc, ldab_loc, diag_row_loc
    real(wp), intent(inout) :: ab_loc(ldab_loc, n_dof_loc)
    real(wp), intent(inout) :: rhs_loc(n_dof_loc), dq_loc(n_dof_loc)
    real(wp), intent(inout) :: resid_base_loc(n_dof_loc), resid_pert_loc(n_dof_loc)

    integer :: it, jj, eq_loc, cell_loc, info_la_loc
    real(wp) :: h_loc, ub_save_loc, delta_max_loc
    ! Flat copies of the history levels (automatic arrays of length n_dof_loc).
    real(wp) :: q_n_flat(n_dof_loc), q_nm1_flat(n_dof_loc)

    ! The history levels do not change during the iteration: pack them once.
    call pack_field(ub_n_loc, q_n_flat, neq, st % n_pt)
    call pack_field(ub_nm1_loc, q_nm1_flat, neq, st % n_pt)

    newton_loop: do it = 1, n_newton

      ! Base residual R(Q^k) at the current iterate.
      call compute_resid(st)
      call pack_field(st % resid(:, 1:st % n_pt), resid_base_loc, neq, st % n_pt)

      ! Finite-difference Jacobian, one column per unknown.
      ab_loc = 0.0_wp
      do jj = 1, n_dof_loc
        ! Flat index -> (equation, cell); the equation index runs fastest.
        eq_loc = mod(jj - 1, neq) + 1
        cell_loc = (jj - 1) / neq + 1

        ! Perturb a single unknown; the step is relative for |u| > 1 and
        ! absolute otherwise.
        ub_save_loc = st % ub(eq_loc, cell_loc)
        h_loc = eps_jac * max(1.0_wp, abs(ub_save_loc))
        st % ub(eq_loc, cell_loc) = ub_save_loc + h_loc

        call compute_resid(st)
        call pack_field(st % resid(:, 1:st % n_pt), resid_pert_loc, neq, st % n_pt)

        ! Undo the perturbation before moving on to the next column.
        st % ub(eq_loc, cell_loc) = ub_save_loc

        ! Column jj of J ~ (R(Q + h e_jj) - R(Q)) / h.
        call jac_store_col(ab_loc, jj, (resid_pert_loc - resid_base_loc) / h_loc, &
                           kl_loc, ku_loc, diag_row_loc, n_dof_loc)
      end do

      ! Turn J into A = I - c_resid*dt*J: scale the whole band, then add the
      ! identity on the diagonal row.
      ab_loc = -c_resid * st % dt * ab_loc
      do jj = 1, n_dof_loc
        ab_loc(diag_row_loc, jj) = ab_loc(diag_row_loc, jj) + 1.0_wp
      end do

      ! rhs_loc first receives the packed Q^k, then is overwritten with the
      ! Newton right-hand side.  The parenthesisation is deliberate: it fixes
      ! the floating-point evaluation order.
      call pack_field(st % ub(:, 1:st % n_pt), rhs_loc, neq, st % n_pt)
      rhs_loc = c_resid * st % dt * resid_base_loc &
                - ((rhs_loc - hist_c1 * q_n_flat) + hist_c2 * q_nm1_flat)

      if (st % cfg % lapack_solver) then
        ! dgbsv solves in place, so the right-hand side is copied into dq_loc.
        dq_loc = rhs_loc
        call band_lapack_solve(ab_loc, kl_loc, ku_loc, n_dof_loc, dq_loc, info_la_loc)
        if (info_la_loc /= 0) then
          block
            character(len=16) :: code_buf
            character(len=:), allocatable :: cause
            write (code_buf, '(I0)') info_la_loc
            ! dgbsv: info < 0 => illegal i-th argument; info > 0 => U(i,i)=0 (singular).
            if (info_la_loc < 0) then
              cause = 'illegal argument to dgbsv'
            else
              cause = 'singular matrix'
            end if
            call parallel_fatal(trim(scheme_label)//': LAPACK dgbsv failed ('//trim(cause)// &
                                ', info='//trim(code_buf)//')')
          end block
        end if
      else
        ! Built-in banded elimination (compact storage).
        call band_lu_solve(ab_loc, n_dof_loc, kl_loc, ku_loc, rhs_loc, dq_loc)
      end if

      ! Apply the correction, Q^{k+1} = Q^k + dQ, and test its max-norm.
      delta_max_loc = maxval(abs(dq_loc))
      call unpack_add(dq_loc, st % ub(:, 1:st % n_pt), neq, st % n_pt)

      if (delta_max_loc < tol_newton) exit newton_loop
    end do newton_loop

    ! A loop that runs to completion leaves `it` at n_newton + 1; an early
    ! exit leaves it <= n_newton.  So this branch means "never converged".
    if (it > n_newton) then
      block
        character(len=24) :: buf
        write (buf, '(ES12.4)') delta_max_loc
        call log_warn(trim(scheme_label)//': Newton did not converge; max_norm(dQ) = '//trim(buf))
      end block
    end if
  end subroutine run_newton

  ! ---------------------------------------------------------------------------
  !> Backward-Euler Newton solve on whatever layout @p state currently has.
  !!
  !! The routine is decomposition-agnostic: it works on state%ub / state%resid
  !! at their current size, and the caller is responsible for having sized the
  !! state correctly.  It sets up the band geometry, allocates the scratch
  !! arrays, snapshots Q^n, and delegates to run_newton with
  !! (c_resid, c1, c2) = (1, 1, 0), i.e. backward Euler as the degenerate case
  !! of the shared BDF solve.
  !!
  !! compute_resid_glob is deliberately NOT called here.  The wrapper calls it
  !! after restore_local_layout has put every rank back on its local view, so
  !! that the par_sum_real inside it involves all ranks.
  !!
  !! @param[inout] state  Solver state; state%ub is advanced in place.
  ! ---------------------------------------------------------------------------
  subroutine beuler_step_serial_body(state)
    type(solver_state_t), intent(inout) :: state

    integer :: n_unknowns, n_sub, n_super, band_rows, diag_idx
    integer :: stat_code
    real(wp), allocatable :: q_old(:, :)
    real(wp), allocatable :: r_ref(:), r_fd(:)
    real(wp), allocatable :: band(:, :)
    real(wp), allocatable :: newton_rhs(:), newton_dq(:)

    call setup_band_storage(state, n_unknowns, n_sub, n_super, band_rows, diag_idx)

    allocate (q_old(neq, state % n_pt), stat=stat_code)
    if (stat_code /= 0) error stop 'beuler_step_serial_body: allocation failed (ub_n)'
    allocate (r_ref(n_unknowns), r_fd(n_unknowns), stat=stat_code)
    if (stat_code /= 0) error stop 'beuler_step_serial_body: allocation failed (resid)'
    allocate (band(band_rows, n_unknowns), newton_rhs(n_unknowns), newton_dq(n_unknowns), &
              stat=stat_code)
    if (stat_code /= 0) error stop 'beuler_step_serial_body: allocation failed (ab/rhs/dq)'

    ! Q^n: the iterate in state%ub starts from it and is updated in place.
    q_old = state % ub(:, 1:state % n_pt)

    ! A = I - dt·J,  rhs = dt·R(Q^k) - (Q^k - Q^n).  With c2 = 0 the Q_{n-1}
    ! slot contributes nothing, so Q^n is simply passed again.
    call run_newton(st=state, ub_n_loc=q_old, ub_nm1_loc=q_old, &
                    c_resid=1.0_wp, hist_c1=1.0_wp, hist_c2=0.0_wp, &
                    scheme_label='beuler_step', &
                    n_dof_loc=n_unknowns, kl_loc=n_sub, ku_loc=n_super, &
                    ldab_loc=band_rows, diag_row_loc=diag_idx, &
                    ab_loc=band, rhs_loc=newton_rhs, dq_loc=newton_dq, &
                    resid_base_loc=r_ref, resid_pert_loc=r_fd)

    deallocate (q_old, r_ref, r_fd, band, newton_rhs, newton_dq, stat=stat_code)
    if (stat_code /= 0) error stop 'beuler_step_serial_body: deallocation failed'
  end subroutine beuler_step_serial_body

  ! ---------------------------------------------------------------------------
  !> Advance the solution by one time step using backward (implicit) Euler.
  !!
  !! Solves Q^{n+1} = Q^n + dt · R(Q^{n+1}) by Newton-Raphson iteration:
  !!
  !!   Q^{0} = Q^n
  !!   for k = 0 .. n_newton-1:
  !!     compute  R(Q^k)
  !!     form FD Jacobian  J = ∂R/∂Q |_{Q^k}  via column-wise perturbation
  !!     assemble  A = I - dt·J  (banded)
  !!     solve  A·ΔQ = dt·R(Q^k) - (Q^k - Q^n)
  !!     Q^{k+1} = Q^k + ΔQ
  !!     if max|ΔQ| < tol_newton: exit
  !!
  !! Parallel strategy: with a single rank the solve runs directly on the
  !! state.  With several ranks the interior solution is gathered to rank 0,
  !! rank 0 temporarily switches the state to a single-rank decomposition of
  !! the whole domain, solves there, switches back, and the result is
  !! scattered to every rank.  In both cases compute_resid_glob is called last
  !! and by every rank (global L2 residual norm for convergence monitoring).
  !!
  !! WARNING: each Newton step performs (neq·n_pt + 1) residual evaluations.
  !! This is expensive; use modest grid sizes (n_cell <= 200) when testing.
  !! The main benefit is unconditional linear stability, which permits CFL > 1.
  !!
  !! References:
  !!   LeVeque, "Finite Volume Methods for Hyperbolic Problems" (2002), Ch. 12.
  ! ---------------------------------------------------------------------------
  subroutine beuler_step(state)
    type(solver_state_t), intent(inout), target :: state

    type(local_layout_save_t) :: local_view
    real(wp), allocatable :: q_all(:, :)   ! (neq, n_global) on rank 0; (neq, 0) elsewhere
    integer :: n_cells_global, n_halo, stat_code
    logical :: periodic, on_root

    if (n_ranks() /= 1) then
      on_root = (my_rank() == 0)

      ! Parameters of the current decomposition, needed to build the
      ! single-rank one on the root.
      n_cells_global = state % decomp % n_global
      n_halo = state % decomp % halo_width
      periodic = state % decomp % is_periodic

      ! ---- (1) Gather Q^n on the root; other ranks pass an empty buffer ----
      allocate (q_all(neq, merge(n_cells_global, 0, on_root)), stat=stat_code)
      if (stat_code /= 0) error stop 'beuler_step: allocation failed (q_global)'

      call gather_field_to_root(state % ub(:, 1:state % n_pt), state % n_pt, &
                                state % decomp, q_all)

      ! ---- (2) Root only: solve on a temporary whole-domain layout ----
      if (on_root) then
        call save_local_layout(state, local_view)
        call release_work_arrays(state)
        state % decomp = decompose(0, 1, n_cells_global, n_halo, periodic)
        call allocate_work_arrays(state)

        ! Freshly allocated work arrays have zeroed halos.  For dirichlet /
        ! inflow BCs apply_bcs expects the halos to be preset and does not
        ! write them, so compute_resid would otherwise see zeros instead of
        ! boundary data.  apply_initial_condition re-seeds them (via
        ! seed_dirichlet_halos); for periodic / reflecting BCs the call is
        ! harmless because apply_bcs fills those halos itself.  The interior
        ! values it computes are discarded by the overwrite that follows.
        call apply_initial_condition(state, state % cfg)
        state % ub(:, 1:state % n_pt) = q_all

        call beuler_step_serial_body(state)

        ! Keep the global result before the global view is torn down.
        q_all = state % ub(:, 1:state % n_pt)

        call restore_local_layout(state, local_view)
      end if

      ! ---- (3) Scatter Q^{n+1} back to every rank ----
      call scatter_field_from_root(q_all, state % decomp, &
                                   state % ub(:, 1:state % n_pt), state % n_pt)

      deallocate (q_all, stat=stat_code)
      if (stat_code /= 0) error stop 'beuler_step: deallocation failed (q_global)'
    else
      ! Single-rank fast path: no gather / scatter overhead.
      call beuler_step_serial_body(state)
    end if

    ! ---- (4) Global L2 residual: every rank must call into this ----
    call compute_resid_glob(state)
  end subroutine beuler_step

  ! ---------------------------------------------------------------------------
  !> BDF2 Newton solve (bootstrap step included) on whatever layout @p state
  !! currently has.
  !!
  !! Works on state%ub / state%bdf2_ub_prev / state%resid at their current
  !! size.  The two cases differ only in the history level and the residual
  !! weight handed to run_newton:
  !!
  !!   * bootstrap (state%bdf2_initialized is false): Q_{n-1} := Q_n and the
  !!     residual weight is 1.  With the BDF2 history coefficients 4/3 and 1/3
  !!     this is a backward-Euler-equivalent step.  state%bdf2_ub_prev is
  !!     allocated if necessary and set to Q_n, and the flag is raised after
  !!     the solve.
  !!   * regular step: Q_{n-1} is taken from state%bdf2_ub_prev, which is then
  !!     overwritten with Q_n for the next call; the residual weight is 2/3.
  !!
  !! compute_resid_glob is NOT called here.  The wrapper calls it from every
  !! rank, after restore_local_layout.
  !!
  !! @param[inout] state  Solver state; ub, bdf2_ub_prev and bdf2_initialized
  !!                      are updated.
  ! ---------------------------------------------------------------------------
  subroutine bdf2_step_serial_body(state)
    type(solver_state_t), intent(inout) :: state

    integer :: n_unknowns, n_sub, n_super, band_rows, diag_idx
    integer :: stat_code
    logical :: bootstrapping
    real(wp) :: resid_weight
    real(wp), allocatable :: q_now(:, :), q_old(:, :)
    real(wp), allocatable :: r_ref(:), r_fd(:)
    real(wp), allocatable :: band(:, :)
    real(wp), allocatable :: newton_rhs(:), newton_dq(:)

    call setup_band_storage(state, n_unknowns, n_sub, n_super, band_rows, diag_idx)

    allocate (q_now(neq, state % n_pt), q_old(neq, state % n_pt), stat=stat_code)
    if (stat_code /= 0) error stop 'bdf2_step_serial_body: allocation failed (ub_n/ub_nm1)'
    allocate (r_ref(n_unknowns), r_fd(n_unknowns), stat=stat_code)
    if (stat_code /= 0) error stop 'bdf2_step_serial_body: allocation failed (resid)'
    allocate (band(band_rows, n_unknowns), newton_rhs(n_unknowns), newton_dq(n_unknowns), &
              stat=stat_code)
    if (stat_code /= 0) error stop 'bdf2_step_serial_body: allocation failed (ab/rhs/dq)'

    ! Decide once; the flag itself is only raised after the solve.
    bootstrapping = .not. state % bdf2_initialized

    if (bootstrapping) then
      if (.not. allocated(state % bdf2_ub_prev)) then
        allocate (state % bdf2_ub_prev(neq, state % n_pt), stat=stat_code)
        if (stat_code /= 0) error stop 'bdf2_step_serial_body: allocation failed (bdf2_ub_prev)'
      end if
      state % bdf2_ub_prev = state % ub(:, 1:state % n_pt)
      resid_weight = 1.0_wp

      ! No older level exists yet: use Q_n for both history slots.
      q_now = state % ub(:, 1:state % n_pt)
      q_old = q_now
    else
      resid_weight = 2.0_wp / 3.0_wp

      ! Read Q_{n-1} before the stored history is overwritten with Q_n.
      q_old = state % bdf2_ub_prev
      state % bdf2_ub_prev = state % ub(:, 1:state % n_pt)
      q_now = state % ub(:, 1:state % n_pt)
    end if

    call run_newton(state, q_now, q_old, resid_weight, 4.0_wp / 3.0_wp, 1.0_wp / 3.0_wp, &
                    'bdf2_step', n_unknowns, n_sub, n_super, band_rows, diag_idx, &
                    band, newton_rhs, newton_dq, r_ref, r_fd)

    if (bootstrapping) state % bdf2_initialized = .true.

    deallocate (q_now, q_old, r_ref, r_fd, band, newton_rhs, newton_dq, stat=stat_code)
    if (stat_code /= 0) then
      if (bootstrapping) error stop 'bdf2_step_serial_body: deallocation failed (bootstrap)'
      error stop 'bdf2_step_serial_body: deallocation failed'
    end if
  end subroutine bdf2_step_serial_body

  ! ---------------------------------------------------------------------------
  !> Advance the solution by one time step using the BDF2 (Gear) scheme.
  !!
  !! Second-order two-step implicit formula (Gear, 1971):
  !!
  !!   Q^{n+1} = 4/3 * Q^n - 1/3 * Q^{n-1} + (2/3) * dt * R(Q^{n+1})
  !!
  !! Solved via Newton-Raphson with the same banded-LU infrastructure as
  !! beuler_step().  LHS: A = I - (2/3)*dt*J.  RHS:
  !!   b = (2/3)*dt*R(Q^k) - (Q^k - 4/3*Q^n + 1/3*Q^{n-1})
  !!
  !! Bootstrap: the very first call performs one backward Euler step
  !! (A = I - dt*J, standard beuler RHS) to produce Q^1 from Q^0, storing Q^0
  !! in state%bdf2_ub_prev so that subsequent calls can use the BDF2 formula.
  !!
  !! At -np > 1: gather Q^n (and bdf2_ub_prev when past bootstrap) to rank 0,
  !! temporarily switch state into a single-rank decomp on rank 0, run
  !! bdf2_step_serial_body on the global vector, scatter Q^{n+1} and the
  !! updated bdf2_ub_prev back to per-rank slices, then call compute_resid_glob
  !! from every rank (so the par_sum_real inside it is a proper collective).
  !!
  !! References:
  !!   C.W. Gear, "Numerical Initial Value Problems in Ordinary Differential
  !!   Equations," Prentice-Hall, 1971.
  ! ---------------------------------------------------------------------------
  subroutine bdf2_step(state)
    ! Advance `state` by one BDF2 step: directly through
    ! bdf2_step_serial_body when n_ranks() == 1, otherwise through a
    ! gather -> rank-0 global solve -> scatter round trip.
    !
    ! state : solver state, updated in place. This routine itself writes
    !         state%ub(:, 1:n_pt), state%bdf2_ub_prev and
    !         state%bdf2_initialized; on rank 0 it additionally replaces
    !         state%decomp and the work arrays for the duration of the global
    !         solve. compute_resid_glob(state) is the last call on every path.
    !
    ! Any allocation or deallocation failure terminates with `error stop`.
    type(solver_state_t), intent(inout), target :: state

    ! Snapshot filled by save_local_layout and consumed by restore_local_layout
    ! (rank 0 only).
    type(local_layout_save_t) :: snap
    real(wp), allocatable :: q_global(:, :)           ! (neq, n_global) on rank 0
    real(wp), allocatable :: q_prev_global(:, :)      ! (neq, n_global) on rank 0
    ! n_global / halo_w / is_periodic are copied out of state%decomp up front
    ! because rank 0 overwrites state%decomp below; info is the allocate /
    ! deallocate status.
    integer :: n_global, halo_w, info
    logical :: is_periodic, gather_prev

    ! Single rank: the local arrays already are the global problem, so no
    ! gather/scatter is needed.
    if (n_ranks() == 1) then
      call bdf2_step_serial_body(state)
      call compute_resid_glob(state)
      return
    end if

    n_global = state % decomp % n_global
    halo_w = state % decomp % halo_width
    is_periodic = state % decomp % is_periodic
    ! A previous solution is available only when the flag is set AND the buffer
    ! exists; otherwise this call takes the bootstrap path.
    ! NOTE(review): the second gather below is entered only when gather_prev is
    ! true, so every rank presumably has to agree on its value — confirm.
    gather_prev = state % bdf2_initialized .and. allocated(state % bdf2_ub_prev)

    ! ---- (1) Gather Q^n and (if past bootstrap) Q^{n-1} to rank 0 ----
    ! Only rank 0 gets a full-size buffer; the other ranks pass a zero-width
    ! array.
    if (my_rank() == 0) then
      allocate (q_global(neq, n_global), stat=info)
    else
      allocate (q_global(neq, 0), stat=info)
    end if
    if (info /= 0) error stop 'bdf2_step: allocation failed (q_global)'

    call gather_field_to_root(state % ub(:, 1:state % n_pt), state % n_pt, &
                              state % decomp, q_global)

    if (gather_prev) then
      if (my_rank() == 0) then
        allocate (q_prev_global(neq, n_global), stat=info)
      else
        allocate (q_prev_global(neq, 0), stat=info)
      end if
      if (info /= 0) error stop 'bdf2_step: allocation failed (q_prev_global)'
      call gather_field_to_root(state % bdf2_ub_prev, state % n_pt, &
                                state % decomp, q_prev_global)
    end if

    ! ---- (2) On rank 0: swap into single-rank decomp and run the body ----
    if (my_rank() == 0) then
      call save_local_layout(state, snap)
      call release_work_arrays(state)
      ! Rebuild the decomposition as rank 0 of 1, covering all n_global points.
      state % decomp = decompose(0, 1, n_global, halo_w, is_periodic)
      call allocate_work_arrays(state)
      ! Re-seed boundary halos for the new global decomp; see beuler_step
      ! for the rationale (dirichlet apply_bcs is no_write, so halos need
      ! seed_dirichlet_halos to fire via apply_initial_condition).
      call apply_initial_condition(state, state % cfg)
      ! Replace columns 1..n_pt of the freshly initialised field with the
      ! gathered Q^n (assumes n_pt == n_global in the single-rank decomp —
      ! TODO confirm).
      state % ub(:, 1:state % n_pt) = q_global

      if (gather_prev) then
        ! Pre-populate the global bdf2_ub_prev so the serial body's
        ! "past-bootstrap" branch sees the correct Q^{n-1}.
        ! NOTE(review): gather_prev implies bdf2_ub_prev was allocated on
        ! entry, and an allocate with stat= on an already-allocated array
        ! reports an error. Presumably save_local_layout or
        ! release_work_arrays moves or frees it first — confirm.
        allocate (state % bdf2_ub_prev(neq, state % n_pt), stat=info)
        if (info /= 0) error stop 'bdf2_step: allocation failed (rank-0 bdf2_ub_prev)'
        state % bdf2_ub_prev = q_prev_global
        state % bdf2_initialized = .true.
      else
        ! Bootstrap path: bdf2_ub_prev does not yet exist.
        state % bdf2_initialized = .false.
      end if

      call bdf2_step_serial_body(state)

      ! Capture the global Q^{n+1} and the (now-advanced) global Q^n
      ! before tearing down the global view.
      q_global = state % ub(:, 1:state % n_pt)
      if (.not. allocated(q_prev_global)) then
        ! Bootstrap path produced bdf2_ub_prev for the first time.
        allocate (q_prev_global(neq, n_global), stat=info)
        if (info /= 0) error stop 'bdf2_step: allocation failed (bootstrap q_prev_global)'
      end if
      q_prev_global = state % bdf2_ub_prev

      ! Return rank 0 to its per-rank view (decomp and arrays are expected to
      ! come back from `snap`).
      call restore_local_layout(state, snap)
    end if

    ! Non-rank-0 ranks must also allocate a receive buffer for the
    ! bdf2_ub_prev scatter on the bootstrap path (where gather_prev was
    ! .false. and they never allocated q_prev_global above).
    if (.not. allocated(q_prev_global)) then
      allocate (q_prev_global(neq, 0), stat=info)
      if (info /= 0) error stop 'bdf2_step: allocation failed (non-rank-0 q_prev_global)'
    end if

    ! ---- (3) Scatter Q^{n+1} and the new previous-step solution (Q^n) ----
    ! ----     back to every rank                                       ----
    call scatter_field_from_root(q_global, state % decomp, &
                                 state % ub(:, 1:state % n_pt), state % n_pt)

    ! The local history buffer may not exist yet (first call), so create it
    ! before it is used as the scatter destination.
    if (.not. allocated(state % bdf2_ub_prev)) then
      allocate (state % bdf2_ub_prev(neq, state % n_pt), stat=info)
      if (info /= 0) error stop 'bdf2_step: allocation failed (local bdf2_ub_prev)'
    end if
    call scatter_field_from_root(q_prev_global, state % decomp, &
                                 state % bdf2_ub_prev, state % n_pt)
    ! Every rank now owns a previous-step solution, so gather_prev will be
    ! true on the next call.
    state % bdf2_initialized = .true.

    deallocate (q_global, q_prev_global, stat=info)
    if (info /= 0) error stop 'bdf2_step: deallocation failed'

    ! ---- (4) Global L2 residual: every rank must call into this ----
    call compute_resid_glob(state)
  end subroutine bdf2_step

  ! ---------------------------------------------------------------------------
  !> Flatten a (neq x n_pt) field into a length-neq*n_pt 1-D array.
  !! Ordering: (eq=1,cell=1), (eq=2,cell=1), ..., (eq=neq,cell=n_pt).
  pure subroutine pack_field(src, dst, nq, np)
    ! Copy the 2-D field `src` into the 1-D array `dst`, one point at a time:
    ! the nq values of point 1 come first, then those of point 2, and so on.
    !
    ! src : field to flatten, indexed src(equation, point)
    ! dst : result, dst((point - 1)*nq + equation) = src(equation, point)
    ! nq  : extent of the first dimension of src (values per point)
    ! np  : extent of the second dimension of src (number of points)
    integer, intent(in) :: nq, np
    real(wp), intent(in) :: src(nq, np)
    real(wp), intent(out) :: dst(nq * np)
    integer :: point, offset

    do point = 1, np
      ! Position in dst just before the first value of this point.
      offset = (point - 1) * nq
      dst(offset + 1:offset + nq) = src(:, point)
    end do
  end subroutine pack_field

  ! ---------------------------------------------------------------------------
  !> Add a flattened 1-D correction to a (neq x n_pt) 2-D field in place.
  pure subroutine unpack_add(src, dst, nq, np)
    ! Add the flattened correction `src` onto the 2-D field `dst` in place.
    ! `src` uses the same ordering that pack_field produces: the nq values
    ! of point 1 first, then those of point 2, and so on.
    !
    ! Declared pure (it touches nothing but its arguments), matching the
    ! sibling helpers pack_field and jac_store_col.
    !
    ! src : flattened correction, src((point - 1)*nq + equation)
    ! dst : field updated in place, dst(equation, point) += matching src entry
    ! nq  : extent of the first dimension of dst (values per point)
    ! np  : extent of the second dimension of dst (number of points)
    integer, intent(in) :: nq, np
    real(wp), intent(in) :: src(nq * np)
    real(wp), intent(inout) :: dst(nq, np)
    integer :: ipt, base

    do ipt = 1, np
      ! Position in src just before the first value of this point.
      base = (ipt - 1) * nq
      dst(:, ipt) = dst(:, ipt) + src(base + 1:base + nq)
    end do
  end subroutine unpack_add

  ! ---------------------------------------------------------------------------
  !> Store one Jacobian column into band storage AB(diag_row_loc+i-j, j) = J(i,j).
  !! Works for both the custom (diag_row_loc = ku+1) and LAPACK
  !! (diag_row_loc = kl+ku+1) storage layouts.
  pure subroutine jac_store_col(ab_loc, j_col, col, kl_loc, ku_loc, diag_row_loc, n)
    ! Copy the in-band part of one full Jacobian column into band storage.
    !
    ! ab_loc       : band matrix; entry (i, j_col) is written to
    !                ab_loc(diag_row_loc + i - j_col, j_col)
    ! j_col        : index of the column being stored
    ! col          : the full column, col(i) = J(i, j_col), length n
    ! kl_loc       : number of sub-diagonals kept
    ! ku_loc       : number of super-diagonals kept
    ! diag_row_loc : band row that holds the main diagonal
    ! n            : length of col
    !
    ! Entries of col outside rows j_col-ku_loc .. j_col+kl_loc are ignored.
    integer, intent(in) :: j_col, kl_loc, ku_loc, diag_row_loc, n
    real(wp), intent(in) :: col(n)
    real(wp), intent(inout) :: ab_loc(:, :)
    integer :: row_first, row_last, shift

    ! Rows of this column that lie inside the band, clipped to 1..n.
    row_first = max(1, j_col - ku_loc)
    row_last = min(n, j_col + kl_loc)
    ! Nothing to store: leave ab_loc untouched without evaluating any subscript.
    if (row_first > row_last) return

    ! Constant offset from a matrix row index to its band row in this column.
    shift = diag_row_loc - j_col
    ab_loc(row_first + shift:row_last + shift, j_col) = col(row_first:row_last)
  end subroutine jac_store_col

  ! ---------------------------------------------------------------------------
  !> Banded Gaussian elimination without pivoting.
  !! Band storage: AB(ku+1+i-j, j) = A(i,j).
  !! On exit x = A^{-1} b.  The matrix ab_loc is overwritten with LU factors.
  subroutine band_lu_solve(ab_loc, n, kl_loc, ku_loc, b, x)
    ! Solve A x = b for a banded matrix by Gaussian elimination without row
    ! interchanges, working directly on the band storage.
    !
    ! ab_loc : band matrix, A(i, j) stored at ab_loc(ku_loc + 1 + i - j, j);
    !          overwritten with the multipliers (below the diagonal) and the
    !          upper-triangular factor (diagonal and above)
    ! n      : order of the system
    ! kl_loc : number of sub-diagonals
    ! ku_loc : number of super-diagonals
    ! b      : right-hand side (left unchanged)
    ! x      : solution on return
    !
    ! A pivot smaller in magnitude than tiny(1.0_wp) is reported through
    ! parallel_fatal during back substitution.
    integer, intent(in) :: n, kl_loc, ku_loc
    real(wp), intent(inout) :: ab_loc(kl_loc + ku_loc + 1, n)
    real(wp), intent(in) :: b(n)
    real(wp), intent(out) :: x(n)

    integer :: piv, row, col
    integer :: d                 ! band row that holds the main diagonal
    integer :: row_hi, col_hi    ! last row / column touched by the current pivot
    integer :: row_lo            ! first row updated during back substitution
    real(wp) :: mult

    d = ku_loc + 1
    x = b

    ! --- Elimination: factorise in place and apply the same row operations
    ! --- to the right-hand side held in x.
    eliminate: do piv = 1, n
      ! The pivot entry is not modified while its own column is processed, so
      ! one test per column is enough. An unusable pivot leaves the column as
      ! it is; the failure is raised in the back-substitution sweep below.
      if (abs(ab_loc(d, piv)) < tiny(1.0_wp)) cycle eliminate

      row_hi = min(n, piv + kl_loc)
      col_hi = min(n, piv + ku_loc)
      do row = piv + 1, row_hi
        mult = ab_loc(d + row - piv, piv) / ab_loc(d, piv)
        ab_loc(d + row - piv, piv) = mult   ! keep the multiplier in place of the zeroed entry
        do col = piv + 1, col_hi
          ab_loc(d + row - col, col) = ab_loc(d + row - col, col) &
              & - mult * ab_loc(d + piv - col, col)
        end do
        x(row) = x(row) - mult * x(piv)
      end do
    end do eliminate

    ! --- Back substitution, column by column from the last unknown upward.
    back_substitute: do piv = n, 1, -1
      if (abs(ab_loc(d, piv)) < tiny(1.0_wp)) then
        ! An unusable pivot means the matrix is singular for this algorithm.
        ! Abort instead of returning a wrong correction; the LAPACK path
        ! likewise stops on a non-zero info. Only the built-in solver
        ! (lapack_solver = .false.) can reach this point.
        call parallel_fatal('band_lu_solve: singular pivot (matrix singular); set lapack_solver=.true. or check the Jacobian')
      else
        x(piv) = x(piv) / ab_loc(d, piv)
      end if

      ! Remove the contribution of the just-solved unknown from the rows above
      ! it that lie inside the band (matrix rows row_lo .. piv-1).
      row_lo = max(1, piv - ku_loc)
      x(row_lo:piv - 1) = x(row_lo:piv - 1) &
          & - ab_loc(d + row_lo - piv:d - 1, piv) * x(piv)
    end do back_substitute
  end subroutine band_lu_solve

  ! ---------------------------------------------------------------------------
  !> Accumulate the global L2 norm of the residual into state%resid_glob.
  !!
  !! resid_glob = sqrt( sum_{i,k} resid(i,k)^2 / (n_pt_global * neq) )
  !!
  !! The denominator uses n_pt_global (the total number of interior points
  !! across all MPI ranks) so that the norm is independent of decomposition.
  !!
  !! Called at the end of every time step for convergence monitoring.
  ! ---------------------------------------------------------------------------
  subroutine compute_resid_glob(state)
    ! Reduce state%resid to a single scalar and store it in state%resid_glob:
    !
    !   resid_glob = sqrt( (sum of resid(i, ipt)**2 over all ranks)
    !                      / (n_pt_global * neq) )
    !
    ! state : reads resid(1:neq, 1:n_pt), n_pt and n_pt_global; writes
    !         resid_glob.
    !
    ! The cross-rank sum goes through par_sum_real, so this routine has to be
    ! entered wherever that reduction requires it.
    type(solver_state_t), intent(inout) :: state
    real(wp) :: sumsq_here, sumsq_all, n_samples
    integer :: ieq, jpt

    ! Sum of squares over the entries this rank holds (columns 1..n_pt).
    sumsq_here = 0.0_wp
    do jpt = 1, state % n_pt
      do ieq = 1, neq
        sumsq_here = sumsq_here + state % resid(ieq, jpt)**2
      end do
    end do

    sumsq_all = par_sum_real(sumsq_here)

    ! Normalise by the global sample count, not the local one.
    n_samples = real(state % n_pt_global, wp) * real(neq, wp)
    state % resid_glob = sqrt(sumsq_all / n_samples)
  end subroutine compute_resid_glob

end module time_integration
time_integration.f90 Source File

Contents

Modules

Source Code

Source Code