diff --git a/LIB/EQUATION/ACMnew/module_ACM.f90 b/LIB/EQUATION/ACMnew/module_ACM.f90
index 65c076ac..3325c993 100644
--- a/LIB/EQUATION/ACMnew/module_ACM.f90
+++ b/LIB/EQUATION/ACMnew/module_ACM.f90
@@ -83,7 +83,7 @@ module module_acm
     real(kind=rk) :: mask_volume=0.0_rk, meanflow_channel(1:3) = 0.0_rk
 
     logical :: use_passive_scalar = .false.
-    integer(kind=ik) :: N_scalars = 0, nsave_stats = 999999
+    integer(kind=ik) :: N_scalars = 0
     real(kind=rk), allocatable :: schmidt_numbers(:), x0source(:), y0source(:), &
     z0source(:), scalar_Ceta(:), widthsource(:)
     character(len=cshort), allocatable :: scalar_inicond(:), scalar_source_type(:)
@@ -192,9 +192,9 @@ subroutine READ_PARAMETERS_ACM( filename, N_mask_components, g )
 
     character(len=*), intent(in) :: filename
     integer(kind=ik) :: mpicode, nx_max, n_entries
-    real(kind=rk) :: dx_min, dt_min
+    real(kind=rk) :: dx_min, dt_min_c0, dt_min_vpm, dt_min_nu
     character(len=cshort) :: Bs_str, Bs_conc
-    character(len=16834) :: input_files
+    character(len=400) :: input_files
     character(len=12) :: timestamp
     character(:), allocatable :: Bs_short
     real(kind=rk), dimension(3) :: ddx
@@ -348,7 +348,6 @@ subroutine READ_PARAMETERS_ACM( filename, N_mask_components, g )
 
     call read_param_mpi(FILE, 'Time', 'time_max', params_acm%T_end, 1.0_rk   )
 
-    call read_param_mpi(FILE, 'Statistics', 'nsave_stats', params_acm%nsave_stats, 999999   )
     call read_param_mpi(FILE, 'FreeFlightSolver', 'use_free_flight_solver', params_acm%use_free_flight_solver, .false.   )
 
     call read_param_mpi(FILE, 'Blocks', 'max_treelevel', params_acm%Jmax, 1   )
@@ -428,11 +427,23 @@ subroutine READ_PARAMETERS_ACM( filename, N_mask_components, g )
     ! uniqueGrid modification
     nx_max = maxval( (params_acm%Bs) * 2**(params_acm%Jmax) )
 
+    ! print some time-step related information
     if (params_acm%c_0 > 0.0_rk) then
-        dt_min = params_acm%CFL*dx_min/params_acm%c_0
+        dt_min_c0 = params_acm%CFL*dx_min/params_acm%c_0
     else
-        dt_min = 0.0_rk
+        dt_min_c0 = 0.0_rk
     endif
+    if (params_acm%nu > 0.0_rk) then
+        dt_min_nu = params_acm%CFL_nu*dx_min**2/params_acm%nu
+    else
+        dt_min_nu = 0.0_rk
+    endif
+    if (params_acm%penalization .and. params_acm%C_eta > 0.0_rk) then
+        dt_min_vpm = params_acm%CFL_eta*params_acm%C_eta
+    else
+        dt_min_vpm = 0.0_rk
+    endif
+
     ! nice to have this elsewhere in the ACM module:
     params_acm%dx_min = dx_min
 
@@ -442,15 +453,17 @@ subroutine READ_PARAMETERS_ACM( filename, N_mask_components, g )
 
     if (params_acm%mpirank==0) then
       write(*,'(80("<"))')
-      write(*,*) "Some information:"
-      write(*,'("c0=",g12.4," C_eta=",g12.4," CFL=",g12.4)') params_acm%c_0, params_acm%C_eta, params_acm%CFL
-      write(*,'("dx_min=",g12.4," dt(CFL,c0,dx_min)=",g12.4)') dx_min, dt_min
-      write(*,'("if all blocks were at Jmax, the resolution would be nx=",i5)') nx_max
+      write(*,'(A)') "Some information:"
+      write(*,'("   c0=",g12.4," CFL=",g12.4, "CFL_eta=",g12.4, "CFL_nu=",g12.4)') params_acm%c_0, params_acm%CFL, params_acm%CFL_eta, params_acm%CFL_nu
+      write(*,'("   dx_min=",g12.4)') dx_min
+      write(*,'("   dt(CFL,c0,dx_min)=",g12.4)') dt_min_c0
+      write(*,'("   dt(CFL_nu,nu,dx_min)=",g12.4)') dt_min_nu
+      write(*,'("   dt(CFL_vpm,C_eta,dx_min)=",g12.4)') dt_min_vpm
+      write(*,'("   if all blocks were at Jmax, the resolution would be nx=",i5)') nx_max
       if (params_acm%penalization) then
-          write(*,'("C_eta=",es12.4," K_eta=",es12.4)') params_acm%C_eta, sqrt(params_acm%C_eta*params_acm%nu)/dx_min
+          write(*,'("   C_eta=",es12.4," K_eta=",es12.4)') params_acm%C_eta, sqrt(params_acm%C_eta*params_acm%nu)/dx_min
       endif
-      write(*,'("N_mask_components=",i1)') N_mask_components
-      write(*,'("N_scalars=",i2)') params_acm%N_scalars
+      write(*,'("N_mask_components=",i1, " N_scalars=",i1, " N_time_statistics=",i1)') N_mask_components, params_acm%N_scalars, params_acm%N_time_statistics
       write(*,'(80("<"))')
     endif
 
diff --git a/LIB/EQUATION/ACMnew/rhs_ACM.f90 b/LIB/EQUATION/ACMnew/rhs_ACM.f90
index 7fb29c4a..6b62572b 100644
--- a/LIB/EQUATION/ACMnew/rhs_ACM.f90
+++ b/LIB/EQUATION/ACMnew/rhs_ACM.f90
@@ -817,6 +817,7 @@ end subroutine RHS_2D_acm
 
 
 subroutine RHS_3D_acm(g, Bs, dx, x0, phi, order_discretization, time, rhs, mask, n_domain)
+    use module_operators
     implicit none
 
     !> grid parameter
@@ -875,6 +876,9 @@ subroutine RHS_3D_acm(g, Bs, dx, x0, phi, order_discretization, time, rhs, mask,
     real(kind=rk), parameter :: a_FD6(-3:3) = (/-1.0_rk/60.0_rk, 3.0_rk/20.0_rk, -3.0_rk/4.0_rk, 0.0_rk, 3.0_rk/4.0_rk, -3.0_rk/20.0_rk, 1.0_rk/60.0_rk/) ! 1st derivative
     real(kind=rk), parameter :: b_FD6(-3:3) = (/ 1.0_rk/90.0_rk, -3.0_rk/20.0_rk, 3.0_rk/2.0_rk, -49.0_rk/18.0_rk, 3.0_rk/2.0_rk, -3.0_rk/20.0_rk, 1.0_rk/90.0_rk/) ! 2nd derivative
 
+    real(kind=rk), allocatable, dimension(:) :: FD1_l, FD2
+    integer(kind=ik) :: FD1_ls, FD1_le, FD2_s, FD2_e
+
 
     ! set parameters for readability
     c_0         = params_acm%c_0
@@ -1453,7 +1457,131 @@ subroutine RHS_3D_acm(g, Bs, dx, x0, phi, order_discretization, time, rhs, mask,
         endif
 
     case default
-        call abort(441167, "3d Discretization unkown "//order_discretization//", I ll walk into the light now." )
+        ! call abort(44167, "3d Discretization unknown "// trim(order_discretization)//", I'll walk into the light now.")
+
+        ! Flexible operator version for 3D ACM using generalized stencils
+
+        call setup_FD1_left_stencil(order_discretization, FD1_l, FD1_ls, FD1_le)
+        call setup_FD2_stencil(order_discretization, FD2, FD2_s, FD2_e)
+
+        if (params_acm%skew_symmetry) then
+            do iz = g+1, Bs(3)+g
+                do iy = g+1, Bs(2)+g
+                    do ix = g+1, Bs(1)+g
+                        u = phi(ix,iy,iz,1)
+                        v = phi(ix,iy,iz,2)
+                        w = phi(ix,iy,iz,3)
+                        p = phi(ix,iy,iz,4)
+
+                        chi = mask(ix,iy,iz,1) * C_eta_inv
+                        penalx = -chi * (u - mask(ix,iy,iz,2))
+                        penaly = -chi * (v - mask(ix,iy,iz,3))
+                        penalz = -chi * (w - mask(ix,iy,iz,4))
+
+                        ! Generalized first derivatives
+                        u_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)) * dx_inv
+                        u_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,1)) * dy_inv
+                        u_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,1)) * dz_inv
+
+                        v_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,2)) * dx_inv
+                        v_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)) * dy_inv
+                        v_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,2)) * dz_inv
+
+                        w_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,3)) * dx_inv
+                        w_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,3)) * dy_inv
+                        w_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)) * dz_inv
+
+                        p_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,4)) * dx_inv
+                        p_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,4)) * dy_inv
+                        p_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,4)) * dz_inv
+
+                        ! Generalized nonlinear terms
+                        uu_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)*phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)) * dx_inv
+                        uv_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,1)*phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)) * dy_inv
+                        uw_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,1)*phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)) * dz_inv
+
+                        vu_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,2)*phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)) * dx_inv
+                        vv_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)*phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)) * dy_inv
+                        vw_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,2)*phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)) * dz_inv
+
+                        wu_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,3)*phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)) * dx_inv
+                        wv_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,3)*phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)) * dy_inv
+                        ww_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)*phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)) * dz_inv
+
+                        ! Generalized second derivatives
+                        u_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,1)) * dx2_inv
+                        u_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,1)) * dy2_inv
+                        u_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,1)) * dz2_inv
+
+                        v_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,2)) * dx2_inv
+                        v_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,2)) * dy2_inv
+                        v_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,2)) * dz2_inv
+
+                        w_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,3)) * dx2_inv
+                        w_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,3)) * dy2_inv
+                        w_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,3)) * dz2_inv
+
+                        ! Skew-symmetric formulation
+                        rhs(ix,iy,iz,1) = -0.5_rk*(uu_dx + uv_dy + uw_dz   + u*u_dx + v*u_dy + w*u_dz) -p_dx + nu*(u_dxdx + u_dydy + u_dzdz) + penalx
+                        rhs(ix,iy,iz,2) = -0.5_rk*(vu_dx + vv_dy + vw_dz   + u*v_dx + v*v_dy + w*v_dz) -p_dy + nu*(v_dxdx + v_dydy + v_dzdz) + penaly
+                        rhs(ix,iy,iz,3) = -0.5_rk*(wu_dx + wv_dy + ww_dz   + u*w_dx + v*w_dy + w*w_dz) -p_dz + nu*(w_dxdx + w_dydy + w_dzdz) + penalz
+                        rhs(ix,iy,iz,4) = -(c_0**2)*(u_dx + v_dy + w_dz) - gamma*p
+                    end do
+                end do
+            end do
+        else
+            do iz = g+1, Bs(3)+g
+                do iy = g+1, Bs(2)+g
+                    do ix = g+1, Bs(1)+g
+                        u = phi(ix,iy,iz,1)
+                        v = phi(ix,iy,iz,2)
+                        w = phi(ix,iy,iz,3)
+                        p = phi(ix,iy,iz,4)
+
+                        chi = mask(ix,iy,iz,1) * C_eta_inv
+                        penalx = -chi * (u - mask(ix,iy,iz,2))
+                        penaly = -chi * (v - mask(ix,iy,iz,3))
+                        penalz = -chi * (w - mask(ix,iy,iz,4))
+
+                        ! Generalized first derivatives
+                        u_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,1)) * dx_inv
+                        u_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,1)) * dy_inv
+                        u_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,1)) * dz_inv
+
+                        v_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,2)) * dx_inv
+                        v_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,2)) * dy_inv
+                        v_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,2)) * dz_inv
+
+                        w_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,3)) * dx_inv
+                        w_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,3)) * dy_inv
+                        w_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,3)) * dz_inv
+
+                        p_dx = sum(FD1_l(FD1_ls:FD1_le) * phi(ix+FD1_ls:ix+FD1_le,iy,iz,4)) * dx_inv
+                        p_dy = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy+FD1_ls:iy+FD1_le,iz,4)) * dy_inv
+                        p_dz = sum(FD1_l(FD1_ls:FD1_le) * phi(ix,iy,iz+FD1_ls:iz+FD1_le,4)) * dz_inv
+
+                        ! Generalized second derivatives
+                        u_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,1)) * dx2_inv
+                        u_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,1)) * dy2_inv
+                        u_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,1)) * dz2_inv
+
+                        v_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,2)) * dx2_inv
+                        v_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,2)) * dy2_inv
+                        v_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,2)) * dz2_inv
+
+                        w_dxdx = sum(FD2(FD2_s:FD2_e) * phi(ix+FD2_s:ix+FD2_e,iy,iz,3)) * dx2_inv
+                        w_dydy = sum(FD2(FD2_s:FD2_e) * phi(ix,iy+FD2_s:iy+FD2_e,iz,3)) * dy2_inv
+                        w_dzdz = sum(FD2(FD2_s:FD2_e) * phi(ix,iy,iz+FD2_s:iz+FD2_e,3)) * dz2_inv
+
+                        ! Standard formulation
+                        rhs(ix,iy,iz,1) = (-u*u_dx - v*u_dy - w*u_dz) -p_dx + nu*(u_dxdx + u_dydy + u_dzdz) + penalx
+                        rhs(ix,iy,iz,2) = (-u*v_dx - v*v_dy - w*v_dz) -p_dy + nu*(v_dxdx + v_dydy + v_dzdz) + penaly
+                        rhs(ix,iy,iz,3) = (-u*w_dx - v*w_dy - w*w_dz) -p_dz + nu*(w_dxdx + w_dydy + w_dzdz) + penalz
+                        rhs(ix,iy,iz,4) = -(c_0**2)*(u_dx + v_dy + w_dz) - gamma*p
+                    end do
+                end do
+            end do
+        endif
 
     end select
 
@@ -1483,7 +1611,6 @@ subroutine RHS_3D_acm(g, Bs, dx, x0, phi, order_discretization, time, rhs, mask,
 
     ! --------------------------------------------------------------------------
     ! HIT linear forcing
-    ! ATTENTION! This is at last position because I modify phi to avoid a 3-nested do-loop to subtract mean-flow
     ! --------------------------------------------------------------------------
     if (params_acm%HIT_linear_forcing) then
         G_gain = params_acm%HIT_gain
diff --git a/LIB/EQUATION/ACMnew/sponge.f90 b/LIB/EQUATION/ACMnew/sponge.f90
index 44c00aac..3604c7fb 100644
--- a/LIB/EQUATION/ACMnew/sponge.f90
+++ b/LIB/EQUATION/ACMnew/sponge.f90
@@ -78,22 +78,30 @@ subroutine sponge_3D(sponge, x0, dx, Bs, g)
 
 
     if (params_acm%sponge_type == "rect") then
-        ! rectangular sponge with 45deg edges
-        do iz = g+1, Bs(3)+g
-            z = dble(iz-(g+1)) * dx(3) + x0(3)
-            do iy = g+1, Bs(2)+g
-                y = dble(iy-(g+1)) * dx(2) + x0(2)
-                do ix = g+1, Bs(1)+g
-                    x = dble(ix-(g+1)) * dx(1) + x0(1)
-
-                    ! distance to borders of domain
-                    tmp = minval( (/x,y,z,-(x-params_acm%domain_size(1)),&
-                         -(y-params_acm%domain_size(2)),-(z-params_acm%domain_size(3))/) )
 
-                    sponge(ix,iy,iz) = smoothstep( tmp, 0.5_rk*params_acm%L_sponge, 0.5_rk*params_acm%L_sponge)
+        ! check if this block is in sponge layer
+        ! find the point in the block closest to the domain boundary
+        tmp = min(minval(x0), minval(params_acm%domain_size - (x0 + dx*dble(Bs))))
+        if (tmp > params_acm%L_sponge) then
+            sponge = 0.0_rk
+        else
+            ! rectangular sponge with 45deg edges
+            do iz = g+1, Bs(3)+g
+                z = dble(iz-(g+1)) * dx(3) + x0(3)
+                do iy = g+1, Bs(2)+g
+                    y = dble(iy-(g+1)) * dx(2) + x0(2)
+                    do ix = g+1, Bs(1)+g
+                        x = dble(ix-(g+1)) * dx(1) + x0(1)
+
+                        ! distance to borders of domain
+                        tmp = minval( (/x,y,z,-(x-params_acm%domain_size(1)),&
+                            -(y-params_acm%domain_size(2)),-(z-params_acm%domain_size(3))/) )
+
+                        sponge(ix,iy,iz) = smoothstep( tmp, 0.5_rk*params_acm%L_sponge, 0.5_rk*params_acm%L_sponge)
+                    end do
                 end do
             end do
-        end do
+        endif
 
         ! sponge for using with symmetry_BC
         ! insect is supposed to be at y=0
@@ -149,6 +157,7 @@ subroutine sponge_3D(sponge, x0, dx, Bs, g)
         ! p-norm sponge. The shape of the sponge is dictated as the p-norm
         ! https://de.wikipedia.org/wiki/P-Norm
         ! which is a nice and simple way to get a rectangle with round corners.
+        ! For 2 it is the eucledian norm (circle), for infinity it is a rectangle and for everything in between a rounded rectangle
 
         if ( maxval(abs(params_acm%domain_size-params_acm%domain_size(1))) > 1.0e-10_rk) then
             call abort(1610184,"ERROR: for the p-norm sponge, the domain has to be same size in all directions.")
@@ -158,21 +167,31 @@ subroutine sponge_3D(sponge, x0, dx, Bs, g)
         pinv = 1.0_rk / p
         offset = 0.5_rk * params_acm%domain_size(1)
 
-        do iz = g+1, Bs(3)+g
-            z = (dble(iz-(g+1)) * dx(3) + x0(3) - offset)**p
-            do iy = g+1, Bs(2)+g
-                y = (dble(iy-(g+1)) * dx(2) + x0(2) - offset)**p
-                do ix = g+1, Bs(1)+g
-                    x = (dble(ix-(g+1)) * dx(1) + x0(1) - offset)**p
-
-                    ! distance to borders of domain
-                    tmp = -( (x + y + z)**pinv - offset)
-
-                    sponge(ix,iy,iz) = smoothstep( tmp, 0.5_rk*params_acm%L_sponge, &
-                    0.5_rk*params_acm%L_sponge)
+        ! check if this block is in sponge layer
+        ! Find the point in the block furthest from the domain center
+        x = max(abs(x0(1)-offset), abs(x0(1)+dx(1)*dble(Bs(1))-offset))**p
+        y = max(abs(x0(2)-offset), abs(x0(2)+dx(2)*dble(Bs(2))-offset))**p
+        z = max(abs(x0(3)-offset), abs(x0(3)+dx(3)*dble(Bs(3))-offset))**p
+        tmp = offset - (x + y + z)**pinv  ! we invert distance to center to distance from boundary
+        if (tmp > params_acm%L_sponge) then
+            sponge = 0.0_rk
+        else
+            do iz = g+1, Bs(3)+g
+                z = (dble(iz-(g+1)) * dx(3) + x0(3) - offset)**p
+                do iy = g+1, Bs(2)+g
+                    y = (dble(iy-(g+1)) * dx(2) + x0(2) - offset)**p
+                    do ix = g+1, Bs(1)+g
+                        x = (dble(ix-(g+1)) * dx(1) + x0(1) - offset)**p
+
+                        ! distance to borders of domain
+                        tmp = -( (x + y + z)**pinv - offset)
+
+                        sponge(ix,iy,iz) = smoothstep( tmp, 0.5_rk*params_acm%L_sponge, &
+                        0.5_rk*params_acm%L_sponge)
+                    end do
                 end do
             end do
-        end do
+        endif
 
     elseif (params_acm%sponge_type == "p-norm-insect-centered") then
         ! p-norm sponge. The shape of the sponge is dictated as the p-norm
diff --git a/LIB/EQUATION/ACMnew/time_statistics_ACM.f90 b/LIB/EQUATION/ACMnew/time_statistics_ACM.f90
index 5c566ecd..1af04ba7 100644
--- a/LIB/EQUATION/ACMnew/time_statistics_ACM.f90
+++ b/LIB/EQUATION/ACMnew/time_statistics_ACM.f90
@@ -34,7 +34,7 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
     ! non-ghost point has the coordinate x0, from then on its just cartesian with dx spacing
     real(kind=rk), intent(in) :: x0(1:3), dx(1:3)
 
-    integer(kind=ik) :: iB, ix, iy, iz, Bs(3), i_ts, N_offset
+    integer(kind=ik) :: iB, ix, iy, iz, Bs(3), i_ts, N_offset, mean_idx1, mean_idx2, j
     real(kind=rk) :: time_diff
 
     ! compute the size of blocks
@@ -45,13 +45,12 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
     N_offset = params_acm%dim + 1 + params_acm%N_scalars
     time_diff = time-time_start
 
-    do i_ts = 1, params_acm%N_time_statistics
-        ! at the beginning of the simulation, initialize the array to zero
-        if ( abs(time - time_start) < 1.0e-12_rk ) then
-            u(:,:,:,N_offset + i_ts) = 0.0_rk
-            cycle
-        end if
+    ! if time is right at time_start, we do nothing, we already set the values to zero or load them in earlier
+    if ( time - time_start < 1.0e-12_rk ) then
+        return
+    end if
 
+    do i_ts = 1, params_acm%N_time_statistics
         select case (trim(params_acm%time_statistics_names(i_ts)))
         case ("ux-int")
             ! compute the integral of ux over time
@@ -61,16 +60,14 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * u(:,:,:,1)
         case ("ux-var")
             ! compute the variance of ux over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "ux-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "ux-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'ux-avg' or 'ux-mean' right after 'ux-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, "ux-avg", "ux-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'ux-avg' or 'ux-mean' somewhere after 'ux-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (u(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (u(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*u(:,:,:,1))
+                 (u(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*u(:,:,:,1))
         
         case ("uy-int")
             ! compute the integral of uy over time
@@ -80,16 +77,14 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * u(:,:,:,2)
         case ("uy-var")
             ! compute the variance of uy over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "uy-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "uy-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'uy-avg' or 'uy-mean' right after 'uy-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, "uy-avg", "uy-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'uy-avg' or 'uy-mean' somewhere after 'uy-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (u(:,:,:,2) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (u(:,:,:,2) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*u(:,:,:,2))
+                 (u(:,:,:,2) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,2) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*u(:,:,:,2))
         
         case ("uz-int")
             if (params_acm%dim == 2) call abort(250915, "[TIME_STATISTICS_ACM]: uz not available in 2D simulations.")
@@ -102,16 +97,14 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
         case ("uz-var")
             if (params_acm%dim == 2) call abort(250916, "[TIME_STATISTICS_ACM]: uz not available in 2D simulations.")
             ! compute the variance of uz over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "uz-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "uz-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'uz-avg' or 'uz-mean' right after 'uz-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, "uz-avg", "uz-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'uz-avg' or 'uz-mean' somewhere after 'uz-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (u(:,:,:,3) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (u(:,:,:,3) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*u(:,:,:,3))
+                 (u(:,:,:,3) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,3) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*u(:,:,:,3))
         
         case ("p-int")
             ! compute the integral of p over time
@@ -121,16 +114,14 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * u(:,:,:,params_acm%dim+1)
         case ("p-var")
             ! compute the variance of p over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "p-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "p-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'p-avg' or 'p-mean' right after 'p-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, "p-avg", "p-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'p-avg' or 'p-mean' somewhere after 'p-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (u(:,:,:,params_acm%dim+1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (u(:,:,:,params_acm%dim+1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*u(:,:,:,params_acm%dim+1))
+                 (u(:,:,:,params_acm%dim+1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,params_acm%dim+1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*u(:,:,:,params_acm%dim+1))
         
         case ("umag-int")
             ! compute the integral of velocity magnitude over time
@@ -144,19 +135,17 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * sqrt( work(:,:,:,1) )
         case ("umag-var")
             ! compute the variance of velocity magnitude over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "umag-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "umag-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'umag-avg' or 'umag-mean' right after 'umag-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, "umag-avg", "umag-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'umag-avg' or 'umag-mean' somewhere after 'umag-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             work (:,:,:,1) = u(:,:,:,1)**2 + u(:,:,:,2)**2
             if (params_acm%dim == 3) work(:,:,:,1) = work(:,:,:,1) + u(:,:,:,3)**2
             work(:,:,:,1) = sqrt( work(:,:,:,1) )  ! current umag value
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (work(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*work(:,:,:,1))
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
 
         case ("divu-int", "div-int")
             ! compute the integral, average or minmax of div(u) over time
@@ -168,18 +157,29 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
         case ("divu-var", "div-var")
             ! compute the variance of div(u) over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "divu-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "div-avg" .and. &
-                 trim(params_acm%time_statistics_names(i_ts+1)) /= "divu-mean" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "div-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'divu-avg', 'div-avg', 'divu-mean', or 'div-mean' right after 'divu-var' or 'div-var' in time_statistics_names.")
+            ! This needs the computation of the average somewhere after this field
+            ! NOTE: Using manual search loop instead of find_single_mean_index helper because
+            ! this case has 4 possible mean names (divu-avg, div-avg, divu-mean, div-mean)
+            ! and the helper function only supports 2 name options
+            mean_idx1 = -1
+            do j = i_ts + 1, params_acm%N_time_statistics
+                if (trim(params_acm%time_statistics_names(j)) == "divu-avg" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "div-avg" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "divu-mean" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "div-mean") then
+                    mean_idx1 = j
+                    exit
+                end if
+            end do
+            if (mean_idx1 == -1) then
+                call abort(251016, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'divu-avg', 'div-avg', 'divu-mean', or 'div-mean' somewhere after 'divu-var' or 'div-var' in time_statistics_names.")
             end if
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             call compute_divergence(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (work(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*work(:,:,:,1))
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
         case ("divu-minmax", "div-minmax")
             ! compute the minmax of div(u) over time
             call compute_divergence(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
@@ -202,20 +202,31 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             call compute_vorticity(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,:))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
         case ("vort-var", "vor-var")
-            if (params_acm%dim == 3) call abort(250916, "[TIME_STATISTICS_ACM]: vorticity is not a scalar in 3D simulations - use vorx, vory, vorz or vorabs instead.")
+            if (params_acm%dim == 3) call abort(251016, "[TIME_STATISTICS_ACM]: vorticity is not a scalar in 3D simulations - use vorx, vory, vorz or vorabs instead.")
             ! compute the variance of vorticity magnitude over time using Welford's method
             ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "vort-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "vor-avg" .and. &
-                 trim(params_acm%time_statistics_names(i_ts+1)) /= "vort-mean" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "vor-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'vort-avg', 'vor-avg', 'vort-mean', or 'vor-mean' right after 'vort-var' or 'vor-var' in time_statistics_names.")
+            ! NOTE: Using manual search loop instead of find_single_mean_index helper because
+            ! this case has 4 possible mean names (vort-avg, vor-avg, vort-mean, vor-mean)
+            ! and the helper function only supports 2 name options
+            mean_idx1 = -1
+            do j = i_ts + 1, params_acm%N_time_statistics
+                if (trim(params_acm%time_statistics_names(j)) == "vort-avg" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "vor-avg" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "vort-mean" .or. &
+                    trim(params_acm%time_statistics_names(j)) == "vor-mean") then
+                    mean_idx1 = j
+                    exit
+                end if
+            end do
+            if (mean_idx1 == -1) then
+                call abort(251016, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'vort-avg', 'vor-avg', 'vort-mean', or 'vor-mean' AFTER 'vort-var' or 'vor-var' in time_statistics_names.")
             end if
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             call compute_vorticity(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,:))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (work(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*work(:,:,:,1))
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
         
         case ("vorabs-int")
             if (params_acm%dim == 2) call abort(250916, "[TIME_STATISTICS_ACM]: vorticity is a scalar in 2D simulations - use vor instead.")
@@ -228,20 +239,39 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             call compute_vorticity_abs(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
         case ("vorabs-var")
-            if (params_acm%dim == 2) call abort(250916, "[TIME_STATISTICS_ACM]: vorticity is a scalar in 2D simulations - use vor instead.")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: vorticity is a scalar in 2D simulations - use vor instead.")
             ! compute the variance of vorticity magnitude over time using Welford's method
             ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "vorabs-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "vorabs-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'vorabs-avg' or 'vorabs-mean' right after 'vorabs-var' in time_statistics_names.")
-            end if
+            call find_single_mean_index(i_ts, "vorabs-avg", "vorabs-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'vorabs-avg' or 'vorabs-mean' AFTER 'vorabs-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             call compute_vorticity_abs(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (work(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*work(:,:,:,1))
-        
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
+
+        ! Helicity magnitude cases
+        case ("helabs-int")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: helabs not available in 2D simulations.")
+            ! compute the integral of helicity magnitude over time
+            call compute_helicity_abs(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
+            u(:,:,:,N_offset + i_ts) = u(:,:,:,N_offset + i_ts) + dt* work(:,:,:,1)
+        case ("helabs-avg", "helabs-mean")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: helabs not available in 2D simulations.")
+            ! compute the average of helicity magnitude over time
+            call compute_helicity_abs(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+        case ("helabs-var")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: helabs not available in 2D simulations.")
+            ! compute the variance of helicity magnitude over time using Welford's method
+            call find_single_mean_index(i_ts, "helabs-avg", "helabs-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'helabs-avg' or 'helabs-mean' AFTER 'helabs-var' in time_statistics_names.", 251021_ik)
+            call compute_helicity_abs(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
+
         case ("dissipation-int")
             ! compute the integral of dissipation over time
             call compute_dissipation(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
@@ -253,16 +283,14 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
         case ("dissipation-var")
             ! compute the variance of dissipation over time using Welford's method
             ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_acm%N_time_statistics .or. &
-                (trim(params_acm%time_statistics_names(i_ts+1)) /= "dissipation-avg" .and. trim(params_acm%time_statistics_names(i_ts+1)) /= "dissipation-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'dissipation-avg' or 'dissipation-mean' right after 'dissipation-var' in time_statistics_names.")
-            end if
+            call find_single_mean_index(i_ts, "dissipation-avg", "dissipation-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the variance together with the mean value. Insert 'dissipation-avg' or 'dissipation-mean' AFTER 'dissipation-var' in time_statistics_names.", 251016_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             call compute_dissipation(u(:,:,:,1:params_acm%dim), dx, Bs, g, params_acm%discretization, work(:,:,:,1))
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
-                 (work(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) - dt/(time_diff)*work(:,:,:,1))
+                 (work(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (work(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*work(:,:,:,1))
 
         ! Velocity derivative cases
         case ("uxdx-avg", "uxdx-mean")
@@ -374,10 +402,162 @@ subroutine TIME_STATISTICS_ACM( time, dt, time_start, u, g, x0, dx, work, mask )
             work(:,:,:,1) = u(:,:,:,3)**2
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
 
+        ! Cross-product velocity component averages
+        case ("uxuy-avg", "uxuy-mean")
+            ! compute the average of ux*uy over time
+            work(:,:,:,1) = u(:,:,:,1) * u(:,:,:,2)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+        case ("uxuz-avg", "uxuz-mean")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uxuz-avg not available in 2D simulations.")
+            ! compute the average of ux*uz over time
+            work(:,:,:,1) = u(:,:,:,1) * u(:,:,:,3)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+        case ("uyuz-avg", "uyuz-mean")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uyuz-avg not available in 2D simulations.")
+            ! compute the average of uy*uz over time
+            work(:,:,:,1) = u(:,:,:,2) * u(:,:,:,3)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+
+        ! Velocity-pressure cross-product averages
+        case ("uxp-avg", "uxp-mean")
+            ! compute the average of ux*p over time
+            work(:,:,:,1) = u(:,:,:,1) * u(:,:,:,params_acm%dim+1)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+        case ("uyp-avg", "uyp-mean")
+            ! compute the average of uy*p over time
+            work(:,:,:,1) = u(:,:,:,2) * u(:,:,:,params_acm%dim+1)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+        case ("uzp-avg", "uzp-mean")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uzp-avg not available in 2D simulations.")
+            ! compute the average of uz*p over time
+            work(:,:,:,1) = u(:,:,:,3) * u(:,:,:,params_acm%dim+1)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/time_diff * u(:,:,:,N_offset + i_ts) + dt/time_diff * work(:,:,:,1)
+
+        ! Covariance cases using Welford's online algorithm
+        case ("uxuy-cov")
+            ! compute the covariance of ux and uy over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "ux-avg", "ux-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'ux-avg' or 'ux-mean' somewhere after 'uxuy-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "uy-avg", "uy-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uy-avg' or 'uy-mean' somewhere after 'uxuy-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            ! mean_x_new = (time_diff-dt)/time_diff*mean_x_old + dt/time_diff*x
+            ! mean_y_new = (time_diff-dt)/time_diff*mean_y_old + dt/time_diff*y
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,2) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,2))
+        case ("uxuz-cov")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uxuz-cov not available in 2D simulations.")
+            ! compute the covariance of ux and uz over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "ux-avg", "ux-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'ux-avg' or 'ux-mean' somewhere after 'uxuz-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "uz-avg", "uz-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uz-avg' or 'uz-mean' somewhere after 'uxuz-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,3) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,3))
+        case ("uyuz-cov")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uyuz-cov not available in 2D simulations.")
+            ! compute the covariance of uy and uz over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "uy-avg", "uy-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uy-avg' or 'uy-mean' somewhere after 'uyuz-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "uz-avg", "uz-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uz-avg' or 'uz-mean' somewhere after 'uyuz-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,2) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,3) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,3))
+
+        ! Velocity-pressure covariances
+        case ("uxp-cov")
+            ! compute the covariance of ux and p over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "ux-avg", "ux-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'ux-avg' or 'ux-mean' somewhere after 'uxp-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "p-avg", "p-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'p-avg' or 'p-mean' somewhere after 'uxp-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,params_acm%dim+1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,params_acm%dim+1))
+        case ("uyp-cov")
+            ! compute the covariance of uy and p over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "uy-avg", "uy-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uy-avg' or 'uy-mean' somewhere after 'uyp-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "p-avg", "p-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'p-avg' or 'p-mean' somewhere after 'uyp-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,2) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,params_acm%dim+1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,params_acm%dim+1))
+        case ("uzp-cov")
+            if (params_acm%dim == 2) call abort(251016, "[TIME_STATISTICS_ACM]: uzp-cov not available in 2D simulations.")
+            ! compute the covariance of uz and p over time using Welford's method
+            ! This needs the computation of both averages somewhere after this field
+            call find_single_mean_index(i_ts, "uz-avg", "uz-mean", mean_idx1, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'uz-avg' or 'uz-mean' somewhere after 'uzp-cov' in time_statistics_names.", 251016_ik)
+            call find_single_mean_index(i_ts, "p-avg", "p-mean", mean_idx2, &
+                "[TIME_STATISTICS_ACM]: You need to compute the covariance together with both mean values. Insert 'p-avg' or 'p-mean' somewhere after 'uzp-cov' in time_statistics_names.", 251016_ik)
+            ! cov_new = (time_diff-dt)/time_diff*cov_old + dt/time_diff*(x - mean_x_old)*(y - mean_y_new)
+            u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* &
+                 (u(:,:,:,3) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,params_acm%dim+1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx2) - dt/(time_diff)*u(:,:,:,params_acm%dim+1))
+
         case default
             call abort(2153000, "[TIME_STATISTICS_ACM]: time_statistics_name "// &
                  trim(params_acm%time_statistics_names(i_ts))//" not recognized.")
         end select
     end do
 
-end subroutine TIME_STATISTICS_ACM
\ No newline at end of file
+end subroutine TIME_STATISTICS_ACM
+
+!-----------------------------------------------------------------------------
+! Helper subroutine to find a single mean value index after a given position
+!
+! This helper function was created to eliminate code duplication across variance 
+! and covariance calculations. Previously, each variance/covariance case contained
+! identical search loops to find the corresponding mean values. This helper:
+!
+! 1. Reduces code duplication by centralizing the search logic
+! 2. Provides consistent validation behavior across all cases
+! 3. Enables flexible positioning of mean values anywhere AFTER the variance/covariance
+! 4. Supports exactly 2 possible mean names (e.g., "ux-avg", "ux-mean")
+! 5. Provides consistent error handling with custom messages and error codes
+!
+! Note: Cases with more than 2 possible mean names (like vort-var with 4 options:
+! vort-avg, vor-avg, vort-mean, vor-mean) still use manual search loops since
+! this helper is designed for the common case of 2 name variants.
+!-----------------------------------------------------------------------------
+subroutine find_single_mean_index(current_idx, mean_name1, mean_name2, mean_idx, error_message, error_code)
+    implicit none
+    
+    integer(kind=ik), intent(in) :: current_idx
+    character(len=*), intent(in) :: mean_name1, mean_name2  ! Two possible names (e.g., "ux-avg", "ux-mean")
+    integer(kind=ik), intent(out) :: mean_idx
+    character(len=*), intent(in) :: error_message
+    integer(kind=ik), intent(in) :: error_code
+    
+    integer(kind=ik) :: j
+    
+    mean_idx = -1  ! Initialize to not found
+    
+    ! Search for either mean name after current index
+    do j = current_idx + 1, params_acm%N_time_statistics
+        if (trim(params_acm%time_statistics_names(j)) == trim(mean_name1) .or. &
+            trim(params_acm%time_statistics_names(j)) == trim(mean_name2)) then
+            mean_idx = j
+            exit
+        end if
+    end do
+    
+    ! If mean is not found, abort with error
+    if (mean_idx == -1) then
+        call abort(error_code, error_message)
+    end if
+    
+end subroutine find_single_mean_index
diff --git a/LIB/EQUATION/convection-diffusion/module_ConvDiff_new.f90 b/LIB/EQUATION/convection-diffusion/module_ConvDiff_new.f90
index a51c9b3e..25e96fa8 100644
--- a/LIB/EQUATION/convection-diffusion/module_ConvDiff_new.f90
+++ b/LIB/EQUATION/convection-diffusion/module_ConvDiff_new.f90
@@ -226,26 +226,23 @@ subroutine PREPARE_SAVE_DATA_convdiff( time, u, g, x0, dx, work )
             endif
         endif
 
-        ! saving of velocity field
+        ! saving of velocity field, this is done similar to save_data_ACM
         if (name(1:2) == 'ux' .or. name(1:2) == 'uy' .or. name(1:2) == 'uz') then
             read( name(4:4), * ) i_var
             if (params_convdiff%dim == 2) then
+                ! let's save a 2D velocity field, for this we need two work arrays
                 if (size(work,4) - k < 1) then
-                    call abort(250922,"CONVDIFF: Not enough space to compute velocity")
+                    call abort(250922,"CONVDIFF: Not enough space to compute velocity, put atleast one other save variable afterwards.")
                 endif
-                if (k > params_convdiff%N_scalars+2) then
-                    call abort(250922, "CONVDIFF: You try to save a variable that does not exist")
-                endif
-                call create_velocity_field_2d( time, g, Bs, dx, x0, work(:,:,:,k), i_var, u(:,:,1,i_var) )
+                call create_velocity_field_2d( time, g, Bs, dx, x0, work(:,:,1,k:k+1), i_var, u(:,:,1,i_var) )
             elseif (params_convdiff%dim == 3) then
+                ! let's save a 3D velocity field, for this we need three work arrays
                 if (size(work,4) - k < 2) then
-                    call abort(250922,"CONVDIFF: Not enough space to compute velocity")
-                endif
-                if (k > params_convdiff%N_scalars+3) then
-                    call abort(250922, "CONVDIFF: You try to save a variable that does not exist " // trim(name) )
+                    call abort(250922,"CONVDIFF: Not enough space to compute velocity, put atleast two other save variables afterwards.")
                 endif
                 call create_velocity_field_3d( time, g, Bs, dx, x0, work(:,:,:,k:k+2), i_var )
             endif
+            ! now copy the correct component to the correct place
             if (name(1:2) == 'uy') then
                 work(:,:,:,k) = work(:,:,:,k+1)
             elseif (name(1:2) == 'uz') then
@@ -258,7 +255,7 @@ subroutine PREPARE_SAVE_DATA_convdiff( time, u, g, x0, dx, work )
 
         ! if any of those endings is in the name, then it is a timestatistics variable
         if (index(name, "-avg") > 0 .or. index(name, "-mean") > 0 .or. index(name, "-var") > 0 &
-            .or. index(name, "-minmax") > 0 .or. index(name, "-min") > 0 .or. index(name, "-max") > 0) then
+            .or. index(name, "-minmax") > 0 .or. index(name, "-min") > 0 .or. index(name, "-max") > 0 .or. index(name, "-cov") > 0) then
             ! now we have to find the index of it
             do i_time_statistics = 1, params_convdiff%N_time_statistics
                 if (name == trim(params_convdiff%time_statistics_names(i_time_statistics))) exit
@@ -399,7 +396,7 @@ subroutine INICOND_convdiff( time, u, g, x0, dx )
     ! non-ghost point has the coordinate x0, from then on its just cartesian with dx spacing
     real(kind=rk), intent(in) :: x0(1:3), dx(1:3)
 
-    integer(kind=ik) :: ix, iy, iz, i, iblob, bx, by, bz
+    integer(kind=ik) :: ix, iy, iz, i, iblob, bx, by, bz, i_time_statistics
     integer(kind=ik), dimension(3) :: Bs
     real(kind=rk) :: x, y, c0x, c0y, z, c0z, lambd, delta
 
@@ -542,6 +539,13 @@ subroutine INICOND_convdiff( time, u, g, x0, dx )
 
     enddo
 
+    ! initialize time statistics
+    if (params_convdiff%N_time_statistics > 0) then
+        do i_time_statistics = 1, params_convdiff%N_time_statistics
+            u(:,:,:, params_convdiff%N_scalars + i_time_statistics) = 0.0_rk
+        end do
+    endif
+
 
   end subroutine INICOND_convdiff
 
diff --git a/LIB/EQUATION/convection-diffusion/rhs_convdiff.f90 b/LIB/EQUATION/convection-diffusion/rhs_convdiff.f90
index 1ce59fa4..038a1bbc 100644
--- a/LIB/EQUATION/convection-diffusion/rhs_convdiff.f90
+++ b/LIB/EQUATION/convection-diffusion/rhs_convdiff.f90
@@ -419,14 +419,15 @@ end subroutine RHS_convdiff_new
 
 
 
+!> Create a 2D velocity field for advecting a scalar field
 subroutine create_velocity_field_2D( time, g, Bs, dx, x0, u0, i, u )
     implicit none
-    real(kind=rk), intent(in) :: time
-    integer(kind=ik), intent(in) :: g, i
-    integer(kind=ik), dimension(3), intent(in) :: Bs
-    real(kind=rk), intent(in) :: dx(1:2), x0(1:2)
-    real(kind=rk), intent(inout) :: u0(:,:,:)
-    real(kind=rk), intent(in) :: u(:,:)
+    real(kind=rk), intent(in) :: time  !< current time
+    integer(kind=ik), intent(in) :: g, i  !< ghost cells, scalar index (in case we solve several transport equations which different scalars)
+    integer(kind=ik), dimension(3), intent(in) :: Bs  !< block size
+    real(kind=rk), intent(in) :: dx(1:2), x0(1:2)  !< grid spacing and origin of the block
+    real(kind=rk), intent(inout) :: u0(:,:,:)  !< output velocity field
+    real(kind=rk), intent(in) :: u(:,:)  !< scalar input field (for some velocity fields)
     ! note you cannot change these values without recomputing the coefficients
     real(kind=rk), parameter :: tau= 0.30_rk, t0=0.0_rk, t1=0.55_rk, t2=1.0_rk, u1=1.0_rk, u2=-1.2221975311385904
     real(kind=rk) :: u_this
@@ -522,13 +523,14 @@ subroutine create_velocity_field_2D( time, g, Bs, dx, x0, u0, i, u )
 
 
 
+!> Create a 3D velocity field for advecting a scalar field
 subroutine create_velocity_field_3D( time, g, Bs, dx, x0, u0, i )
     implicit none
-    real(kind=rk), intent(in) :: time
-    integer(kind=ik), intent(in) :: g, i
-    integer(kind=ik), dimension(3), intent(in) :: Bs
-    real(kind=rk), intent(in) :: dx(1:3), x0(1:3)
-    real(kind=rk), intent(inout) :: u0(:,:,:,1:)
+    real(kind=rk), intent(in) :: time  !< current time
+    integer(kind=ik), intent(in) :: g, i  !< ghost cells, scalar index
+    integer(kind=ik), dimension(3), intent(in) :: Bs  !< block size
+    real(kind=rk), intent(in) :: dx(1:3), x0(1:3)  !< grid spacing and origin of the block
+    real(kind=rk), intent(inout) :: u0(:,:,:,1:)  !< output velocity field
 
     integer(kind=ik) :: ix, iy, iz
     real(kind=rk) :: x, y, z, c0x, c0y, c0z, T
diff --git a/LIB/EQUATION/convection-diffusion/time_statistics_convdiff.f90 b/LIB/EQUATION/convection-diffusion/time_statistics_convdiff.f90
index 2dc2eab3..28a87a7f 100644
--- a/LIB/EQUATION/convection-diffusion/time_statistics_convdiff.f90
+++ b/LIB/EQUATION/convection-diffusion/time_statistics_convdiff.f90
@@ -34,7 +34,7 @@ subroutine TIME_STATISTICS_convdiff( time, dt, time_start, u, g, x0, dx, work, m
     ! non-ghost point has the coordinate x0, from then on its just cartesian with dx spacing
     real(kind=rk), intent(in) :: x0(1:3), dx(1:3)
 
-    integer(kind=ik) :: iB, ix, iy, iz, Bs(3), i_ts, N_offset, i_scalar
+    integer(kind=ik) :: iB, ix, iy, iz, Bs(3), i_ts, N_offset, i_scalar, mean_idx1
     real(kind=rk) :: time_diff
     character(len=cshort) :: name_phi, stat_name
 
@@ -79,15 +79,18 @@ subroutine TIME_STATISTICS_convdiff( time, dt, time_start, u, g, x0, dx, work, m
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* u(:,:,:,1)
         elseif (stat_name == trim(name_phi) // "-var") then
             ! compute the variance of phi over time using Welford's method
-            ! This needs the computation of the average in the variable afterwards to work
-            if (i_ts == params_convdiff%N_time_statistics .or. (trim(params_convdiff%time_statistics_names(i_ts+1)) /= trim(name_phi) // "-avg" .and. trim(params_convdiff%time_statistics_names(i_ts+1)) /= trim(name_phi) // "-mean")) then
-                call abort(2153001, "[TIME_STATISTICS_CONVDIFF]: You need to compute the variance together with the mean value. Insert 'phi-avg' right after 'phi-var' in time_statistics_names.")
-            end if
+            ! This needs the computation of the average somewhere after this field
+            call find_single_mean_index(i_ts, trim(name_phi)//"-avg", trim(name_phi)//"-mean", mean_idx1, &
+                "[TIME_STATISTICS_CONVDIFF]: You need to compute the variance together with the mean value. Insert '"//trim(name_phi)//"-avg' or '"//trim(name_phi)//"-mean' somewhere after '"//trim(name_phi)//"-var' in time_statistics_names.", 251023_ik)
             ! var_new = (time_diff-dt)/time_diff*var_old + dt/time_diff*(x - mean_old)*(x - mean_new)
             ! mean_new = (time_diff-dt)/time_diff*mean_old + dt/time_diff*x
             u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff) * &
-                 (u(:,:,:,1) - u(:,:,:,N_offset + i_ts + 1)) * &
-                 (u(:,:,:,1) - ((time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts + 1) + dt/(time_diff)*u(:,:,:,1)))
+                 (u(:,:,:,1) - u(:,:,:,N_offset + mean_idx1)) * &
+                 (u(:,:,:,1) - (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + mean_idx1) - dt/(time_diff)*u(:,:,:,1))
+            elseif (stat_name == trim(name_phi) // "-avg" .or. &
+                    stat_name == trim(name_phi) // "-mean") then
+                ! compute the average of phi over time (required for variance)
+                u(:,:,:,N_offset + i_ts) = (time_diff-dt)/(time_diff)*u(:,:,:,N_offset + i_ts) + dt/(time_diff)* u(:,:,:,1)
         elseif (stat_name == trim(name_phi) // "-minmax") then
             ! compute the minmax of phi over time
             do iz = merge(1,g+1,params_convdiff%dim==2), merge(1,Bs(3)+g,params_convdiff%dim==2)
@@ -182,4 +185,37 @@ subroutine TIME_STATISTICS_convdiff( time, dt, time_start, u, g, x0, dx, work, m
         end if
     end do
 
-end subroutine TIME_STATISTICS_convdiff
\ No newline at end of file
+end subroutine TIME_STATISTICS_convdiff
+
+
+!----------------------------------------------------------------------------- 
+! Helper subroutine, returns the index of the AVERAGE of a quantity in the list of variables to save.
+! For example, if the vector is
+!        params_convdiff%time_statistics_names = [phi1, ux, ux-mean ....] 
+! then the call to this routine
+!        call find_single_mean_index(2, 'mean', 'avg', mean_idx, "Did not find it", 17)
+! will return the index "3".
+! Note: oddly, the same search on the array
+!        params_convdiff%time_statistics_names = [ux-mean, ux, phi1 ....]
+! will fail.
+!----------------------------------------------------------------------------- 
+subroutine find_single_mean_index(current_idx, mean_name1, mean_name2, mean_idx, error_message, error_code)
+    implicit none
+    integer(kind=ik), intent(in) :: current_idx
+    character(len=*), intent(in) :: mean_name1, mean_name2
+    integer(kind=ik), intent(out) :: mean_idx
+    character(len=*), intent(in) :: error_message
+    integer(kind=ik), intent(in) :: error_code
+    integer(kind=ik) :: j
+    mean_idx = -1
+    do j = current_idx + 1, params_convdiff%N_time_statistics
+        if (trim(params_convdiff%time_statistics_names(j)) == trim(mean_name1) .or. &
+            trim(params_convdiff%time_statistics_names(j)) == trim(mean_name2)) then
+            mean_idx = j
+            exit
+        end if
+    end do
+    if (mean_idx == -1) then
+        call abort(error_code, error_message)
+    end if
+end subroutine find_single_mean_index
diff --git a/LIB/EQUATION/insects/kineloader.f90 b/LIB/EQUATION/insects/kineloader.f90
index 8a2b51dd..822ed166 100644
--- a/LIB/EQUATION/insects/kineloader.f90
+++ b/LIB/EQUATION/insects/kineloader.f90
@@ -13,16 +13,27 @@ subroutine load_kine_init(kine)
     read (10, *) t_period ! stroke period in s, for normalization
     read (10, *) r_wing   ! wing length in mm, for normalization
     read (10, *) kine%nk
-    if (kine%nk > nhrmt_max) then
-      write(*,*) "WARNING(load_kine_init): adjusting the value kine%nk to nhrmt_max"
-      kine%nk = nhrmt_max
-    endif
     write(*,'("nk=",i5," t_period=",g12.4," r_wing=",g12.4)') kine%nk, t_period, r_wing
   endif
 
   call MPI_BCAST(kine%nk,1,MPI_INTEGER,0,MPI_COMM_WORLD,mpicode)
   nk = kine%nk
 
+  ! Allocate the kineloader arrays
+  if (.not. allocated(kine%vec_t)) allocate(kine%vec_t(max(1,nk)))
+  if (.not. allocated(kine%vec_phi)) allocate(kine%vec_phi(max(1,nk)))
+  if (.not. allocated(kine%vec_alpha)) allocate(kine%vec_alpha(max(1,nk)))
+  if (.not. allocated(kine%vec_theta)) allocate(kine%vec_theta(max(1,nk)))
+  if (.not. allocated(kine%vec_pitch)) allocate(kine%vec_pitch(max(1,nk)))
+  if (.not. allocated(kine%vec_vert)) allocate(kine%vec_vert(max(1,nk)))
+  if (.not. allocated(kine%vec_horz)) allocate(kine%vec_horz(max(1,nk)))
+  if (.not. allocated(kine%vec_phi_dt)) allocate(kine%vec_phi_dt(max(1,nk)))
+  if (.not. allocated(kine%vec_alpha_dt)) allocate(kine%vec_alpha_dt(max(1,nk)))
+  if (.not. allocated(kine%vec_theta_dt)) allocate(kine%vec_theta_dt(max(1,nk)))
+  if (.not. allocated(kine%vec_pitch_dt)) allocate(kine%vec_pitch_dt(max(1,nk)))
+  if (.not. allocated(kine%vec_vert_dt)) allocate(kine%vec_vert_dt(max(1,nk)))
+  if (.not. allocated(kine%vec_horz_dt)) allocate(kine%vec_horz_dt(max(1,nk)))
+
   ! intitalize vectors as zero (note: not all of the space is actually used)
   kine%vec_t = 0.0_rk
   kine%vec_vert = 0.0_rk
@@ -66,19 +77,19 @@ subroutine load_kine_init(kine)
     kine%vec_horz_dt = kine%vec_horz_dt * t_period / r_wing
   endif
 
-  call MPI_BCAST( kine%vec_t, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_phi, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_alpha, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_theta, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_pitch, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_vert, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_horz, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_phi_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_alpha_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_theta_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_pitch_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_vert_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
-  call MPI_BCAST( kine%vec_horz_dt, nhrmt_max, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_t, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_phi, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_alpha, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_theta, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_pitch, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_vert, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_horz, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_phi_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_alpha_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_theta_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_pitch_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_vert_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
+  call MPI_BCAST( kine%vec_horz_dt, nk, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, mpicode )
 
   kine%initialized = .true.
   if (root) write(*,*) "done initializing kineoader!"
diff --git a/LIB/EQUATION/insects/module_insects.f90 b/LIB/EQUATION/insects/module_insects.f90
index cb43bd43..80ee85ae 100644
--- a/LIB/EQUATION/insects/module_insects.f90
+++ b/LIB/EQUATION/insects/module_insects.f90
@@ -38,9 +38,6 @@ module module_insects
    ! arrays for fourier coefficients are fixed size (avoiding issues with allocatable
    ! elements in derived datatypes) this is their length:
    integer, parameter :: nfft_max = 1024
-   ! Maximum number of Hermite interpolation nodes (hardcoded because of sxf90 compiler requirements)
-   ! JB: Array sizes resulting from this number result currently in 50% of WABBITs program size
-   integer, parameter :: nhrmt_max = 10000
 
    ! Allocatable arrays used in Insect object
    ! this will hold the surface markers and their normals used for particles:
@@ -84,8 +81,9 @@ module module_insects
    type wingkinematics
       ! Fourier coefficients
       real(kind=rk) :: a0_alpha=0.0_rk, a0_phi=0.0_rk, a0_theta=0.0_rk
-      real(kind=rk), dimension(1:nfft_max) :: ai_phi=0.0_rk, bi_phi=0.0_rk, ai_theta=0.0_rk, &
-         bi_theta=0.0_rk, ai_alpha=0.0_rk, bi_alpha=0.0_rk
+      ! JB: Made allocatable to avoid bloating the object file size
+      real(kind=rk), allocatable :: ai_phi(:), bi_phi(:), ai_theta(:), &
+         bi_theta(:), ai_alpha(:), bi_alpha(:)
       integer :: nfft_phi=0, nfft_alpha=0, nfft_theta=0
       ! coefficients are read only once from file (or set differently)
       logical :: initialized = .false.
@@ -93,10 +91,11 @@ module module_insects
       character(len=clong) :: infile_convention="", infile_type="", infile_units="", infile=""
       ! variables for kineloader (which uses non-periodic hermite interpolation)
       integer :: nk=0
-      real(kind=rk), dimension (1:nhrmt_max) :: vec_t=0.0_rk, &
-         vec_phi=0.0_rk,vec_alpha=0.0_rk,vec_theta=0.0_rk,vec_pitch=0.0_rk,vec_vert=0.0_rk,vec_horz=0.0_rk,  &
-         vec_phi_dt=0.0_rk,vec_alpha_dt=0.0_rk,vec_theta_dt=0.0_rk,vec_pitch_dt=0.0_rk,vec_vert_dt=0.0_rk, &
-         vec_horz_dt=0.0_rk
+      ! JB: Made allocatable to avoid bloating the object file size
+      real(kind=rk), allocatable :: vec_t(:), &
+         vec_phi(:),vec_alpha(:),vec_theta(:),vec_pitch(:),vec_vert(:),vec_horz(:),  &
+         vec_phi_dt(:),vec_alpha_dt(:),vec_theta_dt(:),vec_pitch_dt(:),vec_vert_dt(:), &
+         vec_horz_dt(:)
    end type
 
 
@@ -197,13 +196,12 @@ module module_insects
       !-------------------------------------------------------------
       ! wing shape fourier coefficients. Note notation:
       ! R = a0/2 + SUM ( ai cos(2pi*i) + bi sin(2pi*i)  )
-      ! to avoid compatibility issues, the array is of fixed size, although only
-      ! the first nftt_wings entries will be used
-      real(kind=rk), dimension(1:nfft_max,1:4) :: ai_wings=0.0_rk, bi_wings=0.0_rk
+      ! JB: Made allocatable to avoid bloating the object file size
+      real(kind=rk), allocatable :: ai_wings(:,:), bi_wings(:,:)
       real(kind=rk), dimension(1:4) :: a0_wings=0.0_rk
       ! fill the R0(theta) array once, then only table-lookup instead of Fseries
-      ! JB: This array increases WABBIT program size quite a bit, consider making it allocatable
-      real(kind=rk), dimension(1:25000,1:4) :: R0_table=0.0_rk
+      ! JB: Made allocatable to avoid bloating the object file size
+      real(kind=rk), allocatable :: R0_table(:,:)
       ! describes the origin of the wings system
       real(kind=rk), dimension(1:4) :: xc=0.0_rk, yc=0.0_rk
       ! number of fft coefficients for wing geometry
diff --git a/LIB/EQUATION/insects/wings_geometry.f90 b/LIB/EQUATION/insects/wings_geometry.f90
index 7e5c1bd1..0c532527 100644
--- a/LIB/EQUATION/insects/wings_geometry.f90
+++ b/LIB/EQUATION/insects/wings_geometry.f90
@@ -46,6 +46,9 @@ subroutine draw_insect_wings(time, xx0, ddx, mask, mask_color, us, Insect, delet
       endif
   endif
 
+  ! sometimes we have the geometry type insect but it has no wings (for example for fractal_tree), we then want to skip the rest
+  if (.not. Insect%RightWing=="yes" .and. .not. Insect%LeftWing=="yes" .and. Insect%RightWing2=="yes" .and. Insect%LeftWing2=="yes") return
+
   if ((dabs(Insect%time-time)>1.0d-10) .and. root) then
       write(*,'("error! time=",es15.8," but Insect%time=",es15.8)') time, Insect%time
       write(*,'("Did you call Update_Insect before draw_insect_wings?")')
@@ -1526,6 +1529,21 @@ subroutine Setup_Wing_Fourier_coefficients(Insect, wingID)
     return
   endif
 
+  ! Allocate wing Fourier coefficient arrays if not already allocated
+  if (.not. allocated(Insect%ai_wings)) then
+    allocate(Insect%ai_wings(nfft_max, 4))
+    Insect%ai_wings = 0.0_rk
+  endif
+  if (.not. allocated(Insect%bi_wings)) then
+    allocate(Insect%bi_wings(nfft_max, 4))
+    Insect%bi_wings = 0.0_rk
+  endif
+  ! Allocate R0_table if not already allocated
+  if (.not. allocated(Insect%R0_table)) then
+    allocate(Insect%R0_table(25000, 4))
+    Insect%R0_table = 0.0_rk
+  endif
+
   Insect%a0_wings(wingID) = 0.0_rk
   Insect%ai_wings(:,wingID) = 0.0_rk
   Insect%bi_wings(:,wingID) = 0.0_rk
diff --git a/LIB/EQUATION/insects/wings_motion.f90 b/LIB/EQUATION/insects/wings_motion.f90
index 70e7c0d2..f7a4d117 100644
--- a/LIB/EQUATION/insects/wings_motion.f90
+++ b/LIB/EQUATION/insects/wings_motion.f90
@@ -142,16 +142,22 @@ subroutine FlappingMotion(time, Insect, protocoll, phi, alpha, theta, phi_dt, &
       !------------------------------------------------------------------------------------
       
       call read_param_mpi(kinefile,"kinematics","nfft_phi",kine%nfft_phi,0)
+      if (.not. allocated(kine%ai_phi)) allocate(kine%ai_phi(max(1,kine%nfft_phi)))
+      if (.not. allocated(kine%bi_phi)) allocate(kine%bi_phi(max(1,kine%nfft_phi)))
       call read_param_mpi(kinefile,"kinematics","a0_phi",kine%a0_phi,0.0_rk)
       call read_param_mpi(kinefile,"kinematics","ai_phi",kine%ai_phi(1:kine%nfft_phi))
       call read_param_mpi(kinefile,"kinematics","bi_phi",kine%bi_phi(1:kine%nfft_phi))
       
       call read_param_mpi(kinefile,"kinematics","nfft_alpha",kine%nfft_alpha,0)
+      if (.not. allocated(kine%ai_alpha)) allocate(kine%ai_alpha(max(1,kine%nfft_alpha)))
+      if (.not. allocated(kine%bi_alpha)) allocate(kine%bi_alpha(max(1,kine%nfft_alpha)))
       call read_param_mpi(kinefile,"kinematics","a0_alpha",kine%a0_alpha,0.0_rk)
       call read_param_mpi(kinefile,"kinematics","ai_alpha",kine%ai_alpha(1:kine%nfft_alpha))
       call read_param_mpi(kinefile,"kinematics","bi_alpha",kine%bi_alpha(1:kine%nfft_alpha))
       
       call read_param_mpi(kinefile,"kinematics","nfft_theta",kine%nfft_theta,0)
+      if (.not. allocated(kine%ai_theta)) allocate(kine%ai_theta(max(1,kine%nfft_theta)))
+      if (.not. allocated(kine%bi_theta)) allocate(kine%bi_theta(max(1,kine%nfft_theta)))
       call read_param_mpi(kinefile,"kinematics","a0_theta",kine%a0_theta,0.0_rk)
       call read_param_mpi(kinefile,"kinematics","ai_theta",kine%ai_theta(1:kine%nfft_theta))
       call read_param_mpi(kinefile,"kinematics","bi_theta",kine%bi_theta(1:kine%nfft_theta))
@@ -359,6 +365,14 @@ subroutine FlappingMotion(time, Insect, protocoll, phi, alpha, theta, phi_dt, &
       kine%nfft_theta = 10
       kine%nfft_phi   = 10
 
+      ! Allocate arrays
+      if (.not. allocated(kine%ai_phi)) allocate(kine%ai_phi(kine%nfft_phi))
+      if (.not. allocated(kine%bi_phi)) allocate(kine%bi_phi(kine%nfft_phi))
+      if (.not. allocated(kine%ai_alpha)) allocate(kine%ai_alpha(kine%nfft_alpha))
+      if (.not. allocated(kine%bi_alpha)) allocate(kine%bi_alpha(kine%nfft_alpha))
+      if (.not. allocated(kine%ai_theta)) allocate(kine%ai_theta(kine%nfft_theta))
+      if (.not. allocated(kine%bi_theta)) allocate(kine%bi_theta(kine%nfft_theta))
+
       kine%a0_phi   =25.4649398
       kine%a0_alpha =-0.3056968
       kine%a0_theta =-17.8244658  ! - sign (Dmitry, 10 Nov 2013)
@@ -401,6 +415,14 @@ subroutine FlappingMotion(time, Insect, protocoll, phi, alpha, theta, phi_dt, &
       kine%nfft_theta = 10
       kine%nfft_phi   = 10
 
+      ! Allocate arrays
+      if (.not. allocated(kine%ai_phi)) allocate(kine%ai_phi(kine%nfft_phi))
+      if (.not. allocated(kine%bi_phi)) allocate(kine%bi_phi(kine%nfft_phi))
+      if (.not. allocated(kine%ai_alpha)) allocate(kine%ai_alpha(kine%nfft_alpha))
+      if (.not. allocated(kine%bi_alpha)) allocate(kine%bi_alpha(kine%nfft_alpha))
+      if (.not. allocated(kine%ai_theta)) allocate(kine%ai_theta(kine%nfft_theta))
+      if (.not. allocated(kine%bi_theta)) allocate(kine%bi_theta(kine%nfft_theta))
+
       kine%a0_phi   =38.2280144124915
       kine%a0_alpha =1.09156750841542
       kine%a0_theta =-17.0396438317138
diff --git a/LIB/INDICATORS/refinementIndicator_tree.f90 b/LIB/INDICATORS/refinementIndicator_tree.f90
index 97352408..93be8d2d 100644
--- a/LIB/INDICATORS/refinementIndicator_tree.f90
+++ b/LIB/INDICATORS/refinementIndicator_tree.f90
@@ -193,6 +193,12 @@ subroutine refinementIndicator_tree(params, hvy_block, tree_ID, indicator)
         ! sync light data, as only root sets random refinement
         call MPI_BCAST( lgt_block(:, IDX_REFINE_STS), size(lgt_block,1), MPI_INTEGER4, 0, WABBIT_COMM, ierr )
 
+    ! In some cases the refinement_status is set up by a routine other than this one. This is the case
+    ! in some forest processing (=handling multiple trees). In such a case, we do nothing here (in particular
+    ! we do not reset the refinement status)
+    case ('nothing (external)')
+        return
+
     case default
         call abort("ERROR: refine_tree: the refinement indicator is unkown")
 
diff --git a/LIB/MAIN/main.f90 b/LIB/MAIN/main.f90
index e103d44e..c09ff8ca 100644
--- a/LIB/MAIN/main.f90
+++ b/LIB/MAIN/main.f90
@@ -56,7 +56,7 @@ program main
     !!!!!! => renaming: hvy_tmp -> hvy_work
 
     ! time loop variables
-    real(kind=rk)                       :: time, time_start, output_time
+    real(kind=rk)                       :: time, output_time
     integer(kind=ik)                    :: iteration
     ! filename of *.ini file used to read parameters
     character(len=clong)                :: filename
@@ -177,16 +177,6 @@ program main
     ! On all blocks, set the initial condition (incl. synchronize ghosts)
     call setInitialCondition_tree( params, hvy_block, tree_ID_flow, params%adapt_inicond, time, iteration, hvy_mask, hvy_tmp, hvy_work=hvy_work)
 
-    !*******************************************************************
-    ! initial time statistics (integral, average, minmax or similar over time) - initializes the arrays (to 0)
-    !*******************************************************************
-    ! backup startup time, usefull for time statistics (averaging)
-    time_start = time
-    if (params%time_statistics) then
-        ! no sync needed as all values will simply be set to 0
-        call time_statistics_wrapper(time, dt, time_start, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
-    end if
-
     if ((.not. params%read_from_files .or. params%adapt_inicond).and.(time>=params%write_time_first)) then
         ! NOTE: new versions (>16/12/2017) call physics module routines call prepare_save_data. These
         ! routines create the fields to be stored in the work array hvy_work in the first 1:params%N_fields_saved
@@ -236,14 +226,14 @@ program main
     !*******************************************************************
     ! initial statistics (usefull for checking IC)
     !*******************************************************************
-    t4 = MPI_wtime()
     if ( (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk) ) then
+        t4 = MPI_wtime()
         ! we need to sync ghost nodes for some derived qtys, for sure
         call sync_ghosts_RHS_tree( params, hvy_block, tree_ID_flow )
 
         call statistics_wrapper(time, 0.0_rk, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
+        call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
     endif
-    call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
 
 
     ! timing
@@ -292,9 +282,9 @@ program main
             ! and use the "everywhere" indicator. Note: after resuming a run from backup, this works as well, because
             ! the refinement_flag is 0 and this results in "significant" refining in fact all blocks.
             if (params%refinement_indicator == "significant" .and. iteration == 0) then
-                call refine_tree( params, hvy_block, "everywhere", tree_ID=tree_ID_flow, error_OOM=error_OOM, check_full_tree=.true.)
+                call refine_tree( params, hvy_block, "everywhere", tree_ID=tree_ID_flow, error_OOM=error_OOM, check_full_tree=.true., time=time )
             else
-                call refine_tree( params, hvy_block, params%refinement_indicator, tree_ID=tree_ID_flow, error_OOM=error_OOM, check_full_tree=.true.)
+                call refine_tree( params, hvy_block, params%refinement_indicator, tree_ID=tree_ID_flow, error_OOM=error_OOM, check_full_tree=.true., time=time )
             endif
             ! if refine_tree runs out-of-memory (OOM), it does not actually do the refinement, but returns after realizing
             ! there won't be enough mem. We can thus jump to 17 to save a backup and terminate.
@@ -364,20 +354,23 @@ program main
                 ! synching of n_eqn_rhs is enough, as time statistics need no sync
                 call sync_ghosts_RHS_tree( params, hvy_block(:,:,:,1:params%n_eqn_rhs,:), tree_ID_flow )
 
-                call time_statistics_wrapper(time, dt, time_start, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
+                call time_statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
             end if
 
             !*******************************************************************
             ! statistics
             !*******************************************************************
-            t4 = MPI_wtime()
             if ( (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk) ) then
+                t4 = MPI_wtime()
                 ! we need to sync ghost nodes for some derived qtys, for sure
                 call sync_ghosts_RHS_tree( params, hvy_block, tree_ID_flow )
 
                 call statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
+                call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
+
+                ! update next stats time
+                if (abs(time - params%next_stats_time)<1e-12_rk) params%next_stats_time = params%next_stats_time + params%tsave_stats
             endif
-            call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
 
             ! if multiple time steps are performed on the same grid, we have to be careful
             ! not to skip past saving time intervals. Therefore, if it is time to save, we
@@ -400,12 +393,38 @@ program main
                 call createMask_tree(params, time, hvy_mask, hvy_tmp)
 
                 ! actual coarsening (including the mask function)
-                call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
-                    hvy_mask=hvy_mask, hvy_work=hvy_work, neqn_WD=params%n_eqn-params%N_time_statistics)
+
+                ! High-level optimization for time statistics:
+                ! Check if any time statistics components should be adapted (threshold_state_vector_component > 0)
+                ! Time statistics are always at the end of the state vector
+                ! If we do not want to adapt them, we can reduce the workload of adapt_tree significantly.
+                if (params%time_statistics .and. params%N_time_statistics > 0 .and. &
+                    any(params%threshold_state_vector_component(params%n_eqn-params%N_time_statistics+1:params%n_eqn) > 0)) then
+                    ! Include time statistics in adaptation
+                    call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
+                        hvy_mask=hvy_mask, hvy_work=hvy_work, neqn_adapt=params%n_eqn)
+                else
+                    ! Exclude time statistics from adaptation to reduce workload
+                    call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
+                        hvy_mask=hvy_mask, hvy_work=hvy_work, neqn_adapt=params%n_eqn-params%N_time_statistics)
+                endif
             else
                 ! actual coarsening (no mask function is required)
-                call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
-                    hvy_work=hvy_work, neqn_WD=params%n_eqn-params%N_time_statistics)
+
+                ! High-level optimization for time statistics:
+                ! Check if any time statistics components should be adapted (threshold_state_vector_component > 0)
+                ! Time statistics are always at the end of the state vector
+                ! If we do not want to adapt them, we can reduce the workload of adapt_tree significantly.
+                if (params%time_statistics .and. params%N_time_statistics > 0 .and. &
+                    any(params%threshold_state_vector_component(params%n_eqn-params%N_time_statistics+1:params%n_eqn) > 0)) then
+                    ! Include time statistics in adaptation
+                    call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
+                        hvy_work=hvy_work, neqn_adapt=params%n_eqn)
+                else
+                    ! Exclude time statistics from adaptation to reduce workload
+                    call adapt_tree( time, params, hvy_block, tree_ID_flow, params%coarsening_indicator, hvy_tmp, &
+                        hvy_work=hvy_work, neqn_adapt=params%n_eqn-params%N_time_statistics)
+                endif
             endif
         endif
         call toc( "TOPLEVEL: adapt mesh", 14, MPI_wtime()-t4)
@@ -483,13 +502,16 @@ program main
 
 
     !*******************************************************************
-    ! statistics ( last time )
+    ! statistics ( last time - always done if statistics is enabled, but skipped if it was already done at this time to prevent duplicates )
     !*******************************************************************
-    if ( (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk) ) then
+    if ( (params%nsave_stats /= 99999999_ik .or. (abs(params%tsave_stats - 9999999.9_rk) > 1e-12)) &
+        .and. .not. (modulo(iteration, params%nsave_stats) == 0 .or. abs(time - params%next_stats_time + params%tsave_stats)<1e-12_rk ) ) then
+        t4 = MPI_wtime()
         ! we need to sync ghost nodes for some derived qtys, for sure
         call sync_ghosts_RHS_tree( params, hvy_block, tree_ID_flow)
 
         call statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
+        call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
     endif
 
 
diff --git a/LIB/MESH/adapt_tree.f90 b/LIB/MESH/adapt_tree.f90
index 6a4fb94b..06653a37 100644
--- a/LIB/MESH/adapt_tree.f90
+++ b/LIB/MESH/adapt_tree.f90
@@ -8,7 +8,7 @@
 !
 !> \note It is well possible to start with a very fine mesh and end up with only one active
 !! block after this routine. You do *NOT* have to call it several times.
-subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy_work, hvy_mask, ignore_coarsening, ignore_maxlevel, init_full_tree_grid, neqn_WD )
+subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy_work, hvy_mask, ignore_coarsening, ignore_maxlevel, init_full_tree_grid, neqn_adapt )
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
     
@@ -36,17 +36,19 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
     logical, intent(in), optional       :: ignore_maxlevel
     !> Maybe we already have a full tree grid, so we do not need to initialize it
     logical, intent(in), optional       :: init_full_tree_grid
-    !> In rare cases, not all equations need to be wavelet decomposed, this saves a lot of extra work.
-    !> If not present, all equations are used. Applying this decomposes only SC and reconstructs by copying
-    integer(kind=ik), optional, intent(in) :: neqn_WD
+    !> In some cases, not all component of the state vector need to be wavelet decomposed, and in those cases
+    !> excluding those saves a lot of extra work. If present, the code wavelet decomposes the first [1:neqn_adapt components] of 
+    !> the vector. For the remaining ones [neqn_adapt+1:neqn], only scaling function coefficients are computed (and no WC); reconstruction
+    !> is done by simple copying. If neqn_adapt is not present, all components are fully decomposed (SC AND WC computed).
+    integer(kind=ik), optional, intent(in) :: neqn_adapt
     
     ! loop variables
     integer(kind=ik)                    :: iteration, k, lgt_id
     real(kind=rk)                       :: t_block, t_all, t_loop
-    integer(kind=ik)                    :: Jmax_active, Jmin_active, level, ierr, k1, hvy_id, neqn_WD_apply
+    integer(kind=ik)                    :: Jmax_active, Jmin_active, level, ierr, k1, hvy_id, neqn_adapt_apply
     logical                             :: ignore_coarsening_apply, ignore_maxlevel_apply, initFullTreeGrid, iterate, toBeManipulated
     integer(kind=ik)                    :: level_me, ref_stat, Jmin, lgt_n_old, g_this
-    character(len=clong)                :: toc_statement
+    character(len=clong)                :: format_string
     integer(kind=tsize)                 :: treecode
 
     real(kind=rk) :: thresh(1:params%n_eqn), thresh_old(1:params%n_eqn), thresh_save(1:params%n_eqn+1)
@@ -81,8 +83,8 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
     if (present(ignore_maxlevel)) ignore_maxlevel_apply = ignore_maxlevel
     initFullTreeGrid = .true.
     if (present(init_full_tree_grid)) initFullTreeGrid = init_full_tree_grid
-    neqn_WD_apply = size(hvy_block, 4)
-    if (present(neqn_WD)) neqn_WD_apply = neqn_WD
+    neqn_adapt_apply = size(hvy_block, 4)
+    if (present(neqn_adapt)) neqn_adapt_apply = neqn_adapt
 
 
     ! To avoid that the incoming hvy_neighbor array and active lists are outdated
@@ -97,7 +99,7 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
     ! Blocks always pass their SC to their mother (pendant to excuteCoarsening of old function), new blocks only synch from medium neighbors to avoid CE
     ! This is repeated until all blocks on the layer JMin are present with wavelet decomposed values
     t_block = MPI_Wtime()
-    call wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init_full_tree_grid=initFullTreeGrid)
+    call wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init_full_tree_grid=initFullTreeGrid, neqn_adapt=neqn_adapt_apply, time=time)
     call toc( "adapt_tree (decompose_tree)", 102, MPI_Wtime()-t_block )
     ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
@@ -234,13 +236,13 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
         ! In a special case reconstruction can be skipped as it has already been done, therefore the if-condition
         if (.not. ((indicator == "threshold-cvs" .or. indicator == "threshold-image-denoise") .and. ignore_coarsening_apply)) then
             ! generally, we wavelet reconstruct all equations. In some rare cases, only a subset is required, for example for time statistics
-            ! in this case, we reconstruct only the first neqn_WD_apply equations and the rest is copied from the backup
-            ! Note that hvy_tmp contains the original values, so we copy them back after reconstructing the first neqn_WD_apply equations
-            call wavelet_reconstruct_full_tree(params, hvy_block(:,:,:,1:neqn_WD_apply,:), hvy_tmp(:,:,:,1:neqn_WD_apply,:), tree_ID)
-            if (neqn_WD_apply < size(hvy_block, 4)) then
+            ! in this case, we reconstruct only the first neqn_adapt_apply equations and the rest is copied from the backup
+            ! Note that hvy_tmp contains the original values, so we copy them back after reconstructing the first neqn_adapt_apply equations
+            call wavelet_reconstruct_full_tree(params, hvy_block(:,:,:,1:neqn_adapt_apply,:), hvy_tmp(:,:,:,1:neqn_adapt_apply,:), tree_ID, time=time)
+            if (neqn_adapt_apply < size(hvy_block, 4)) then
                 do k = 1, hvy_n(tree_ID)
                     hvy_ID = hvy_active(k, tree_ID)
-                    hvy_block(:, :, : ,neqn_WD_apply+1:size(hvy_block, 4), hvy_ID) = hvy_tmp(:, :, : ,neqn_WD_apply+1:size(hvy_block, 4), hvy_ID)  ! restore original values
+                    hvy_block(:, :, : ,neqn_adapt_apply+1:size(hvy_block, 4), hvy_ID) = hvy_tmp(:, :, : ,neqn_adapt_apply+1:size(hvy_block, 4), hvy_ID)  ! restore original values
                 enddo
             endif
         endif
@@ -255,7 +257,7 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
     ! they may have passed several level also and non-leafs have been deleted. Rebalance for optimal loadbalancing
     ! ToDo: we sync directly afterwards so we could only transfer inner points, but this needs the buffering from xfer_bock_data
     t_block = MPI_Wtime()
-    call balanceLoad_tree( params, hvy_block, tree_ID)
+    call balanceLoad_tree( params, hvy_block, tree_ID, balance_name='adapt', time=time )
     call toc( "adapt_tree (balanceLoad_tree)", 106, MPI_Wtime()-t_block )
 
     ! synchronize ghost nodes - final synch to update all neighbours with the new values
@@ -274,7 +276,7 @@ subroutine adapt_tree( time, params, hvy_block, tree_ID, indicator, hvy_tmp, hvy
 !! First, the full tree grid is prepared, then the leaf-layer is decomposed in one go
 !! in order to have load-balanced work there, then the mothers are lvl-wise updated and consequently
 !! decomposed until all blocks have the decomposed values. Afterwards, the grid will be in full tree format.
-subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init_full_tree_grid, compute_SC_only, scaling_filter, neqn_WD, verbose_check)
+subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init_full_tree_grid, compute_SC_only, scaling_filter, neqn_adapt, time, verbose_check)
     implicit none
 
     type (type_params), intent(in)      :: params
@@ -289,14 +291,18 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
     logical, intent(in), optional       :: verbose_check  !< No matter the value, if this is present we debug
     !> In rare cases, not all equations need to be wavelet decomposed, this saves a lot of extra work.
     !> If not present, all equations are used. Applying this decomposes only SC and reconstructs by copying
-    integer(kind=ik), optional, intent(in) :: neqn_WD
+    integer(kind=ik), optional, intent(in) :: neqn_adapt
+    !> current simulation time, only used for development debugging output
+    real(kind=rk), intent(in), optional  :: time
 
-    integer(kind=ik)     :: k, lgt_ID, hvy_ID, lgt_n_old, g_this, level_me, ref_stat, iteration, level, neqn_WD_apply
+    integer(kind=ik)     :: k, lgt_ID, hvy_ID, lgt_n_old, g_this, level_me, ref_stat, iteration, level, neqn_adapt_apply, ierr
     real(kind=rk)        :: t_block, t_loop
-    character(len=clong) :: toc_statement
+    character(len=clong) :: format_string, string_prepare
     logical              :: iterate, use_leaf_first, initFullTreeGrid, computeSCOnly
     real(kind=rk), dimension(:), allocatable :: scalingFilter
     integer(kind=ik), dimension(:), allocatable, save :: lgt_refinementStatus_backup
+    integer(kind=ik)                    :: blocks_decomposed
+    integer(kind=ik), allocatable, save :: blocks_decomposed_list(:)
 
     initFullTreeGrid = .true.
     if (present(init_full_tree_grid)) initFullTreeGrid = init_full_tree_grid
@@ -311,8 +317,8 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
         allocate(scalingFilter(lbound(params%HD, dim=1):ubound(params%HD, dim=1)))
         scalingFilter(:) = params%HD(:)
     endif
-    neqn_WD_apply = size(hvy_block, 4)
-    if (present(neqn_WD)) neqn_WD_apply = neqn_WD
+    neqn_adapt_apply = size(hvy_block, 4)
+    if (present(neqn_adapt)) neqn_adapt_apply = neqn_adapt
 
     ! the refinement_status is used in this routine for things other than the refinement_status
     ! but that may be problematic if we have set the refinement_status externally and like to keep it.
@@ -405,25 +411,28 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
         ! first point is SC and last WC, so we need left(HD) or right(GD) for sync
         g_this = max(abs(lbound(params%HD,1)),ubound(params%GD,1))
 
+        write(string_prepare, '(A, i0)') "wavelet_decompose_syncN_lvl", level  !< prepare format string for sync debug
+
         ! ! for coarse extension we are not dependend on coarser neighbors so lets skip the syncing
         ! if (params%isLiftedWavelet) then
-        !     call sync_TMP_from_MF( params, hvy_block, tree_ID, -1, g_minus=g_this, g_plus=g_this, hvy_tmp=hvy_tmp)
-        !     call toc( "decompose_tree (sync TMP <- MF)", 111, MPI_Wtime()-t_block )
+        !     call sync_TMP_from_MF( params, hvy_block, tree_ID, -1, g_minus=g_this, g_plus=g_this, hvy_tmp=hvy_tmp, sync_debug_name=string_prepare)
+        !     call toc( "decompose_tree (sync TMP <- MF)", 111, MPI_Wtime()-t_block)
 
-        !     write(toc_statement, '(A, i0, A)') "decompose_tree (it ", iteration, " sync lvl <- MF)"
-        !     call toc( toc_statement, 1100+iteration, MPI_Wtime()-t_block )
+        !     write(format_string, '(A, i0, A)') "decompose_tree (it ", iteration, " sync lvl <- MF)"
+        !     call toc( format_string, 1100+iteration, MPI_Wtime()-t_block )
         ! ! unlifted wavelets need coarser neighbor values for their WC so we need to sync them too
         ! else
-            call sync_TMP_from_all( params, hvy_block, tree_ID, -1, g_minus=g_this, g_plus=g_this, hvy_tmp=hvy_tmp)
-            call toc( "decompose_tree (sync TMP <- all)", 111, MPI_Wtime()-t_block )
+            call sync_TMP_from_all( params, hvy_block, tree_ID, -1, g_minus=g_this, g_plus=g_this, hvy_tmp=hvy_tmp, sync_debug_name=string_prepare)
+            call toc( "decompose_tree (sync TMP <- all)", 111, MPI_Wtime()-t_block)
 
-            write(toc_statement, '(A, i0, A)') "decompose_tree (it ", iteration, " sync lvl <- all)"
-            call toc( toc_statement, 1100+iteration, MPI_Wtime()-t_block )
+            write(format_string, '(A, i0, A)') "decompose_tree (it ", iteration, " sync lvl <- all)"
+            call toc( format_string, 1100+iteration, MPI_Wtime()-t_block )
         ! endif
 
         ! Wavelet-transform all blocks which are untreated
         ! From now on until wavelet retransform hvy_block will hold the wavelet decomposed values in spaghetti form for these blocks
         t_block = MPI_Wtime()
+        blocks_decomposed = 0
         do k = 1, hvy_n(tree_ID)
             hvy_ID = hvy_active(k, tree_ID)
             call hvy2lgt( lgt_ID, hvy_ID, params%rank, params%number_blocks )
@@ -431,6 +440,7 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
     
             ! FWT required for a block that is on the level
             if (ref_stat == -1) then
+                blocks_decomposed = blocks_decomposed + 1
                 ! hvy_tmp now is a copy with sync'ed ghost points.
                 ! We actually copy here a second time but this is to ensure we have the correct ghost points saved, and copying is not too expensive anyways
                 hvy_tmp(:,:,:,1:size(hvy_block, 4),hvy_ID) = hvy_block(:,:,:,1:size(hvy_block, 4),hvy_ID)
@@ -448,18 +458,37 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
                         lbound(params%HD, dim=1), ubound(params%HD, dim=1), do_restriction=.true.)
                 else
                     ! generally, we wavelet decompose all equations. In some rare cases, only a subset is required, for example for time statistics
-                    ! in this case, we decompose only the first neqn_WD_apply equations and the rest only with the scaling function
-                    call waveletDecomposition_block(params, hvy_block(:,:,:,1:neqn_WD_apply,hvy_ID))
-                    if (neqn_WD_apply < size(hvy_block, 4)) then
-                        call blockFilterXYZ_vct( params, hvy_tmp(:,:,:,neqn_WD_apply+1:size(hvy_block, 4),hvy_ID), hvy_block(:,:,:,neqn_WD_apply+1:size(hvy_block, 4),hvy_ID), params%HD, lbound(params%HD, dim=1), ubound(params%HD, dim=1), do_restriction=.true.)
+                    ! in this case, we decompose only the first neqn_adapt_apply equations and the rest only with the scaling function
+                    call waveletDecomposition_block(params, hvy_block(:,:,:,1:neqn_adapt_apply,hvy_ID))
+                    if (neqn_adapt_apply < size(hvy_block, 4)) then
+                        call blockFilterXYZ_vct( params, hvy_tmp(:,:,:,neqn_adapt_apply+1:size(hvy_block, 4),hvy_ID), hvy_block(:,:,:,neqn_adapt_apply+1:size(hvy_block, 4),hvy_ID), params%HD, lbound(params%HD, dim=1), ubound(params%HD, dim=1), do_restriction=.true.)
                     endif
                 endif
             endif
         end do
         call toc( "decompose_tree (FWT)", 112, MPI_Wtime()-t_block )
 
-        write(toc_statement, '(A, i0, A)') "decompose_tree (it ", iteration, " FWT)"
-        call toc( toc_statement, 1150+iteration, MPI_Wtime()-t_block )
+        write(format_string, '(A, i0, A)') "decompose_tree (it ", iteration, " FWT)"
+        call toc( format_string, 1150+iteration, MPI_Wtime()-t_block )
+
+            ! debug output to see how many blocks have been decomposed, gather information on rank 0 and print to file
+        if (params%debug_wavelet_decompose) then
+            if (.not. allocated(blocks_decomposed_list)) allocate(blocks_decomposed_list(1:params%number_procs))
+
+            call MPI_GATHER(blocks_decomposed, 1, MPI_INTEGER, blocks_decomposed_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                open(unit=99, file=trim("debug_wavelet_decompose.csv"), status="unknown", position="append")
+                string_prepare = "-1.0E+00,"  ! set negative time, just to have csv with the same length in every row
+                if (present(time)) write(string_prepare,'(es16.6,",")') time
+                if (params%number_procs == 1) then
+                    write(99,'(A, i0, ",", i0)') trim(adjustl(string_prepare)), level, blocks_decomposed_list(1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                    write(99,format_string) trim(adjustl(string_prepare)), level, blocks_decomposed_list(1), blocks_decomposed_list(2:params%number_procs)
+                endif
+                close(99)
+            endif
+        endif
 
         ! ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
         ! coarseExtension: remove wavelet coefficients near a fine/coarse interface
@@ -494,12 +523,13 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
         !     Update mother blocks
         ! We update mother blocks level-wise, with this we always know that all blocks on this level are ready and need no lgt_data update
         ! This uses the already wavelet decomposed blocks and copies their SC into the new mother block
+        write(string_prepare, '(A, i0)') "wavelet_decompose_syncF_lvl", level  !< prepare format string for sync debug
         t_block = MPI_Wtime()
-        call sync_D2M(params, hvy_block, tree_ID, sync_case="level", s_val=level)
-        call toc( "decompose_tree (sync level 2 mothers)", 113, MPI_Wtime()-t_block )
+        call sync_D2M(params, hvy_block, tree_ID, sync_case="level", s_val=level, sync_debug_name=string_prepare)
+        call toc( "decompose_tree (sync level 2 mothers)", 113, MPI_Wtime()-t_block)
 
-        write(toc_statement, '(A, i0, A)') "decompose_tree (it ", iteration, " sync level 2 mothers)"
-        call toc( toc_statement, 1200+iteration, MPI_Wtime()-t_block )
+        write(format_string, '(A, i0, A)') "decompose_tree (it ", iteration, " sync level 2 mothers)"
+        call toc( format_string, 1200+iteration, MPI_Wtime()-t_block )
 
         ! iteration counter
         iteration = iteration + 1
@@ -507,18 +537,18 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
         ! loop continues until we are on the lowest level.
         iterate = (level >= params%Jmin)
 
-        write(toc_statement, '(A, i0, A)') "decompose_tree (it ", iteration, " TOTAL)"
-        call toc( toc_statement, 1250+iteration, MPI_Wtime()-t_loop )
+        write(format_string, '(A, i0, A)') "decompose_tree (it ", iteration, " TOTAL)"
+        call toc( format_string, 1250+iteration, MPI_Wtime()-t_loop )
 
         if (present(verbose_check)) then
             write(*, '(A, i0, A, i0, A)') "Loop ", iteration, " with ", lgt_n(tree_ID), " blocks"
         endif
 
         ! ! for debugging purposes - savin does a sync if any ghost values are NaN
-        ! write( toc_statement,'(A, i6.6, A)') 'TestWD_', iteration, "000000.h5"
-        ! call saveHDF5_tree(toc_statement, dble(iteration), iteration, 1, params, hvy_block, tree_ID)
-        ! write( toc_statement,'(A, i6.6, A)') 'TestN_', iteration, "000000.h5"
-        ! call saveHDF5_tree(toc_statement, dble(iteration), iteration, 1, params, hvy_tmp, tree_ID)
+        ! write( format_string,'(A, i6.6, A)') 'TestWD_', iteration, "000000.h5"
+        ! call saveHDF5_tree(format_string, dble(iteration), iteration, 1, params, hvy_block, tree_ID)
+        ! write( format_string,'(A, i6.6, A)') 'TestN_', iteration, "000000.h5"
+        ! call saveHDF5_tree(format_string, dble(iteration), iteration, 1, params, hvy_tmp, tree_ID)
     end do
 
     ! copy the original refinement_status (when entering this routine) back to lgt_block
@@ -527,10 +557,10 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
     enddo
 
     ! ! for debugging purposes - savin does a sync if any ghost values are NaN
-    ! write( toc_statement,'(A, i6.6, A)') 'TestWD_', 100, "000000.h5"
-    ! call saveHDF5_tree(toc_statement, dble(100), 100, 1, params, hvy_block, tree_ID)
-    ! write( toc_statement,'(A, i6.6, A)') 'TestN_', 100, "000000.h5"
-    ! call saveHDF5_tree(toc_statement, dble(100), 100, 1, params, hvy_tmp, tree_ID)
+    ! write( format_string,'(A, i6.6, A)') 'TestWD_', 100, "000000.h5"
+    ! call saveHDF5_tree(format_string, dble(100), 100, 1, params, hvy_block, tree_ID)
+    ! write( format_string,'(A, i6.6, A)') 'TestN_', 100, "000000.h5"
+    ! call saveHDF5_tree(format_string, dble(100), 100, 1, params, hvy_tmp, tree_ID)
 end subroutine
 
 
@@ -540,7 +570,7 @@ subroutine wavelet_decompose_full_tree(params, hvy_block, tree_ID, hvy_tmp, init
 !! The input has to be in full grid format in order for the mother-update cycle to work.
 !! Blocks are reconstructed level-wise starting and JMin and mothers are updated until all blocks are reconstructed.
 !! Afterwards the grid is still in full tree format, but can be pruned by deleting all non-leafs
-subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID)
+subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID, time)
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
     
@@ -552,11 +582,14 @@ subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID)
     !> heavy tmp data array - block data.
     real(kind=rk), intent(inout)        :: hvy_tmp(:, :, :, :, :)
     integer(kind=ik), intent(in)        :: tree_ID
+    real(kind=rk), intent(in), optional :: time  !< current simulation time, only for debug output
 
     ! loop variables
-    integer(kind=ik)                    :: iteration, k, Jmax_active, Jmin_active, level, hvy_ID, lgt_ID, level_me, Jmin
+    integer(kind=ik)                    :: iteration, k, Jmax_active, Jmin_active, level, hvy_ID, lgt_ID, level_me, Jmin, ierr
     real(kind=rk)                       :: t_block, t_all, t_loop
-    character(len=clong)                :: toc_statement
+    character(len=clong)                :: format_string, string_prepare
+    integer(kind=ik)                    :: blocks_reconstructed
+    integer(kind=ik), allocatable, save :: blocks_reconstructed_list(:)
 
 
     ! NOTE: after 24/08/2022, the arrays lgt_active/lgt_n hvy_active/hvy_n as well as lgt_sortednumlist,
@@ -577,12 +610,13 @@ subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID)
 
         ! synch SC and WC from coarser neighbours and same-level neighbours in order to apply the correct wavelet reconstruction
         ! Attention: This uses hvy_temp for coarser neighbors to predict the data, as we want the correct SC from coarser neighbors
+        write(string_prepare, '(A, i0)') "wavelet_reconstruct_syncN_lvl", level  !< prepare format string for sync debug
         t_block = MPI_Wtime()
-        call sync_SCWC_from_MC( params, hvy_block, tree_ID, hvy_tmp, g_minus=params%g, g_plus=params%g, level=level)
-        call toc( "reconstruct_tree (sync lvl <- MC)", 115, MPI_Wtime()-t_block )
+        call sync_SCWC_from_MC( params, hvy_block, tree_ID, hvy_tmp, g_minus=params%g, g_plus=params%g, level=level, sync_debug_name=string_prepare)
+        call toc( "reconstruct_tree (sync lvl <- MC)", 115, MPI_Wtime()-t_block)
 
-        write(toc_statement, '(A, i0, A)') "reconstruct_tree (it ", iteration, " sync level <- MC)"
-        call toc( toc_statement, 1300+iteration, MPI_Wtime()-t_block )
+        write(format_string, '(A, i0, A)') "reconstruct_tree (it ", iteration, " sync level <- MC)"
+        call toc( format_string, 1300+iteration, MPI_Wtime()-t_block )
 
         ! In theory, for full WT this CE_modify is only here for two reasons:
         !    1. We need to clear the WC from coarser neighbors, as the values in there are interpolated SC and we need to set those to the correct WC being 0
@@ -596,6 +630,7 @@ subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID)
 
         ! Wavelet-reconstruct blocks on level
         t_block = MPI_Wtime()
+        blocks_reconstructed = 0
         do k = 1, hvy_n(tree_ID)
             hvy_ID = hvy_active(k, tree_ID)
             call hvy2lgt( lgt_ID, hvy_ID, params%rank, params%number_blocks )
@@ -610,32 +645,53 @@ subroutine wavelet_reconstruct_full_tree(params, hvy_block, hvy_tmp, tree_ID)
                 ! One might be concerned if we sync from hvy_tmp and do prediction/interpolation, as it does not have it's ghost layers synced
                 ! however, we discard all interpolated values with the CE so we cleverly circumvent problems there
                 hvy_tmp(:,:,:,1:size(hvy_block, 4),hvy_ID) = hvy_block(:,:,:,1:size(hvy_block, 4),hvy_ID)
+                blocks_reconstructed = blocks_reconstructed + 1
             endif
         end do
         call toc( "reconstruct_tree (RWT)", 116, MPI_Wtime()-t_block )
 
-        write(toc_statement, '(A, i0, A)') "reconstruct_tree (it ", iteration, " RWT)"
-        call toc( toc_statement, 1350+iteration, MPI_Wtime()-t_block )
+        ! debug output to see how many blocks have been reconstructed, gather information on rank 0 and print to file
+        if (params%debug_wavelet_reconstruct) then
+            if (.not. allocated(blocks_reconstructed_list)) allocate(blocks_reconstructed_list(1:params%number_procs))
+
+            call MPI_GATHER(blocks_reconstructed, 1, MPI_INTEGER, blocks_reconstructed_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                open(unit=99, file=trim("debug_wavelet_reconstruct.csv"), status="unknown", position="append")
+                string_prepare = "-1.0E+00,"  ! set negative time, just to have csv with the same length in every row
+                if (present(time)) write(string_prepare,'(es16.6,",")') time
+                if (params%number_procs == 1) then
+                    write(99,'(A, i0, ",", i0)') trim(adjustl(string_prepare)), level, blocks_reconstructed_list(1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                    write(99,format_string) trim(adjustl(string_prepare)), level, blocks_reconstructed_list(1), blocks_reconstructed_list(2:params%number_procs)
+                endif
+                close(99)
+            endif
+        endif
+
+        write(format_string, '(A, i0, A)') "reconstruct_tree (it ", iteration, " RWT)"
+        call toc( format_string, 1350+iteration, MPI_Wtime()-t_block )
 
         ! now we need to update the SC of all daughters
         t_block = MPI_Wtime()
-        call sync_M2D(params, hvy_block, tree_ID, sync_case="level", s_val=level)
+        write(string_prepare, '(A, i0)') "wavelet_reconstruct_syncF_lvl", level  !< prepare format string for sync debug
+        call sync_M2D(params, hvy_block, tree_ID, sync_case="level", s_val=level, sync_debug_name=string_prepare)
         call toc( "reconstruct_tree (sync level 2 daughters)", 117, MPI_Wtime()-t_block )
 
-        write(toc_statement, '(A, i0, A)') "reconstruct_tree (it ", iteration, " sync level 2 daughters)"
-        call toc( toc_statement, 1400+iteration, MPI_Wtime()-t_block )
+        write(format_string, '(A, i0, A)') "reconstruct_tree (it ", iteration, " sync level 2 daughters)"
+        call toc( format_string, 1400+iteration, MPI_Wtime()-t_block )
 
-        write(toc_statement, '(A, i0, A)') "reconstruct_tree (it ", iteration, " TOTAL)"
-        call toc( toc_statement, 1450+iteration, MPI_Wtime()-t_loop )
+        write(format_string, '(A, i0, A)') "reconstruct_tree (it ", iteration, " TOTAL)"
+        call toc( format_string, 1450+iteration, MPI_Wtime()-t_loop )
 
         iteration = iteration + 1
     enddo
 
     ! ! for debugging purposes - savin does a sync if any ghost values are NaN
-    ! write( toc_statement,'(A, i6.6, A)') 'TestWD_', 100, "000000.h5"
-    ! call saveHDF5_tree(toc_statement, dble(100), 100, 1, params, hvy_block, tree_ID)
-    ! write( toc_statement,'(A, i6.6, A)') 'TestN_', 100, "000000.h5"
-    ! call saveHDF5_tree(toc_statement, dble(100), 100, 1, params, hvy_tmp, tree_ID)
+    ! write( format_string,'(A, i6.6, A)') 'TestWD_', 100, "000000.h5"
+    ! call saveHDF5_tree(format_string, dble(100), 100, 1, params, hvy_block, tree_ID)
+    ! write( format_string,'(A, i6.6, A)') 'TestN_', 100, "000000.h5"
+    ! call saveHDF5_tree(format_string, dble(100), 100, 1, params, hvy_tmp, tree_ID)
 end subroutine
 
 
@@ -654,7 +710,7 @@ subroutine init_full_tree(params, tree_ID, set_ref, verbose_check)
 
     integer(kind=ik)     :: i_level, j, k, N, lgt_ID, hvy_ID, level_b, iteration, level, data_rank, lgt_merge_id, hvy_merge_id, hvy_n_old, digit, refset
     real(kind=rk)        :: t_block, t_loop
-    character(len=clong) :: toc_statement
+    character(len=clong) :: format_string
     logical              :: iterate
     ! list of block ids, proc ranks
     integer(kind=ik)                    :: lgt_daughters(1:8), rank_daughters(1:8)
@@ -927,7 +983,7 @@ subroutine cvs_thresholding_local(params, hvy_block, hvy_tmp, hvy_work, indicato
     ! loop variables
     integer(kind=ik)                    :: iteration, k, k1, Jmax_active, Jmin_active, level, hvy_ID, lgt_ID, level_me, Jmin, g_spaghetti
     real(kind=rk)                       :: t_block, t_all, t_loop
-    character(len=clong)                :: toc_statement
+    character(len=clong)                :: format_string
 
     ! testing
     integer(kind=ik)                    :: ix, iy
@@ -1110,7 +1166,7 @@ subroutine cvs_thresholding_global(params, hvy_block, hvy_tmp, hvy_work, indicat
     ! loop variables
     integer(kind=ik)                    :: iteration, k, k1, Jmax_active, Jmin_active, level, hvy_ID, lgt_ID, level_me, Jmin, g_spaghetti
     real(kind=rk)                       :: t_block, t_all, t_loop
-    character(len=clong)                :: toc_statement
+    character(len=clong)                :: format_string
 
     real(kind=rk) :: thresh_old(1:params%n_eqn), thresh_save(1:params%n_eqn+1), norm(1:params%n_eqn)
 
diff --git a/LIB/MESH/balanceLoad_tree.f90 b/LIB/MESH/balanceLoad_tree.f90
index ab1cee30..be01c909 100644
--- a/LIB/MESH/balanceLoad_tree.f90
+++ b/LIB/MESH/balanceLoad_tree.f90
@@ -1,6 +1,6 @@
 !> \image html balancing.svg "Load balancing" width=400
 ! ********************************************************************************************
-subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement)
+subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement, balance_name, time)
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
     
@@ -12,16 +12,22 @@ subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement)
     ! If true, we look at the refinement flags of each bock for distribution: blocks that have status +1
     ! wil be refined to eight blocks, all others will just remain unchanged. 
     logical, intent(in), optional       :: balanceForRefinement
+    !> balanceLoad is always the same, however we want to differentiate between them, so we can assign a name
+    character(len=*), intent(in), optional :: balance_name
+    !> current simulation time, used for development output only
+    real(kind=rk), intent(in), optional :: time
     !=============================================================================
 
     integer(kind=ik)  :: rank, mpirank_shouldBe, mpirank_currently, ierr, number_procs, &
-                         k, N, l, com_i, com_N, sfc_id, &
-                         lgt_free_id, hvy_free_id, hilbertcode(params%Jmax), lgt_ID, Nblocks_toDistribute
+        k, N, l, com_i, com_N, sfc_id, lgt_free_id, hvy_free_id, hilbertcode(params%Jmax), lgt_ID, Nblocks_toDistribute, &
+        num_blocks_count(3), i_var
     integer(kind=tsize) :: treecode, hilbertcode2
     ! block distribution lists
     integer(kind=ik), allocatable, save :: blocksPerRank_balanced(:), sfc_com_list(:,:), sfc_sorted_list(:,:)
     real(kind=rk) :: t0, t1
     logical :: balanceForRefinement2
+    character(len=cshort) :: format_string, string_prepare
+    character(len=4) :: string_kind(3)
 
 
     ! NOTE: after 24/08/2022, the arrays lgt_active/lgt_n hvy_active/hvy_n as well as lgt_sortednumlist,
@@ -45,6 +51,9 @@ subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement)
         balanceForRefinement2 = balanceForRefinement
     endif
 
+    ! development counters, just to see how many blocks are send/received/kept
+    num_blocks_count(1:3) = 0
+
     ! allocate block to proc lists
     if (.not.allocated(blocksPerRank_balanced)) allocate( blocksPerRank_balanced(1:number_procs))
     ! allocate sfc com list, maximal number of communications is when every proc wants to send all of his blocks
@@ -210,6 +219,20 @@ subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement)
             sfc_com_list(3, com_i) = sfc_sorted_list(1,k)   ! light id of block
         end if
 
+        if (params%debug_balanceLoad) then
+            ! Development checks - count how many blocks are send, received or kept
+            if ( params%rank == mpirank_currently .and. mpirank_currently /= mpirank_shouldBe ) then
+                ! block is send away from this mpirank
+                num_blocks_count(1) = num_blocks_count(1) + 1
+            elseif ( params%rank == mpirank_currently .and. mpirank_currently == mpirank_shouldBe ) then
+                ! block is kept on this mpirank
+                num_blocks_count(2) = num_blocks_count(2) + 1
+            elseif ( params%rank == mpirank_shouldBe .and. mpirank_currently /= mpirank_shouldBe ) then
+                ! block is received by this mpirank
+                num_blocks_count(3) = num_blocks_count(3) + 1
+            endif
+        endif
+
         ! The blocksPerRank_balanced defines how many blocks this rank should have, and
         ! we just treated one (which either already was on the mpirank or will be on
         ! it after communication), so remove one item from the blocksPerRank_balanced
@@ -240,6 +263,39 @@ subroutine balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement)
     ! the block xfer changes the light data, and afterwards active lists are outdated.
     call updateMetadata_tree(params, tree_ID)
 
+    ! development output, gather information on rank 0 and print to file
+    ! reuse blocksPerRank_balanced for this
+    if (params%debug_balanceLoad) then
+        string_kind = (/ "send", "keep", "recv" /)
+        
+        do i_var = 1, 3
+            ! gather number of blocks send/received/kept on rank 0
+            call MPI_GATHER(num_blocks_count(i_var), 1, MPI_INTEGER, blocksPerRank_balanced, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                if (present(balance_name)) then
+                    string_prepare = trim(adjustl(balance_name))//","
+                else
+                    string_prepare = "balanceLoad,"
+                endif
+                string_prepare = trim(adjustl(string_prepare))//trim(adjustl(string_kind(i_var)))//","
+                if (present(time)) then
+                    write(string_prepare,'(A,es12.6,",")') trim(adjustl(string_prepare)), time
+                else
+                    string_prepare = trim(adjustl(string_prepare))//"-1.0E+00,"  ! set negative time, just to have csv with the same length in every row
+                endif
+                ! Single IO operation with dynamic format
+                open(unit=99, file='debug_balanceLoad.csv', status="unknown", position="append")
+                if (params%number_procs == 1) then
+                    write(99, '(A, i0)') trim(adjustl(string_prepare)), blocksPerRank_balanced(1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs - 1
+                    write(99, format_string) trim(adjustl(string_prepare)), blocksPerRank_balanced(1), blocksPerRank_balanced(2:params%number_procs)
+                endif
+                close(99)
+            endif
+        enddo
+    endif
+
     ! timing
     call toc( "balanceLoad_tree (TOTAL)", 90, MPI_wtime()-t0 )
 end subroutine balanceLoad_tree
diff --git a/LIB/MESH/createEquidistantGrid_tree.f90 b/LIB/MESH/createEquidistantGrid_tree.f90
index 5ba4a41b..f0ecdf16 100644
--- a/LIB/MESH/createEquidistantGrid_tree.f90
+++ b/LIB/MESH/createEquidistantGrid_tree.f90
@@ -144,5 +144,5 @@ subroutine createEquidistantGrid_tree( params, hvy_block, Jmin, verbosity, tree_
 
     ! the grid we created above is not at all balanced - on the contrary, it fills up CPU1, then CPU2
     ! correct this by balancing:
-    call balanceLoad_tree(params, hvy_block, tree_ID)
+    call balanceLoad_tree(params, hvy_block, tree_ID, balance_name='createEquidistantGrid', time=0.0_rk )
 end subroutine createEquidistantGrid_tree
diff --git a/LIB/MESH/createMask_tree.f90 b/LIB/MESH/createMask_tree.f90
index 161c36b9..e4e5a765 100644
--- a/LIB/MESH/createMask_tree.f90
+++ b/LIB/MESH/createMask_tree.f90
@@ -10,11 +10,12 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
     real(kind=rk), intent(inout)           :: hvy_tmp(:, :, :, :, :)
     logical, intent(in), optional          :: all_parts
     integer(kind=ik)                       :: k, lgt_id, Bs(1:3), g, hvy_id, iter, Jactive, Jmax
-    real(kind=rk)                          :: x0(1:3), dx(1:3), t0
+    real(kind=rk)                          :: x0(1:3), dx(1:3)
     logical, save                          :: time_independent_part_ready = .false.
     logical                                :: force_all_parts
+    real(kind=rk)                          :: t_block, t_cycle
 
-    t0 = MPI_wtime()
+    t_cycle = MPI_wtime()
     Bs      = params%Bs
     g       = params%g
     Jactive = maxActiveLevel_tree(tree_ID_flow)
@@ -31,6 +32,7 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
     ! some mask functions have initialization routines (insects) which are to be called once and not for each
     ! block (efficiency). Usually, this would be a staging concept as well, but as only Thomas uses it anyways, cleanup
     ! is left as FIXME
+    ! You look outside the window and see Julius shaking his head in disapproval ...
     if (params_acm%geometry=="Insect") then
         call Update_Insect_wrapper(time)
     endif
@@ -67,9 +69,10 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
     !-----------------------------------------------------------------------
     if (((Jactive < Jmax-1 .and. params%force_maxlevel_dealiasing) .or. Jactive < Jmax) .or. (params%threshold_mask .eqv. .false.) .or. (force_all_parts)) then
         ! generate complete mask (may be expensive)
+        t_block = MPI_Wtime()
         call createCompleteMaskDirect_tree(params, time, hvy_mask)
 
-        call toc( "create_mask_tree (createCompleteMaskDirect_tree)", 210, MPI_Wtime()-t0 )
+        call toc( "createMask_tree (createCompleteMaskDirect_tree)", 66, MPI_Wtime()-t_block )
 
         ! we're done, all parts of mask function are created, leave routine now
         return
@@ -84,14 +87,18 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
     ! finest level (refined only where interesting).
     ! At most, mask in generated (Jmax-Jmin) times.
     if ((.not. time_independent_part_ready) .and. (params%mask_time_independent_part)) then
+        t_block = MPI_Wtime()
         call createTimeIndependentMask_tree(params, time, hvy_mask, hvy_tmp)
+        call toc( "createMask_tree (createTimeIndependentMask_tree)", 67, MPI_Wtime()-t_block )
 
         time_independent_part_ready = .true.
     endif
 
     ! create "time-dependent-part" here, add the existing "time-independent-part"
     ! if it is available, return the complete mask incl. all parts
+    t_block = MPI_Wtime()
     if ( params%mask_time_dependent_part ) then
+        t_block = MPI_Wtime()
         do k = 1, hvy_n(tree_ID_flow)
             hvy_id = hvy_active(k, tree_ID_flow)
 
@@ -118,9 +125,10 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
             ! hvy_mask(:,:,:,6,hvy_id) = 0.0_rk ! sponge. as we keep it time-dependent, it is not required.
         enddo
     endif
-
+    call toc( "createMask_tree (time-dependent-part)", 68, MPI_Wtime()-t_block )
 
     if ( params%mask_time_independent_part ) then
+        t_block = MPI_Wtime()
         if (Jactive == params%Jmax ) then
             ! flow is on finest level: add complete mask on finest level
             ! this is the case when computing the right hand side.
@@ -144,9 +152,10 @@ subroutine createMask_tree(params, time, hvy_mask, hvy_tmp, all_parts)
         else
             if (params%rank==0) write(*,'(A, i0)') "WARNING: mask generation with time independent part fails (flow grid neither on Jmax nor Jmax-1 but on level ", Jactive," )"
         endif
+        call toc( "createMask_tree (time-independent-part)", 69, MPI_Wtime()-t_block )
     endif
 
-    call toc( "create_mask_tree (pruned_tree_logic)", 211, MPI_Wtime()-t0 )
+    call toc( "createMask_tree (TOTAL)", 65, MPI_Wtime()-t_cycle )
 
 end subroutine
 
diff --git a/LIB/MESH/createRandomGrid_tree.f90 b/LIB/MESH/createRandomGrid_tree.f90
index 7c10e8fd..ea4f1494 100644
--- a/LIB/MESH/createRandomGrid_tree.f90
+++ b/LIB/MESH/createRandomGrid_tree.f90
@@ -67,5 +67,5 @@ subroutine createRandomGrid_tree( params, hvy_block, hvy_tmp, level_init, verbos
         dble(lgt_n(tree_ID)) / dble(size(lgt_block, 1)), params%max_grid_density
     endif
 
-    call balanceLoad_tree( params, hvy_block, tree_ID)
+    call balanceLoad_tree( params, hvy_block, tree_ID, balance_name='createRandomGrid', time=0.0_rk )
 end subroutine createRandomGrid_tree
diff --git a/LIB/MESH/executeCoarsening_tree.f90 b/LIB/MESH/executeCoarsening_tree.f90
index baa2093b..5a867e9e 100644
--- a/LIB/MESH/executeCoarsening_tree.f90
+++ b/LIB/MESH/executeCoarsening_tree.f90
@@ -122,7 +122,7 @@ end subroutine executeCoarsening_tree
 
 
 !> For CVS decomposition we need to update mothers from decomposed daughter values, this does exactly that
-subroutine sync_D2M(params, hvy_block, tree_ID, sync_case, s_val)
+subroutine sync_D2M(params, hvy_block, tree_ID, sync_case, s_val, sync_debug_name)
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
 
@@ -134,6 +134,7 @@ subroutine sync_D2M(params, hvy_block, tree_ID, sync_case, s_val)
     character(len=*)                    :: sync_case                    !< String representing which kind of syncing we want to do
     !> Additional value to be considered for syncing logic, can be level or refinement status from which should be synced, dependend on sync case
     integer(kind=ik), intent(in)        :: s_val
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     ! loop variables
     integer(kind=ik)                    :: k, sync_case_id, nc, data_rank, n_xfer, lgt_ID, hvy_ID, level_me, ref_me
@@ -192,7 +193,7 @@ subroutine sync_D2M(params, hvy_block, tree_ID, sync_case, s_val)
 
     ! actual xfer, this works on all blocks that are on this level and have a daughter
     n_xfer = 0  ! transfer counter
-    call prepare_update_family_metadata(params, tree_ID, n_xfer, sync_case="D2M_" // sync_case, ncomponents=nc, s_val=s_val)
+    call prepare_update_family_metadata(params, tree_ID, n_xfer, sync_case="D2M_" // sync_case, ncomponents=nc, s_val=s_val, sync_debug_name=sync_debug_name)
     call xfer_block_data(params, hvy_block, tree_ID, n_xfer, verbose_check=.true.)
 
     ! now the daughter blocks need to be retransformed to spaghetti
@@ -220,7 +221,7 @@ end subroutine sync_D2M
 
 
 !> For CVS reconstruction we need to update daughters from reconstructed mothers, this does exactly that
-subroutine sync_M2D(params, hvy_block, tree_ID, sync_case, s_val)
+subroutine sync_M2D(params, hvy_block, tree_ID, sync_case, s_val, sync_debug_name)
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
 
@@ -232,6 +233,7 @@ subroutine sync_M2D(params, hvy_block, tree_ID, sync_case, s_val)
     character(len=*)                    :: sync_case                    !< String representing which kind of syncing we want to do
     !> Additional value to be considered for syncing logic, can be level or refinement status from which should be synced, dependend on sync case
     integer(kind=ik), intent(in)        :: s_val
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     ! loop variables
     integer(kind=ik)                    :: k, sync_case_id, n_xfer, lgt_ID, hvy_ID, lgt_ID_m, level_m, ref_m, nc
@@ -297,7 +299,7 @@ subroutine sync_M2D(params, hvy_block, tree_ID, sync_case, s_val)
 
     ! actual xfer, this works on all blocks that are on this level and have a daughter
     n_xfer = 0  ! transfer counter
-    call prepare_update_family_metadata(params, tree_ID, n_xfer, sync_case="M2D_" // sync_case, ncomponents=nc, s_val=s_val)
+    call prepare_update_family_metadata(params, tree_ID, n_xfer, sync_case="M2D_" // sync_case, ncomponents=nc, s_val=s_val, sync_debug_name=sync_debug_name)
     call xfer_block_data(params, hvy_block, tree_ID, n_xfer)
 
     !---------------------------------------------------------------------------
diff --git a/LIB/MESH/forest.f90 b/LIB/MESH/forest.f90
index dbfd6bd3..c89c9a10 100644
--- a/LIB/MESH/forest.f90
+++ b/LIB/MESH/forest.f90
@@ -215,17 +215,25 @@ subroutine add_pruned_to_full_tree( params, hvy_block, tree_ID_pruned, tree_ID_f
     integer(kind=ik), intent(in)      :: tree_ID_pruned, tree_ID_full
     real(kind=rk), intent(inout)      :: hvy_block(:, :, :, :, :) !< heavy data array - block data
 
-    integer(kind=ik) :: k, lgt_id, Jmax, hvy_id, rank, N, fsize, i
+    integer(kind=ik) :: k, lgt_id, Jmax, hvy_id, rank, N, fsize, i, ierr
     integer(kind=ik) :: lgt_id1, lgt_id2, hvy_id1, hvy_id2
     integer(kind=ik) :: level1, level2, rank_pruned, rank_full, n_comm
+    integer(kind=ik) :: num_blocks_count(3), i_var  ! debug counters: [send, recv, keep]
     logical :: exists
     integer(kind=ik), allocatable, save :: comm_list(:,:)
     integer(kind=tsize) :: treecode1, treecode2
+    real(kind=rk) :: t_cycle, t_block
+    character(len=4) :: string_kind(3)
+    character(len=cshort) :: format_string
 
     fsize = params%forest_size
     Jmax = params%Jmax ! max treelevel
     rank = params%rank
     N = params%number_blocks
+    t_cycle = MPI_Wtime()
+
+    ! debug counters: [send, recv, keep]
+    num_blocks_count(1:3) = 0
 
     ! init
     treecode1 = 0_tsize; treecode2 = 0_tsize
@@ -236,7 +244,9 @@ subroutine add_pruned_to_full_tree( params, hvy_block, tree_ID_pruned, tree_ID_f
 
     if (.not.allocated(comm_list)) allocate( comm_list( 3, params%number_procs*N ) )
 
+    t_block = MPI_Wtime()
     call createActiveSortedLists_forest(params)
+    call toc("add_pruned_to_full_tree (createActiveSortedLists_forest)", 261, MPI_Wtime() - t_block)
 
     ! a pruned tree has fewer entries: loop over it instead of the fuller (fluid) one!
     !
@@ -256,6 +266,7 @@ subroutine add_pruned_to_full_tree( params, hvy_block, tree_ID_pruned, tree_ID_f
     ! holding the corresponding full trees block.
 
     ! Step 1a: prepare for xfer, gather all required xfers
+    t_block = MPI_Wtime()
     n_comm = 0
     do k = 1, lgt_n(tree_ID_pruned)
 
@@ -279,18 +290,36 @@ subroutine add_pruned_to_full_tree( params, hvy_block, tree_ID_pruned, tree_ID_f
                 comm_list(1, n_comm) = rank_pruned   ! sender mpirank
                 comm_list(2, n_comm) = rank_full   ! receiver mpirank
                 comm_list(3, n_comm) = lgt_id1 ! block lgt_id to send
+                
+                ! count transferred blocks for this rank - used for development debugging output
+                if (params%rank == rank_pruned) then
+                    num_blocks_count(1) = num_blocks_count(1) + 1  ! send
+                endif
+                if (params%rank == rank_full) then
+                    num_blocks_count(2) = num_blocks_count(2) + 1  ! recv
+                endif
+            else
+                if (params%rank == rank_full) then
+                    num_blocks_count(3) = num_blocks_count(3) + 1  ! keep
+                endif
             endif
         endif
     enddo
+    call toc("add_pruned_to_full_tree (gather comms)", 262, MPI_Wtime() - t_block)
 
     ! Step 1b: actual xfer.
+    t_block = MPI_Wtime()
     call block_xfer( params, comm_list, n_comm, hvy_block )
+    call toc("add_pruned_to_full_tree (block_xfer)", 263, MPI_Wtime() - t_block)
 
     ! As some blocks have been transferred, the active lists are outdated.
+    t_block = MPI_Wtime()
     call createActiveSortedLists_tree(params, tree_ID_pruned)
+    call toc("add_pruned_to_full_tree (createActiveSortedLists_tree)", 264, MPI_Wtime() - t_block)
 
     ! Step 2: ADDITION. now we're sure that blocks existing in both trees are on the
     ! same mpirank. therefore, the responsible mpirank can just add them together.
+    t_block = MPI_Wtime()
     do k = 1, hvy_n(tree_ID_pruned)
 
         hvy_id1  = hvy_active(k, tree_ID_pruned)
@@ -339,6 +368,30 @@ subroutine add_pruned_to_full_tree( params, hvy_block, tree_ID_pruned, tree_ID_f
             ! But that is no problem - We just ignore it
         endif
     enddo
+    call toc("add_pruned_to_full_tree (addition)", 265, MPI_Wtime() - t_block)
+
+    if (params%debug_pruned2full) then
+        ! development output, gather information on rank 0 and print to file
+        string_kind = (/ "send", "recv", "keep" /)
+        
+        do i_var = 1, 3
+            ! gather number of blocks transferred/added/ignored on rank 0
+            call MPI_GATHER(num_blocks_count(i_var), 1, MPI_INTEGER, comm_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                ! Single IO operation with dynamic format
+                open(unit=99, file="debug_pruned2full.csv", status="unknown", position="append")
+                if (params%number_procs == 1) then
+                    write(99, '(A,i0)') string_kind(i_var)//",", comm_list(1,1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs - 1
+                    write(99, format_string) string_kind(i_var)//",", comm_list(1,1), comm_list(1,2:params%number_procs)
+                endif
+                close(99)
+            endif
+        enddo
+    endif
+
+    call toc("add_pruned_to_full_tree (TOTAL)", 260, MPI_Wtime() - t_cycle)
 
 end subroutine
 
@@ -510,7 +563,7 @@ function compute_tree_L2norm(params, hvy_block, hvy_tmp, tree_ID, verbosity ) &
 !> This routine is usefull, when trying to go back to a old
 !> treestructure, after calling for example refine_trees2sametreelevel.
 subroutine coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, lgt_n_ref, &
-    hvy_block, hvy_tmp, tree_ID, verbosity)
+    hvy_block, hvy_tmp, tree_ID, ref_can_be_finer, verbosity)
 
     implicit none
     !-----------------------------------------------------------------
@@ -521,16 +574,20 @@ subroutine coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, l
     real(kind=rk), intent(inout)      :: hvy_block(:, :, :, :, :) !< heavy data array - block data
     integer(kind=ik), intent(inout)   :: lgt_active_ref(:) !< active lists
     real(kind=rk), intent(inout)      :: hvy_tmp(:, :, :, :, :) ! used for saving, filtering, and helper qtys
+    !> if true, the routine will also work if the reference mesh has some finer blocks, needs refine_tree_2_reference_mesh afterwards
+    logical, intent(in),optional      :: ref_can_be_finer
     logical, intent(in),optional      :: verbosity
     !-----------------------------------------------------------------
     integer(kind=ik)    :: rank, level_ref, level, Jmax, lgt_id_ref, lgt_id, fsize
     integer(kind=ik)    :: k1, k2, Nblocks_2coarsen, level_min
     integer(kind=tsize) :: treecode_ref, treecode
-    logical :: verbose, exists
+    logical :: verbose, exists, refCanBeFiner
 
     Jmax = params%Jmax ! max treelevel
     fsize= params%forest_size   ! maximal number of trees in forest
     verbose = .false.
+    refCanBeFiner = .false.
+    if (present(ref_can_be_finer)) refCanBeFiner = ref_can_be_finer
 
     if (present(verbosity)) verbose=verbosity
 
@@ -541,7 +598,8 @@ subroutine coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, l
     ! gives all blocks with identical TC/level to reference tree the status 0 and deletes all finer blocks, voila
 
     ! init full grid structure on the actual tree
-    call init_full_tree(params, tree_ID, set_ref=0, verbose_check=.false.)
+    call init_full_tree(params, tree_ID, set_ref=-1, verbose_check=.false.)
+    if (params%rank==0 .and. verbose) write(*,'("coarse_tree_2_reference_mesh : init_full_tree completed, size Nb=",i7)') lgt_n(tree_ID)
 
     ! init all ref stats to -1
     do k1 = 1, lgt_n(tree_ID)
@@ -560,14 +618,30 @@ subroutine coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, l
 
         call doesBlockExist_tree(treecode_ref, exists, lgt_id, dim=params%dim, level=level_ref, tree_id=tree_ID, max_level=params%Jmax)
 
-        if (.not. exists) then
-            call abort(20200806, "Something went wrong: Block on reference tree1 is too fine!!!")
+        ! if the block does not exists the reference block is finer than the actual tree (or Jmin is not respected)
+        ! we have two options:
+        !    1) panic, scream and abort, because something went wrong (default)
+        !    2) if refCanBeFiner is set to true, we coarsen the reference block until we find a block on the actual tree
+        !       or we fall below Jmin, means we just keep the finest mesh possible at that position and afterwards need to refine as well
+        if (.not. exists .and. .not. refCanBeFiner) then
+            call abort(250930, "Something went wrong: Block on reference tree1 is too fine (or lower than Jmin)!!!")
+        endif
+        if (.not. exists .and. refCanBeFiner) then
+            do while( .not. exists .and. level_ref >= params%Jmin )
+                level_ref = level_ref - 1
+                treecode_ref = tc_clear_until_level_b( treecode_ref, params%dim, level_ref, params%Jmax)
+                call doesBlockExist_tree(treecode_ref, exists, lgt_id, dim=params%dim, level=level_ref, tree_id=tree_ID, max_level=params%Jmax)
+            end do
+            if (.not. exists) then
+                call abort(250930, "Something went wrong: I have no Idea how you ended up here")
+            endif
         endif
 
         ! set lgt_id to be kept
         lgt_block(lgt_id, IDX_REFINE_STS) = 0
 
     end do ! loop over reference blocks
+    if (params%rank==0 .and. verbose) write(*,'("coarse_tree_2_reference_mesh : marking blocks completed")')
     !----------------------------
 
     !----------------------------
@@ -579,6 +653,109 @@ subroutine coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, l
 end subroutine
 
 
+
+!##############################################################
+!> This function refines a tree to its reference mesh.
+!> The reference mesh is saved in lgt_..._ref and stores the treecode and
+!> all necessary light data.
+!> All blocks on the actual tree are refined to have the same
+!> treecode structure as the reference.
+subroutine refine_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref, lgt_n_ref, &
+    hvy_block, hvy_tmp, tree_ID, ref_can_be_coarser, verbosity)
+
+    implicit none
+    !-----------------------------------------------------------------
+    type (type_params), intent(inout) :: params   !< params structure
+    integer(kind=ik), intent(in)      :: tree_ID !< number of the tree
+    integer(kind=ik), intent(inout)   :: lgt_n_ref !< number of light active blocks
+    integer(kind=ik), intent(inout)   :: lgt_block_ref(:, : )  !< light data array
+    real(kind=rk), intent(inout)      :: hvy_block(:, :, :, :, :) !< heavy data array - block data
+    integer(kind=ik), intent(inout)   :: lgt_active_ref(:) !< active lists
+    real(kind=rk), intent(inout)      :: hvy_tmp(:, :, :, :, :) ! used for saving, filtering, and helper qtys
+    !> if true, the routine will also work if the reference mesh has some coarser blocks, needs coarse_tree_2_reference_mesh afterwards
+    logical, intent(in),optional      :: ref_can_be_coarser
+    logical, intent(in),optional      :: verbosity
+    !-----------------------------------------------------------------
+    integer(kind=ik)    :: level_ref, level, Jmax, lgt_id_ref, lgt_id, fsize
+    integer(kind=ik)    :: k1, k2, Nblocks_2coarsen, level_min, mpierr, loop_i
+    integer(kind=tsize) :: treecode_ref, treecode
+    logical :: verbose, exists, refCanBeCoarser, continue_loop
+
+    Jmax = params%Jmax ! max treelevel
+    fsize= params%forest_size   ! maximal number of trees in forest
+    verbose = .false.
+    refCanBeCoarser = .false.
+    if (present(ref_can_be_coarser)) refCanBeCoarser = ref_can_be_coarser
+
+    if (present(verbosity)) verbose=verbosity
+
+    call updateMetadata_tree(params, tree_ID)
+
+    ! Operations (addition, multiplication, etc) using two trees (=grids) require both trees (=grids)
+    ! to be identical. Therefore, this routine flags all blocks for refinement which are not on the reference tree, but a higher one is
+
+    ! init full grid structure on the actual tree, so that we also check if reference mesh is coarser
+    if (refCanBeCoarser) call init_full_tree(params, tree_ID, set_ref=0, verbose_check=.false.)
+
+    continue_loop = .true.
+    loop_i = 1
+    do while (continue_loop)
+        continue_loop = .false.
+        if (params%rank==0 .and. verbose) write(*,'("refine_tree_2_reference_mesh : loop ",i3)') loop_i
+
+        ! init all ref stats to 0
+        do k1 = 1, lgt_n(tree_ID)
+            lgt_id = lgt_active(k1, tree_ID)
+            lgt_block(lgt_id, IDX_REFINE_STS) = 0
+        end do
+
+        !----------------------------
+        ! loop over all blocks of the reference tree and find the corresponding block with our sorted list to be kept
+        do k1 = 1, lgt_n_ref
+            lgt_id_ref = lgt_active_ref(k1)
+            level_ref  = lgt_block_ref(lgt_id_ref, IDX_MESH_LVL)
+
+            ! get the treecode of the reference block
+            treecode_ref = get_tc(lgt_block_ref(lgt_id_ref, IDX_TC_1 : IDX_TC_2))
+
+            call doesBlockExist_tree(treecode_ref, exists, lgt_id, dim=params%dim, level=level_ref, tree_id=tree_ID, max_level=params%Jmax)
+
+            ! if the block does not exists the reference block is finer than the actual tree (or Jmin is not respected)
+            ! we now go downwards in levels until we find the mother block and set it to be refined.
+            ! If this does not exist, then the reference mesh is coarser than the actual mesh or Jmin is not respected
+            if (.not. exists) then
+                do while( .not. exists .and. level_ref >= params%Jmin )
+                    level_ref = level_ref - 1
+                    treecode_ref = tc_clear_until_level_b( treecode_ref, params%dim, level_ref, params%Jmax)
+                    call doesBlockExist_tree(treecode_ref, exists, lgt_id, dim=params%dim, level=level_ref, tree_id=tree_ID, max_level=params%Jmax)
+                end do
+                if (.not. exists) then
+                    call abort(250930, "Something went wrong: Block on reference tree2 is too coarse (or lower than Jmin)!!!")
+                endif
+                ! set lgt_id to be refined
+                lgt_block(lgt_id, IDX_REFINE_STS) = 1
+            endif
+
+        end do ! loop over reference blocks
+        !----------------------------
+
+        !----------------------------
+        ! refine the tagged blocks
+        call refine_tree( params, hvy_block, indicator='nothing (external)', tree_id=tree_ID, error_OOM=exists, check_full_tree=.false.)
+        !----------------------------
+
+        ! gather continue_loop to see if any mpirank wants to continue
+        call MPI_Allreduce(MPI_IN_PLACE, continue_loop, 1, MPI_LOGICAL, MPI_LOR, WABBIT_COMM, mpierr)
+        
+    end do
+
+    if (params%rank==0 .and. verbose) write(*,'("refine_tree_2_reference_mesh : refinement completed, size Nb=",i7)') lgt_n(tree_ID)
+
+    ! remove full tree
+    if (refCanBeCoarser) call prune_fulltree2leafs(params, tree_ID)
+
+end subroutine
+
 !> \brief Stores the lgt data of two trees in an extra array as backup
 subroutine store_ref_meshes(lgt_block_ref, lgt_active_ref, lgt_n_ref, tree_ID1, tree_ID2)
 
diff --git a/LIB/MESH/ini_file_to_params.f90 b/LIB/MESH/ini_file_to_params.f90
index 893ba8d0..30acc61d 100644
--- a/LIB/MESH/ini_file_to_params.f90
+++ b/LIB/MESH/ini_file_to_params.f90
@@ -16,7 +16,7 @@ subroutine ini_file_to_params( params, filename )
    real(kind=rk)                                   :: maxmem, mem_per_block, nstages
    ! string read from command line call
    character(len=cshort)                           :: memstring
-   integer(kind=ik)                                :: d,i, Nblocks_Jmax, g, Neqn, Nrk, g_RHS_min, diff_L, diff_R, Bs(1:3)
+   integer(kind=ik)                                :: d,i, Nblocks_Jmax, g, N_files, Nrk, g_RHS_min, diff_L, diff_R, Bs(1:3)
 
    rank         = params%rank
    number_procs = params%number_procs
@@ -42,6 +42,17 @@ subroutine ini_file_to_params( params, filename )
    params%symmetry_vector_component = "0"
    call read_param_mpi(FILE, 'Domain', 'symmetry_vector_component', params%symmetry_vector_component, params%symmetry_vector_component )
 
+   !***************************************************************************
+   ! read time statistics parameters - before reading to decide how many files are read in
+   call read_param_mpi(FILE, 'Time-Statistics', 'time_statistics', params%time_statistics, .false.)
+   if (params%time_statistics) then
+      call read_param_mpi(FILE, 'Time-Statistics', 'N_time_statistics', params%N_time_statistics, 1)
+      allocate( params%time_statistics_names(1:params%N_time_statistics) )
+      call read_param_mpi(FILE, 'Time-Statistics', 'time_statistics_names', params%time_statistics_names, (/ "none" /))
+      call read_param_mpi(FILE, 'Time-Statistics', 'read_from_files_time_statistics', params%read_from_files_time_statistics, .false.)
+      call read_param_mpi(FILE, 'Time-Statistics', 'time_statistics_start_time', params%time_statistics_start_time, 0.0_rk)
+   endif
+
    !**************************************************************************
    ! read INITIAL CONDITION parameters
 
@@ -61,7 +72,13 @@ subroutine ini_file_to_params( params, filename )
 
    if (params%read_from_files ) then
       ! read variable names
-      allocate( params%input_files( params%n_eqn ) )
+      N_files = params%n_eqn
+      ! sometimes we want to read in time_statistics, sometimes we do not want that, this affects the number of files to read in
+      if (params%time_statistics .and. .not. params%read_from_files_time_statistics) then
+         N_files = N_files - params%N_time_statistics
+      endif
+
+      allocate( params%input_files( N_files ) )
 
       params%input_files = "---"
       call read_param_mpi(FILE, 'Physics', 'input_files', params%input_files, params%input_files, check_file_exists=.true. )
@@ -81,7 +98,8 @@ subroutine ini_file_to_params( params, filename )
    call read_param_mpi(FILE, 'Discretization', 'poisson_order', params%poisson_order, "FD_4th_comp_1_3")
    call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_end_criteria', params%poisson_cycle_end_criteria, "fixed_iterations")
    call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_it', params%poisson_cycle_it, 6)
-   call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_tol', params%poisson_cycle_tol, 1.0e-6_rk)
+   call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_tol_abs', params%poisson_cycle_tol_abs, 1.0e-6_rk)
+   call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_tol_rel', params%poisson_cycle_tol_rel, 1.0e-3_rk)
    call read_param_mpi(FILE, 'Discretization', 'poisson_cycle_max_it', params%poisson_cycle_max_it, 100)
    call read_param_mpi(FILE, 'Discretization', 'poisson_GS_it', params%poisson_GS_it, 8)
    call read_param_mpi(FILE, 'Discretization', 'poisson_Sync_it', params%poisson_Sync_it, 2)
@@ -103,15 +121,6 @@ subroutine ini_file_to_params( params, filename )
    call read_param_mpi(FILE, 'Statistics', 'nsave_stats', params%nsave_stats, 99999999_ik )
    call read_param_mpi(FILE, 'Statistics', 'tsave_stats', params%tsave_stats, 9999999.9_rk )
 
-   !***************************************************************************
-   ! read time statistics parameters
-   call read_param_mpi(FILE, 'Time-Statistics', 'time_statistics', params%time_statistics, .false.)
-   if (params%time_statistics) then
-      call read_param_mpi(FILE, 'Time-Statistics', 'N_time_statistics', params%N_time_statistics, 1)
-      allocate( params%time_statistics_names(1:params%N_time_statistics) )
-      call read_param_mpi(FILE, 'Time-Statistics', 'time_statistics_names', params%time_statistics_names, (/ "none" /))
-   endif
-
    !***************************************************************************
    ! WABBIT needs to know about the mask function (if penalization is used): does it contain
    ! a time-dependent-part (e.g. moving obstacles, time-dependent forcing)? does it contain
@@ -133,6 +142,13 @@ subroutine ini_file_to_params( params, filename )
    call read_param_mpi(FILE, 'Debug', 'test_ghost_nodes_synch', params%test_ghost_nodes_synch, .true.)
    call read_param_mpi(FILE, 'Debug', 'test_wavelet_decomposition', params%test_wavelet_decomposition, .true.)
 
+   call read_param_mpi(FILE, 'Debug', 'debug_balanceLoad', params%debug_balanceLoad, .false.)
+   call read_param_mpi(FILE, 'Debug', 'debug_refinement', params%debug_refinement, .false.)
+   call read_param_mpi(FILE, 'Debug', 'debug_wavelet_decompose', params%debug_wavelet_decompose, .false.)
+   call read_param_mpi(FILE, 'Debug', 'debug_wavelet_reconstruct', params%debug_wavelet_reconstruct, .false.)
+   call read_param_mpi(FILE, 'Debug', 'debug_sync', params%debug_sync, .false.)
+   call read_param_mpi(FILE, 'Debug', 'debug_pruned2full', params%debug_pruned2full, .false.)
+
    ! Hack.
    ! Small ascii files are written with the module_t_files, which is just a buffered wrapper.
    ! Instead of directly dumping the files to disk, it collects data and flushes after "flush_frequency"
@@ -332,6 +348,9 @@ subroutine ini_blocks(params, FILE )
    elseif (params%wavelet(4:4) == "6") then
       g_default = 5
       g_RHS_default = 3
+   elseif (params%wavelet(4:4) == "8") then
+      g_default = 7
+      g_RHS_default = 4
    else 
       call abort(2320242, "no default specified for this wavelet...")
    endif
@@ -420,7 +439,10 @@ subroutine ini_blocks(params, FILE )
    allocate(params%threshold_state_vector_component(1:params%n_eqn))
    ! as default, use ones (all components used for indicator)
    tmp = 1.0_rk
-   call read_param_mpi(FILE, 'Blocks', 'threshold_state_vector_component',  tmp, tmp )
+   ! only read in threshold_state_vector_component if needed
+   if ((params%adapt_tree .and. params%coarsening_indicator=="threshold-state-vector") .or. (params%adapt_inicond .and. params%coarsening_indicator_inicond=="threshold-state-vector")) then
+      call read_param_mpi(FILE, 'Blocks', 'threshold_state_vector_component',  tmp, tmp )
+   end if
    do i = 1, params%n_eqn
       params%threshold_state_vector_component(i) = nint(tmp(i))
    enddo
diff --git a/LIB/MESH/module_mesh.f90 b/LIB/MESH/module_mesh.f90
index 477bfeba..04f9ee02 100644
--- a/LIB/MESH/module_mesh.f90
+++ b/LIB/MESH/module_mesh.f90
@@ -15,9 +15,6 @@ module module_mesh
     use module_operators
     ! used in executeCoarsening_tree
     use module_helpers
-    ! used for multigrid solver
-    use module_fft
-    use module_poisson
     ! if the threshold_mask option is used, then the mesh module needs to create the mask function here
     ! hence we require the metamodule to be used.
     use module_physics_metamodule
@@ -66,7 +63,6 @@ module module_mesh
 #include "check_lgt_block_synchronization.f90"
 #include "remove_nonperiodic_neighbors.f90"
 #include "forest.f90"
-#include "multigrid_vcycle.f90"
 
 ! former module_initialization
 #include "setInitialCondition_tree.f90"
diff --git a/LIB/MESH/refine_tree.f90 b/LIB/MESH/refine_tree.f90
index dc58d0f8..5b5b07c2 100644
--- a/LIB/MESH/refine_tree.f90
+++ b/LIB/MESH/refine_tree.f90
@@ -12,7 +12,7 @@
 !! output:   - light and heavy data arrays
 ! ********************************************************************************************
 
-subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_full_tree)
+subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_full_tree, time)
     use module_indicators
 
     implicit none
@@ -23,6 +23,7 @@ subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_
     integer(kind=ik), intent(in)   :: tree_ID
     logical, intent(out)   :: error_OOM !> Out-of-memory error, causes the main time loop to exit.
     logical, intent(in), optional :: check_full_tree  !> if true, checks if a full tree can be created in deterministic fashion
+    real(kind=rk), intent(in), optional :: time  !> current simulation time, used for development output only
 
 
     ! cpu time variables for running time calculation
@@ -75,7 +76,7 @@ subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_
     ! which means 2**D blocks after refinement), but is better than before.
     ! To ensure perfect balancing, a second step is done after the actual refinement.
     if (indicator /= "everywhere") then
-        call balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement=.true.)
+        call balanceLoad_tree( params, hvy_block, tree_ID, balanceForRefinement=.true., balance_name='refine_pre', time=time )
     endif
 
     !---------------------------------------------------------------------------
@@ -93,7 +94,7 @@ subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_
     !> (d) execute refinement, interpolate the new mesh. Blocks with status=+1 are refined, often
     !! that means all blocks are refined.
     t1 = MPI_Wtime()
-    call refinement_execute_tree( params, hvy_block, tree_ID )
+    call refinement_execute_tree( params, hvy_block, tree_ID, time=time )
     call toc( "refine_tree (refinement_execute)", 144, MPI_Wtime()-t1 )
 
 
@@ -115,7 +116,7 @@ subroutine refine_tree( params, hvy_block, indicator, tree_ID, error_OOM, check_
     !! This of course implies any other indicator than "everywhere" requires balancing here.
     if ((params%force_maxlevel_dealiasing .eqv. .false.) .or. (indicator/="everywhere")) then
         t1 = MPI_Wtime()
-        call balanceLoad_tree( params, hvy_block, tree_ID )
+        call balanceLoad_tree( params, hvy_block, tree_ID, balance_name='refine_post', time=time )
         call toc( "refine_tree (balanceLoad_tree)", 145, MPI_Wtime()-t1 )
     endif
 
diff --git a/LIB/MESH/refinementExecute.f90 b/LIB/MESH/refinementExecute.f90
index 61f24e0e..f91977e8 100644
--- a/LIB/MESH/refinementExecute.f90
+++ b/LIB/MESH/refinementExecute.f90
@@ -229,27 +229,52 @@ end subroutine refineBlock2SpaghettiWD
 !! input:    - params, light and heavy data \n
 !! output:   - light and heavy data arrays \n
 ! ********************************************************************************************
-subroutine refinement_execute_tree( params, hvy_block, tree_ID )
+subroutine refinement_execute_tree( params, hvy_block, tree_ID, time )
 
     implicit none
 
     type (type_params), intent(in)      :: params                               !< user defined parameter structure
     real(kind=rk), intent(inout)        :: hvy_block(:, :, :, :, :)             !< heavy data array - block data
     integer(kind=ik), intent(in)        :: tree_ID                              !< tree to work on
+    real(kind=rk), intent(in), optional :: time                                 !< current simulation time, used for development output only
 
-    integer(kind=ik)                    :: i_b, lgt_id
+    integer(kind=ik)                    :: i_b, lgt_id, ierr
+    integer(kind=ik)                    :: blocks_refined
+    integer(kind=ik), allocatable, save :: blocks_refined_list(:)
+    character(len=clong) :: format_string, string_prepare
 
     ! every proc loop over its active heavy data array
+    blocks_refined = 0
     do i_b = 1, hvy_n(tree_ID)
         ! light data id
         call hvy2lgt( lgt_id, hvy_active(i_b, tree_ID), params%rank, params%number_blocks )
 
         ! block wants to refine
         if ( (lgt_block( lgt_id, idx_refine_sts) == +1) ) then
+            blocks_refined = blocks_refined + 1
             call refineBlock(params, hvy_block, hvy_active(i_b, tree_ID), tree_ID)
         endif
     end do
 
+    if (params%debug_refinement) then
+        if (.not. allocated(blocks_refined_list)) allocate(blocks_refined_list(1:params%number_procs))
+
+        ! debug output to see how many blocks have been refined, gather information on rank 0 and print to file
+        call MPI_GATHER(blocks_refined, 1, MPI_INTEGER, blocks_refined_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+        if (params%rank == 0) then
+            open(unit=99, file=trim("debug_refinement.csv"), status="unknown", position="append")
+            string_prepare = "-1.0E+00,"  ! set negative time, just to have csv with the same length in every row
+            if (present(time)) write(string_prepare,'(es16.6,",")') time
+            if (params%number_procs == 1) then
+                write(99,'(A, i0)') trim(adjustl(string_prepare)), blocks_refined_list(1)
+            else
+                write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs - 1
+                write(99,format_string) trim(adjustl(string_prepare)), blocks_refined_list(1), blocks_refined_list(2:params%number_procs)
+            endif
+            close(99)
+        endif
+    endif
+
     ! synchronize light data
     call synchronize_lgt_data( params, refinement_status_only=.false. )
 
diff --git a/LIB/MESH/setInitialCondition_tree.f90 b/LIB/MESH/setInitialCondition_tree.f90
index 13279dc3..82dc15e7 100644
--- a/LIB/MESH/setInitialCondition_tree.f90
+++ b/LIB/MESH/setInitialCondition_tree.f90
@@ -235,7 +235,7 @@ subroutine setInitialCondition_tree(params, hvy_block, tree_ID, adapt, time, ite
     call updateMetadata_tree(params,  tree_ID)
 
     ! balance the load
-    call balanceLoad_tree(params, hvy_block, tree_ID)
+    call balanceLoad_tree(params, hvy_block, tree_ID, balance_name='setInitialCondition', time=time )
 
     ! synchronize ghosts now, in order to start with a clean grid. NOTE this can actually be removed, but
     ! it is a safety issue. Better simply keep it.
diff --git a/LIB/MPI/synchronize_ghosts_generic.f90 b/LIB/MPI/synchronize_ghosts_generic.f90
index 71d33a56..854c0441 100644
--- a/LIB/MPI/synchronize_ghosts_generic.f90
+++ b/LIB/MPI/synchronize_ghosts_generic.f90
@@ -1,6 +1,6 @@
 !> Wrapper to synch blocks with temporary flag from finer neighbours and same-level neighbors
 !> Used before wavelet decomposition
-subroutine sync_level_from_M(params, hvy_block, tree_ID, level, g_minus, g_plus)
+subroutine sync_level_from_M(params, hvy_block, tree_ID, level, g_minus, g_plus, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
@@ -8,6 +8,7 @@ subroutine sync_level_from_M(params, hvy_block, tree_ID, level, g_minus, g_plus)
     integer(kind=ik), intent(in)   :: tree_ID                       !< which tree to study
     integer(kind=ik), intent(in)   :: level                         !< level value to be checked against
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus         !< Boundary sizes in case we want to send less values
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     gminus = params%g
@@ -17,14 +18,14 @@ subroutine sync_level_from_M(params, hvy_block, tree_ID, level, g_minus, g_plus)
     if (present(g_plus))   gplus = g_plus
 
     call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, spaghetti_form=.false., sync_case="Monly_level", &
-        s_val=level)
+        s_val=level, sync_debug_name=sync_debug_name)
 
 end subroutine sync_level_from_M
 
 
 !> Wrapper to synch blocks with temporary flag from finer neighbours and same-level neighbors
 !> Used before wavelet decomposition
-subroutine sync_TMP_from_MF(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_minus, g_plus)
+subroutine sync_TMP_from_MF(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_minus, g_plus, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
@@ -34,6 +35,7 @@ subroutine sync_TMP_from_MF(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_mi
     !> heavy temp data array - block data of preserved values before the WD, used in adapt_tree as neighbours already might be wavelet decomposed
     real(kind=rk), intent(inout), optional :: hvy_tmp(:, :, :, :, :)
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus         !< Boundary sizes in case we want to send less values
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     gminus = params%g
@@ -43,7 +45,7 @@ subroutine sync_TMP_from_MF(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_mi
     if (present(g_plus))   gplus = g_plus
 
     call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, spaghetti_form=.false., sync_case="MoF_ref", &
-        s_val=ref_check, hvy_tmp=hvy_tmp)
+        s_val=ref_check, hvy_tmp=hvy_tmp, sync_debug_name=sync_debug_name)
 
 end subroutine sync_TMP_from_MF
 
@@ -51,7 +53,7 @@ end subroutine sync_TMP_from_MF
 
 !> Wrapper to synch blocks with temporary flag from all neighbors
 !> Used before wavelet decomposition
-subroutine sync_TMP_from_all(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_minus, g_plus)
+subroutine sync_TMP_from_all(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_minus, g_plus, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
@@ -61,6 +63,7 @@ subroutine sync_TMP_from_all(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_m
     !> heavy temp data array - block data of preserved values before the WD, used in adapt_tree as neighbours already might be wavelet decomposed
     real(kind=rk), intent(inout), optional :: hvy_tmp(:, :, :, :, :)
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus         !< Boundary sizes in case we want to send less values
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     gminus = params%g
@@ -70,7 +73,7 @@ subroutine sync_TMP_from_all(params, hvy_block, tree_ID, ref_check, hvy_tmp, g_m
     if (present(g_plus))   gplus = g_plus
 
     call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, spaghetti_form=.false., sync_case="full_ref", &
-        s_val=ref_check, hvy_tmp=hvy_tmp)
+        s_val=ref_check, hvy_tmp=hvy_tmp, sync_debug_name=sync_debug_name)
 
 end subroutine sync_TMP_from_all
 
@@ -78,7 +81,7 @@ end subroutine sync_TMP_from_all
 
 !> Wrapper to synch from coarser neighbours and same-level neighbors
 !! Used after coarse extension to update SC and WC, coarse neighbours need to be synched from hvy_tmp
-subroutine sync_SCWC_from_MC(params, hvy_block, tree_ID, hvy_tmp, g_minus, g_plus, level)
+subroutine sync_SCWC_from_MC(params, hvy_block, tree_ID, hvy_tmp, g_minus, g_plus, level, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
@@ -87,6 +90,7 @@ subroutine sync_SCWC_from_MC(params, hvy_block, tree_ID, hvy_tmp, g_minus, g_plu
     !> heavy temp data array - block data of preserved values before the WD, used in adapt_tree as neighbours already might be wavelet decomposed
     real(kind=rk), intent(inout)   :: hvy_tmp(:, :, :, :, :)
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus, level
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     gminus = params%g
@@ -98,17 +102,17 @@ subroutine sync_SCWC_from_MC(params, hvy_block, tree_ID, hvy_tmp, g_minus, g_plu
     ! if we passed on the level then sync is level-wise
     if (present(level)) then
         call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, spaghetti_form=.true., sync_case="MoC_level", &
-            s_val=level, hvy_tmp=hvy_tmp)
+            s_val=level, hvy_tmp=hvy_tmp, sync_debug_name=sync_debug_name)
     else
         call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, spaghetti_form=.true., sync_case="MoC", &
-            hvy_tmp=hvy_tmp)
+            hvy_tmp=hvy_tmp, sync_debug_name=sync_debug_name)
     endif
 
 end subroutine sync_SCWC_from_MC
 
 
 !> Wrapper to synch all ghost-point patches
-subroutine sync_ghosts_tree(params, hvy_block, tree_ID, g_minus, g_plus, ignore_Filter)
+subroutine sync_ghosts_tree(params, hvy_block, tree_ID, g_minus, g_plus, ignore_Filter, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
@@ -116,6 +120,7 @@ subroutine sync_ghosts_tree(params, hvy_block, tree_ID, g_minus, g_plus, ignore_
     integer(kind=ik), intent(in)   :: tree_ID                       !< which tree to study
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus       !< set specific ghost node sizes to be synced
     logical, optional, intent(in) :: ignore_Filter                  !< ignore restriction filter, defaults to false
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     logical :: do_ignore_Filter
@@ -128,7 +133,7 @@ subroutine sync_ghosts_tree(params, hvy_block, tree_ID, g_minus, g_plus, ignore_
     if (present(ignore_Filter)) do_ignore_Filter = ignore_Filter
 
     ! set level to -1 to enable synching between all, set stati to send to all levels
-    call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, sync_case="full", spaghetti_form=.false., ignore_Filter=do_ignore_Filter)
+    call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, sync_case="full", spaghetti_form=.false., ignore_Filter=do_ignore_Filter, sync_debug_name=sync_debug_name)
 
 end subroutine sync_ghosts_tree
 
@@ -141,13 +146,14 @@ end subroutine sync_ghosts_tree
 !
 !  One might ask, why for RHS with only +-shaped stencils we still synch the diagonals (edges/corners) as well?
 !  This is, because within the synching we apply the coarsening and interpolation filters at the interfaces, which need the other stencils as well!
-subroutine sync_ghosts_RHS_tree(params, hvy_block, tree_ID, g_minus, g_plus)
+subroutine sync_ghosts_RHS_tree(params, hvy_block, tree_ID, g_minus, g_plus, sync_debug_name)
     implicit none
 
     type (type_params), intent(in) :: params
     real(kind=rk), intent(inout)   :: hvy_block(:, :, :, :, :)      !< heavy data array - block data
     integer(kind=ik), intent(in)   :: tree_ID                       !< which tree to study
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     integer(kind=ik) :: gminus, gplus
     gminus = params%g
@@ -157,7 +163,7 @@ subroutine sync_ghosts_RHS_tree(params, hvy_block, tree_ID, g_minus, g_plus)
     if (present(g_plus))   gplus = g_plus
 
     ! set level to -1 to enable synching between all, set stati to send to all levels
-    call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, sync_case="full_leaf", spaghetti_form=.false., ignore_Filter=.true.)
+    call sync_ghosts_generic(params, hvy_block, tree_ID, g_minus=gminus, g_plus=gplus, sync_case="full_leaf", spaghetti_form=.false., ignore_Filter=.true., sync_debug_name=sync_debug_name)
 
 end subroutine
 
@@ -167,7 +173,7 @@ subroutine sync_ghosts_RHS_tree(params, hvy_block, tree_ID, g_minus, g_plus)
 !! In order to avoid confusion wrapper functions should be used everywhere in order to implement
 !! specific versions. This also means that parameter changes only have to be changed in the wrappers
 subroutine sync_ghosts_generic( params, hvy_block, tree_ID, sync_case, spaghetti_form, &
-    g_minus, g_plus, s_val, hvy_tmp, verbose_check, ignore_Filter)
+    g_minus, g_plus, s_val, hvy_tmp, verbose_check, ignore_Filter, sync_debug_name)
     ! it is not technically required to include the module here, but for VS code it reduces the number of wrong "errors"
     use module_params
     
@@ -187,11 +193,15 @@ subroutine sync_ghosts_generic( params, hvy_block, tree_ID, sync_case, spaghetti
     integer(kind=ik), intent(in), optional  :: s_val
     logical, intent(in), optional  :: ignore_Filter                 !< If set, coarsening will be done only with loose downsampling, not applying HD filter even in the case of lifted wavelets
     integer(kind=ik), optional, intent(in) :: g_minus, g_plus       !< Synch only so many ghost points
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
-    integer(kind=ik) :: count_send_total, Nstages, istage, gminus, gplus, k, hvy_id, lgt_id, i_dim
+    integer(kind=ik) :: count_send_total, Nstages, istage, gminus, gplus, k, hvy_id, lgt_id, i_dim, ierr
     real(kind=rk) :: t0, t1, t2, x0(1:3), dx(1:3), tolerance
     character(len=clong) :: toc_statement
     integer(kind=2) :: n_domain(1:3)
+    character(len=clong) :: format_string, string_prepare
+    integer(kind=ik)                    :: points_synched
+    integer(kind=ik), allocatable, save :: points_synched_list(:)
 
     t0 = MPI_wtime()
 
@@ -279,6 +289,38 @@ subroutine sync_ghosts_generic( params, hvy_block, tree_ID, sync_case, spaghetti
         call bound_cond_generic(params, hvy_block, tree_ID, sync_case, spaghetti_form, s_val=s_val, edges_only=istage>=2)
         call toc( "sync ghosts (bound_cond_generic)", 83, MPI_wtime()-t2 )
 
+        !***************************************************************************
+        ! (iv) debugging output (optional) - print to files how many data was sent/received
+        !***************************************************************************
+        if (params%debug_sync .and. present(sync_debug_name)) then
+            if (.not. allocated(points_synched_list)) allocate(points_synched_list(1:params%number_procs))
+
+            ! debug send volume
+            call MPI_GATHER(sum(data_send_counter), 1, MPI_INTEGER, points_synched_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                open(unit=99, file=trim("debug_sync.csv"), status="unknown", position="append")
+                if (params%number_procs == 1) then
+                    write(99,'(A, i0, ",", i0)') trim(adjustl(sync_debug_name))//",send,", istage, points_synched_list(1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                    write(99,format_string) trim(adjustl(sync_debug_name))//",send,", istage, points_synched_list(1), points_synched_list(2:params%number_procs)
+                endif
+                close(99)
+            endif
+            ! debug recv volume
+            call MPI_GATHER(sum(data_recv_counter), 1, MPI_INTEGER, points_synched_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+            if (params%rank == 0) then
+                open(unit=99, file=trim("debug_sync.csv"), status="unknown", position="append")
+                if (params%number_procs == 1) then
+                    write(99,'(A, i0, ",", i0)') trim(adjustl(sync_debug_name))//",recv,", istage, points_synched_list(1)
+                else
+                    write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                    write(99,format_string) trim(adjustl(sync_debug_name))//",recv,", istage, points_synched_list(1), points_synched_list(2:params%number_procs)
+                endif
+                close(99)
+            endif
+        endif
+
         write(toc_statement, '(A, i0, A)') "sync ghosts (stage ", istage, " TOTAL)"
         call toc( toc_statement, 83+istage, MPI_wtime()-t1 )
 
diff --git a/LIB/MPI/xfer_block_data.f90 b/LIB/MPI/xfer_block_data.f90
index e616f744..022f0796 100644
--- a/LIB/MPI/xfer_block_data.f90
+++ b/LIB/MPI/xfer_block_data.f90
@@ -576,7 +576,7 @@ subroutine finalize_xfer_mpi(params, isend, irecv, verbose_check)
 !    - saving of all metadata
 !    - computing of buffer sizes for metadata for both sending and receiving
 ! This is done strictly locally so no MPI needed here
-subroutine prepare_update_family_metadata(params, tree_ID, count_send, sync_case, ncomponents, s_val)
+subroutine prepare_update_family_metadata(params, tree_ID, count_send, sync_case, ncomponents, s_val, sync_debug_name)
 
     implicit none
 
@@ -589,6 +589,7 @@ subroutine prepare_update_family_metadata(params, tree_ID, count_send, sync_case
 
     !> Additional value to be considered for syncing logic, can be level or refinement status to which should be synced, dependend on sync case
     integer(kind=ik), intent(in), optional  :: s_val
+    character(len=*), optional, intent(in) :: sync_debug_name       !< name to be used in debug output files
 
     ! Following are global data used but defined in module_mpi:
     !    data_recv_counter, data_send_counter
@@ -600,6 +601,9 @@ subroutine prepare_update_family_metadata(params, tree_ID, count_send, sync_case
     integer(kind=ik) :: hvy_ID_n, lgt_ID_n, level_n, ref_n, rank_n
     integer(kind=tsize) :: tc
     logical :: b_send, b_recv
+    character(len=clong) :: format_string, string_prepare
+    integer(kind=ik)                    :: points_synched
+    integer(kind=ik), allocatable, save :: points_synched_list(:)
 
     !-----------------------------------------------------------------------
     ! set up constant arrays
@@ -811,6 +815,36 @@ subroutine prepare_update_family_metadata(params, tree_ID, count_send, sync_case
         endif ! check if daughters exist
     end do ! loop over all heavy active
 
+    if (params%debug_sync .and. present(sync_debug_name)) then
+        if (.not. allocated(points_synched_list)) allocate(points_synched_list(1:params%number_procs))
+
+        ! debug send volume
+        call MPI_GATHER(sum(data_send_counter), 1, MPI_INTEGER, points_synched_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+        if (params%rank == 0) then
+            open(unit=99, file=trim("debug_sync.csv"), status="unknown", position="append")
+            ! 0 is send as stage
+            if (params%number_procs == 1) then
+                write(99,'(A, i0, ",", i0)') trim(adjustl(sync_debug_name))//",send,", 0, points_synched_list(1)
+            else
+                write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                write(99,format_string) trim(adjustl(sync_debug_name))//",send,", 0, points_synched_list(1), points_synched_list(2:params%number_procs)
+            endif
+            close(99)
+        endif
+        ! debug recv volume
+        call MPI_GATHER(sum(data_recv_counter), 1, MPI_INTEGER, points_synched_list, 1, MPI_INTEGER, 0, WABBIT_COMM, ierr)
+        if (params%rank == 0) then
+            open(unit=99, file=trim("debug_sync.csv"), status="unknown", position="append")
+            ! 0 is send as stage
+            if (params%number_procs == 1) then
+                write(99,'(A, i0, ",", i0)') trim(adjustl(sync_debug_name))//",recv,", 0, points_synched_list(1)
+            else
+                write(format_string, '("(A, i0,",i0,"("","",i0))")') params%number_procs
+                write(99,format_string) trim(adjustl(sync_debug_name))//",recv,", 0, points_synched_list(1), points_synched_list(2:params%number_procs)
+            endif
+            close(99)
+        endif
+    endif
 
     ! NOTE: this feature is against wabbits memory policy: we try to allocate the
     ! whole memory of the machine on startup, then work with that. however, we have to
diff --git a/LIB/OPERATORS/module_operators.f90 b/LIB/OPERATORS/module_operators.f90
index 00df205f..ebaae3f2 100644
--- a/LIB/OPERATORS/module_operators.f90
+++ b/LIB/OPERATORS/module_operators.f90
@@ -33,6 +33,19 @@ module module_operators
 real(kind=rk), parameter :: FD2_C6(-3:3) = (/  2.0_rk, -27.0_rk,   270.0_rk, -490.0_rk,   270.0_rk,  -27.0_rk,    2.0_rk/) / 180.0_rk  ! 6th order central difference for second derivative
 real(kind=rk), parameter :: FD2_C8(-4:4) = (/ -9.0_rk,  128.0_rk, -1008.0_rk, 8064.0_rk, -14350.0_rk, 8064.0_rk, -1008.0_rk, 128.0_rk, -9.0_rk /) / 5040_rk  ! 8th order central difference for second derivative
 
+! Composite second derivative stencils (from module_poisson)
+real(kind=rk), parameter :: FD2_COMP3_1_2(-3:3) = (/ -2.0_rk, 9.0_rk, 18.0_rk, -50.0_rk, 18.0_rk, 9.0_rk, -2.0_rk /) / 36.0_rk  ! 3th order composite difference (-1,2)
+real(kind=rk), parameter :: FD2_COMP4_0_4(-4:4) = (/ -75.0_rk, 544.0_rk, -1776.0_rk, 3552.0_rk, -4490.0_rk, 3552.0_rk, -1776.0_rk, 544.0_rk, -75.0_rk /) / 144.0_rk  ! 4th order composite difference (0,4)
+real(kind=rk), parameter :: FD2_COMP4_2_2(-4:4) = (/ 1.0_rk, -16.0_rk, 64.0_rk, 16.0_rk, -130.0_rk, 16.0_rk, 64.0_rk, -16.0_rk, 1.0_rk /) / 144.0_rk  ! 4th order composite difference (2,2)
+real(kind=rk), parameter :: FD2_COMP4_1_3(-4:4) = (/ 3.0_rk, -8.0_rk, -24.0_rk, 264.0_rk, -470.0_rk, 264.0_rk, -24.0_rk, -8.0_rk, 3.0_rk /) / 144.0_rk  ! 4th order composite difference (1,3)
+real(kind=rk), parameter :: FD2_COMP5_2_3(-5:5) = (/ -6.0_rk, 105.0_rk, -590.0_rk, 1440.0_rk, 1620.0_rk, -5138.0_rk, 1620.0_rk, 1440.0_rk, -590.0_rk, 105.0_rk, -6.0_rk /) / 3600.0_rk  ! 5th order composite difference (2,3)
+real(kind=rk), parameter :: FD2_COMP6_3_3(-6:6) = (/ 1.0_rk, -18.0_rk, 171.0_rk, -810.0_rk, 1935.0_rk, 828.0_rk, -4214.0_rk, 828.0_rk, 1935.0_rk, -810.0_rk, 171.0_rk, -18.0_rk, 1.0_rk /) / 3600.0_rk  ! 6th order composite difference (3,3)
+real(kind=rk), parameter :: FD2_COMP6_2_4(-6:6) = (/ 2.0_rk, -40.0_rk, 217.0_rk, -520.0_rk, 270.0_rk, 4656.0_rk, -9170.0_rk, 4656.0_rk, 270.0_rk, -520.0_rk, 217.0_rk, -40.0_rk, 2.0_rk /) / 3600.0_rk  ! 6th order composite difference (2,4)
+real(kind=rk), parameter :: FD2_COMP6_1_5(-6:6) = (/ 20.0_rk, 4.0_rk, -955.0_rk, 5300.0_rk, -15300.0_rk, 31560.0_rk, -41258.0_rk, 31560.0_rk, -15300.0_rk, 5300.0_rk, -955.0_rk, 4.0_rk, 20.0_rk /) / 3600.0_rk  ! 6th order composite difference (1,5)
+real(kind=rk), parameter :: FD2_COMP6_0_6(-6:6) = (/ -1470.0_rk, 14184.0_rk, -63495.0_rk, 176200.0_rk, -342450.0_rk, 501840.0_rk, -569618.0_rk, 501840.0_rk, -342450.0_rk, 176200.0_rk, -63495.0_rk, 14184.0_rk, -1470.0_rk /) / 3600.0_rk  ! 6th order composite difference (0,6)
+real(kind=rk), parameter :: FD2_COMP7_3_4(-7:7) = (/ -12.0_rk, 238.0_rk, -2436.0_rk, 13713.0_rk, -45612.0_rk, 83874.0_rk, 84924.0_rk, -269378.0_rk, 84924.0_rk, 83874.0_rk, -45612.0_rk, 13713.0_rk, -2436.0_rk, 238.0_rk, -12.0_rk /) / 176400.0_rk  ! 7th order composite difference (3,4)
+real(kind=rk), parameter :: FD2_COMP8_3_5(-8:8) = (/ 15.0_rk, -330.0_rk, 3760.0_rk, -21966.0_rk, 74760.0_rk, -155610.0_rk, 142800.0_rk, 767730.0_rk, -1622318.0_rk, 767730.0_rk, 142800.0_rk, -155610.0_rk, 74760.0_rk, -21966.0_rk, 3760.0_rk, -330.0_rk, 15.0_rk /) / 705600.0_rk  ! 8th order composite difference (3,5)
+
 ! Cross derivative coefficients (mixed partials d²/dxdy, d²/dxdz, d²/dydz)
 ! These are applied in a cross-shaped pattern along diagonals
 ! 2nd order cross derivative: 3-point stencil applied cross-wise
@@ -43,18 +56,22 @@ module module_operators
 
 ! Forward finite difference stencils for first derivative (used at left boundaries)
 ! R0 = starting from point 0, using points to the right
-real(kind=rk), parameter :: FD1_R0_1(0:1) = (/ -1.0_rk, 1.0_rk /)  ! 1st order forward difference (2-point)
-real(kind=rk), parameter :: FD1_R0_2(0:2) = (/ -3.0_rk, 6.0_rk, -1.0_rk /) / 2.0_rk  ! 2nd order forward difference (3-point)
-real(kind=rk), parameter :: FD1_R0_3(0:3) = (/ -11.0_rk, 18.0_rk, -9.0_rk, 2.0_rk /) / 6.0_rk  ! 3rd order forward difference (4-point)
-real(kind=rk), parameter :: FD1_R0_4(0:4) = (/ -25.0_rk, 48.0_rk, -36.0_rk, 16.0_rk, -3.0_rk /) / 12.0_rk  ! 4th order forward difference (5-point)
-real(kind=rk), parameter :: FD1_R0_5(0:5) = (/ -137.0_rk, 300.0_rk, -300.0_rk, 200.0_rk, -75.0_rk, 12.0_rk /) / 60.0_rk  ! 5th order forward difference (6-point)
-real(kind=rk), parameter :: FD1_R0_6(0:6) = (/ -147.0_rk, 360.0_rk, -450.0_rk, 400.0_rk, -225.0_rk, 72.0_rk, -10.0_rk /) / 60.0_rk  ! 6th order forward difference (7-point)
+real(kind=rk), parameter :: FD1_COMP1_0_1(0:1) = (/ -1.0_rk, 1.0_rk /)  ! 1st order forward difference (2-point)
+real(kind=rk), parameter :: FD1_COMP2_0_2(0:2) = (/ -3.0_rk, 6.0_rk, -1.0_rk /) / 2.0_rk  ! 2nd order forward difference (3-point)
+real(kind=rk), parameter :: FD1_COMP3_0_3(0:3) = (/ -11.0_rk, 18.0_rk, -9.0_rk, 2.0_rk /) / 6.0_rk  ! 3rd order forward difference (4-point)
+real(kind=rk), parameter :: FD1_COMP4_0_4(0:4) = (/ -25.0_rk, 48.0_rk, -36.0_rk, 16.0_rk, -3.0_rk /) / 12.0_rk  ! 4th order forward difference (5-point)
+real(kind=rk), parameter :: FD1_COMP5_0_5(0:5) = (/ -137.0_rk, 300.0_rk, -300.0_rk, 200.0_rk, -75.0_rk, 12.0_rk /) / 60.0_rk  ! 5th order forward difference (6-point)
+real(kind=rk), parameter :: FD1_COMP6_0_6(0:6) = (/ -147.0_rk, 360.0_rk, -450.0_rk, 400.0_rk, -225.0_rk, 72.0_rk, -10.0_rk /) / 60.0_rk  ! 6th order forward difference (7-point)
 
 ! Biased finite difference stencils for first derivative (used near boundaries)
 ! R1 = starting from point -1, using mostly points to the right
-real(kind=rk), parameter :: FD1_R1_3(-1:2) = (/ -2.0_rk, -3.0_rk, 6.0_rk, -1.0_rk /) / 6.0_rk  ! 3rd order biased difference (4-point, stencil from -1 to +2)
-real(kind=rk), parameter :: FD1_R1_4(-1:3) = (/ -3.0_rk, -10.0_rk, 18.0_rk, -6.0_rk, 1.0_rk /) / 12.0_rk  ! 4th order biased difference (5-point, stencil from -1 to +3)
-real(kind=rk), parameter :: FD1_R2_6(-2:4) = (/ 2.0_rk, -24.0_rk, -35.0_rk, 80.0_rk, -30.0_rk, 8.0_rk, -1.0_rk /) / 60.0_rk  ! 6th order biased difference (7-point, stencil from -2 to +4)
+real(kind=rk), parameter :: FD1_COMP3_1_2(-1:2) = (/ -2.0_rk, -3.0_rk, 6.0_rk, -1.0_rk /) / 6.0_rk  ! 3rd order biased difference (4-point, stencil from -1 to +2)
+real(kind=rk), parameter :: FD1_COMP4_1_3(-1:3) = (/ -3.0_rk, -10.0_rk, 18.0_rk, -6.0_rk, 1.0_rk /) / 12.0_rk  ! 4th order biased difference (5-point, stencil from -1 to +3)
+real(kind=rk), parameter :: FD1_COMP5_2_3(-2:3) = (/  3.0_rk, -30.0_rk, -20.0_rk, 60.0_rk, -15.0_rk, 2.0_rk /) / 60.0_rk  ! 5th order biased difference (6-point, stencil from -2 to +3)
+real(kind=rk), parameter :: FD1_COMP6_1_5(-1:5) = (/ -10.0_rk, -77.0_rk, 150.0_rk, -100.0_rk, 50.0_rk, -15.0_rk, 2.0_rk /) / 60.0_rk  ! 6th order biased difference (7-point, stencil from -1 to +5)
+real(kind=rk), parameter :: FD1_COMP6_2_4(-2:4) = (/  2.0_rk, -24.0_rk, -35.0_rk, 80.0_rk, -30.0_rk, 8.0_rk, -1.0_rk /) / 60.0_rk  ! 6th order biased difference (7-point, stencil from -2 to +4)
+real(kind=rk), parameter :: FD1_COMP7_3_4(-3:4) = (/ -4.0_rk, 42.0_rk, -252.0_rk, -105.0_rk, 420.0_rk, -126.0_rk, 28.0_rk, -3.0_rk /) / 420.0_rk  ! 7th order biased difference (8-point, stencil from -3 to +4)
+real(kind=rk), parameter :: FD1_COMP8_3_5(-3:5) = (/ -5.0_rk, 60.0_rk, -420.0_rk, -378.0_rk, 1050.0_rk, -420.0_rk, 140.0_rk, -30.0_rk, 3.0_rk /) / 840.0_rk  ! 8th order biased difference (9-point, stencil from -3 to +5)
 
 !**********************************************************************************************
 ! These are the important routines that are visible to WABBIT:
@@ -105,21 +122,46 @@ subroutine setup_FD1_left_stencil(FD1_order_discretization_in, FD1_l_array, l_st
             l_end = 4
             allocate(FD1_l_array(l_start:l_end))
             FD1_l_array(:) = FD1_C8(:)
+        case("FD_3th_comp_1_2")
+            l_start = -2
+            l_end = 1
+            allocate(FD1_l_array(l_start:l_end))
+            FD1_l_array(:) = FD1_COMP3_1_2(-l_start:-l_end:-1) ! left side is reversed
         case("FD_4th_comp_0_4")
             l_start = -4
             l_end = 0
             allocate(FD1_l_array(l_start:l_end))
-            FD1_l_array(:) = -FD1_R0_4(-l_start:-l_end:-1) ! left side is reversed
+            FD1_l_array(:) = -FD1_COMP4_0_4(-l_start:-l_end:-1) ! left side is reversed
         case("FD_4th_comp_1_3")
             l_start = -3
             l_end = 1
             allocate(FD1_l_array(l_start:l_end))
-            FD1_l_array(:) = -FD1_R1_4(-l_start:-l_end:-1) ! left side is reversed
+            FD1_l_array(:) = -FD1_COMP4_1_3(-l_start:-l_end:-1) ! left side is reversed
+        case("FD_5th_comp_2_3")
+            l_start = -3
+            l_end = 2
+            allocate(FD1_l_array(l_start:l_end))
+            FD1_l_array(:) = -FD1_COMP5_2_3(-l_start:-l_end:-1) ! left side is reversed
+        case("FD_6th_comp_1_5")
+            l_start = -5
+            l_end = 1
+            allocate(FD1_l_array(l_start:l_end))
+            FD1_l_array(:) = -FD1_COMP6_1_5(-l_start:-l_end:-1) ! left side is reversed
         case("FD_6th_comp_2_4")
             l_start = -4
             l_end = 2
             allocate(FD1_l_array(l_start:l_end))
-            FD1_l_array(:) = -FD1_R2_6(-l_start:-l_end:-1) ! left side is reversed
+            FD1_l_array(:) = -FD1_COMP6_2_4(-l_start:-l_end:-1) ! left side is reversed
+        case("FD_7th_comp_3_4")
+            l_start = -4
+            l_end = 3
+            allocate(FD1_l_array(l_start:l_end))
+            FD1_l_array(:) = -FD1_COMP7_3_4(-l_start:-l_end:-1) ! left side is reversed
+        case("FD_8th_comp_3_5")
+            l_start = -5
+            l_end = 3
+            allocate(FD1_l_array(l_start:l_end))
+            FD1_l_array(:) = -FD1_COMP8_3_5(-l_start:-l_end:-1) ! left side is reversed
         case default
             call abort(250615, "ERROR: order of FD1 discretization not known: " // trim(FD1_order_discretization_in))
     end select
@@ -158,21 +200,46 @@ subroutine setup_FD1_right_stencil(FD1_order_discretization_in, FD1_r_array, r_s
             r_end = 4
             allocate(FD1_r_array(r_start:r_end))
             FD1_r_array(:) = FD1_C8(:)
+        case("FD_3th_comp_1_2")
+            r_start = -1
+            r_end = 2
+            allocate(FD1_r_array(r_start:r_end))
+            FD1_r_array(:) = FD1_COMP3_1_2(:)
         case("FD_4th_comp_0_4")
             r_start = 0
             r_end = 4
             allocate(FD1_r_array(r_start:r_end))
-            FD1_r_array(:) = FD1_R0_4(:)
+            FD1_r_array(:) = FD1_COMP4_0_4(:)
         case("FD_4th_comp_1_3")
             r_start = -1
             r_end = 3
             allocate(FD1_r_array(r_start:r_end))
-            FD1_r_array(:) = FD1_R1_4(:)
+            FD1_r_array(:) = FD1_COMP4_1_3(:)
+        case("FD_5th_comp_2_3")
+            r_start = -2
+            r_end = 3
+            allocate(FD1_r_array(r_start:r_end))
+            FD1_r_array(:) = FD1_COMP5_2_3(:)
         case("FD_6th_comp_2_4")
             r_start = -2
             r_end = 4
             allocate(FD1_r_array(r_start:r_end))
-            FD1_r_array(:) = FD1_R2_6(:)
+            FD1_r_array(:) = FD1_COMP6_2_4(:)
+        case("FD_6th_comp_1_5")
+            r_start = -1
+            r_end = 5
+            allocate(FD1_r_array(r_start:r_end))
+            FD1_r_array(:) = FD1_COMP6_1_5(:)
+        case("FD_7th_comp_3_4")
+            r_start = -3
+            r_end = 4
+            allocate(FD1_r_array(r_start:r_end))
+            FD1_r_array(:) = FD1_COMP7_3_4(:)
+        case("FD_8th_comp_3_5")
+            r_start = -3
+            r_end = 5
+            allocate(FD1_r_array(r_start:r_end))
+            FD1_r_array(:) = FD1_COMP8_3_5(:)
         case default
             call abort(250615, "ERROR: order of FD1 discretization not known: " // trim(FD1_order_discretization_in))
     end select
@@ -181,12 +248,24 @@ end subroutine setup_FD1_right_stencil
 !**********************************************************************************************
 !> \brief setup the FD2 stencils
 !> \details This routine sets up the second derivative stencils based on the discretization order
-subroutine setup_FD2_stencil(FD2_order_discretization_in, FD2_array, fd2_start, fd2_end)
+subroutine setup_FD2_stencil(FD2_order_discretization_in, FD2_array, fd2_start, fd2_end, use_exact_discretization)
     implicit none
     
     character(len=*), intent(in) :: FD2_order_discretization_in !> order of the second derivative discretization
     real(kind=rk), allocatable, intent(inout) :: FD2_array(:) !> second derivative stencil array
     integer(kind=ik), intent(inout) :: fd2_start, fd2_end !> second derivative stencil bounds
+    logical, intent(in), optional :: use_exact_discretization !> if true, use exact composite stencils; if false (default), use central stencils for composite orders
+    
+    logical :: use_exact
+    
+    ! We need the second order stencils for two reasons:
+    ! 1) to compute the diffusion term
+    ! 2) to solve the pressure-poisson equation
+    ! For the first case, operator compatibility is not too important and we choose the best available stencil (central one)
+    ! For the second case, operator compatibility is crucial and we need the correct stencils
+    ! default behaviour is dictated by RHS, so for now we need to set use_exact_discretization = .true. when wanting to choose the coinciding stencil
+    use_exact = .false.
+    if (present(use_exact_discretization)) use_exact = use_exact_discretization
     
     if (allocated(FD2_array)) deallocate(FD2_array)
     
@@ -196,21 +275,163 @@ subroutine setup_FD2_stencil(FD2_order_discretization_in, FD2_array, fd2_start,
             fd2_end = 1
             allocate(FD2_array(fd2_start:fd2_end))
             FD2_array(:) = FD2_C2(:)
-        case("FD_4th_central", "FD_4th_comp_0_4", "FD_4th_comp_1_3", "FD_4th_comp_2_2")
+        case("FD_3rd_comp_1_2")
+            if (use_exact) then
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP3_1_2(:)
+            ! no corresponding 3rd order central stencil, fallback to 4th order
+            else
+                fd2_start = -2
+                fd2_end = 2
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C4(:)
+            endif
+        case("FD_4th_central")
             fd2_start = -2
             fd2_end = 2
             allocate(FD2_array(fd2_start:fd2_end))
             FD2_array(:) = FD2_C4(:)
-        case("FD_6th_central", "FD_6th_comp_0_6", "FD_6th_comp_1_5", "FD_6th_comp_2_4")
+        case("FD_4th_comp_0_4")
+            if (use_exact) then
+                fd2_start = -4
+                fd2_end = 4
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP4_0_4(:)
+            else
+                ! fall back to central 4th order stencil
+                fd2_start = -2
+                fd2_end = 2
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C4(:)
+            endif
+        case("FD_4th_comp_2_2")
+            if (use_exact) then
+                fd2_start = -4
+                fd2_end = 4
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP4_2_2(:)
+            else
+                ! fall back to central 4th order stencil
+                fd2_start = -2
+                fd2_end = 2
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C4(:)
+            endif
+        case("FD_4th_comp_1_3")
+            if (use_exact) then
+                fd2_start = -4
+                fd2_end = 4
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP4_1_3(:)
+            else
+                ! fall back to central 4th order stencil
+                fd2_start = -2
+                fd2_end = 2
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C4(:)
+            endif
+        case("FD_5th_comp_2_3")
+            if (use_exact) then
+                fd2_start = -5
+                fd2_end = 5
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP5_2_3(:)
+            ! no corresponding 5th order central stencil, fallback to 6th order
+            else
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C6(:)
+            endif
+        case("FD_6th_central")
             fd2_start = -3
             fd2_end = 3
             allocate(FD2_array(fd2_start:fd2_end))
             FD2_array(:) = FD2_C6(:)
+        case("FD_6th_comp_3_3")
+            if (use_exact) then
+                fd2_start = -6
+                fd2_end = 6
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP6_3_3(:)
+            else
+                ! fall back to central 6th order stencil
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C6(:)
+            endif
+        case("FD_6th_comp_2_4")
+            if (use_exact) then
+                fd2_start = -6
+                fd2_end = 6
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP6_2_4(:)
+            else
+                ! fall back to central 6th order stencil
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C6(:)
+            endif
+        case("FD_6th_comp_1_5")
+            if (use_exact) then
+                fd2_start = -6
+                fd2_end = 6
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP6_1_5(:)
+            else
+                ! fall back to central 6th order stencil
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C6(:)
+            endif
+        case("FD_6th_comp_0_6")
+            if (use_exact) then
+                fd2_start = -6
+                fd2_end = 6
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP6_0_6(:)
+            else
+                ! fall back to central 6th order stencil
+                fd2_start = -3
+                fd2_end = 3
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C6(:)
+            endif
+        case("FD_7th_comp_3_4")
+            if (use_exact) then
+                fd2_start = -7
+                fd2_end = 7
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP7_3_4(:)
+            ! no corresponding 7th order central stencil, fallback to 8th order
+            else
+                fd2_start = -4
+                fd2_end = 4
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C8(:)
+            endif
         case("FD_8th_central")
             fd2_start = -4
             fd2_end = 4
             allocate(FD2_array(fd2_start:fd2_end))
             FD2_array(:) = FD2_C8(:)
+        case("FD_8th_comp_3_5")
+            if (use_exact) then
+                fd2_start = -8
+                fd2_end = 8
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_COMP8_3_5(:)
+            else
+                fd2_start = -4
+                fd2_end = 4
+                allocate(FD2_array(fd2_start:fd2_end))
+                FD2_array(:) = FD2_C8(:)
+            endif
         case default
             call abort(250616, "ERROR: order of FD2 discretization not known: " // trim(FD2_order_discretization_in))
     end select
diff --git a/LIB/PARAMS/module_ini_files_parser.f90 b/LIB/PARAMS/module_ini_files_parser.f90
index 12fa0ff7..f2f0767e 100644
--- a/LIB/PARAMS/module_ini_files_parser.f90
+++ b/LIB/PARAMS/module_ini_files_parser.f90
@@ -369,7 +369,7 @@ subroutine param_dbl (PARAMS, section, keyword, params_real, defaultvalue)
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section), trim(keyword), adjustl(trim(value))
+                write (*,FORMAT1) trim(section), trim(keyword), trim(adjustl(value))
             endif
         end subroutine param_dbl
 
@@ -420,7 +420,7 @@ subroutine param_sgl (PARAMS, section, keyword, params_real, defaultvalue)
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section), trim(keyword), adjustl(trim(value))
+                write (*,FORMAT1) trim(section), trim(keyword), trim(adjustl(value))
             endif
         end subroutine param_sgl
 
@@ -470,7 +470,7 @@ subroutine param_str (PARAMS, section, keyword, params_string, defaultvalue, che
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section), trim(keyword), adjustl(trim(value))
+                write (*,FORMAT1) trim(section), trim(keyword), trim(adjustl(value))
             endif
         end subroutine param_str
 
@@ -538,7 +538,7 @@ subroutine param_vct (PARAMS, section, keyword, params_vector, defaultvalue)
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section), trim(keyword), adjustl(trim(value))
+                write (*,FORMAT1) trim(section), trim(keyword), trim(adjustl(value))
             endif
         end subroutine param_vct
 
@@ -627,7 +627,7 @@ subroutine param_vct_str (PARAMS, section, keyword, params_vector, defaultvalue,
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section),trim(keyword),adjustl(trim(value))
+                write (*,FORMAT1) trim(section),trim(keyword),trim(adjustl(value))
             endif
         end subroutine param_vct_str
 
@@ -665,7 +665,7 @@ subroutine param_int(PARAMS, section, keyword, params_int, defaultvalue)
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section),trim(keyword),adjustl(trim(value))
+                write (*,FORMAT1) trim(section),trim(keyword),trim(adjustl(value))
             endif
         end subroutine param_int
 
@@ -709,7 +709,7 @@ subroutine param_bool(PARAMS, section, keyword, params_bool, defaultvalue)
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section),trim(keyword),adjustl(trim(value))
+                write (*,FORMAT1) trim(section),trim(keyword),trim(adjustl(value))
             endif
         end subroutine param_bool
 
@@ -783,7 +783,7 @@ subroutine param_vct_bool (PARAMS, section, keyword, params_vector, defaultvalue
 
             ! in verbose mode, inform about what we did
             if (verbosity) then
-                write (*,FORMAT1) trim(section),trim(keyword),adjustl(trim(value))
+                write (*,FORMAT1) trim(section),trim(keyword),trim(adjustl(value))
             endif
         end subroutine param_vct_bool
 
diff --git a/LIB/PARAMS/module_params.f90 b/LIB/PARAMS/module_params.f90
index 642f6884..3323f997 100644
--- a/LIB/PARAMS/module_params.f90
+++ b/LIB/PARAMS/module_params.f90
@@ -89,7 +89,8 @@ module module_params
         integer(kind=ik)      :: poisson_stencil_size=0
         character(len=cshort) :: poisson_cycle_end_criteria="not-initialized"
         integer(kind=ik)      :: poisson_cycle_it=0
-        real(kind=rk)         :: poisson_cycle_tol=0.0_rk
+        real(kind=rk)         :: poisson_cycle_tol_abs=0.0_rk
+        real(kind=rk)         :: poisson_cycle_tol_rel=0.0_rk
         integer(kind=ik)      :: poisson_cycle_max_it=0
         integer(kind=ik)      :: poisson_GS_it=0
         integer(kind=ik)      :: poisson_Sync_it=0
@@ -141,6 +142,8 @@ module module_params
         logical :: time_statistics = .false.
         integer(kind=ik) :: N_time_statistics = 0
         character(len=cshort), allocatable :: time_statistics_names(:)
+        logical :: read_from_files_time_statistics = .false.
+        real(kind=rk) :: time_statistics_start_time = 0.0_rk
 
         ! -------------------------------------------------------------------------------------
         ! MPI
@@ -173,10 +176,13 @@ module module_params
         logical :: use_iteration_as_fileid = .false.
 
         ! -------------------------------------------------------------------------------------
-        ! unit test
+        ! unit tests and debugging
         ! -------------------------------------------------------------------------------------
         logical :: test_treecode=.false., test_ghost_nodes_synch=.true., test_wavelet_decomposition=.true.
 
+        logical :: debug_balanceLoad = .false., debug_refinement = .false., debug_wavelet_decompose = .false.
+        logical :: debug_wavelet_reconstruct = .false., debug_sync = .false., debug_pruned2full = .false.
+
         ! -------------------------------------------------------------------------------------
         ! filter
         ! -------------------------------------------------------------------------------------
diff --git a/LIB/POISSON/module_poisson.f90 b/LIB/POISSON/module_poisson.f90
index d7474015..98dadd65 100644
--- a/LIB/POISSON/module_poisson.f90
+++ b/LIB/POISSON/module_poisson.f90
@@ -14,6 +14,10 @@ module module_poisson
     use module_forestMetaData
     use module_timing               ! debug module
     use module_treelib              ! module with evrything related to treecodes (encoding, decoding, neighbors, etc)
+    use module_mesh                 ! needed for multigrid
+    use module_mpi                  ! needed for multigrid
+    use module_fft                  ! needed for multigrid
+    use module_operators, only: setup_FD2_stencil  ! for finite difference stencils
 
     implicit none
 
@@ -34,7 +38,7 @@ module module_poisson
         ! different stencils, either for one-dimensional cross-stencil form or tensorial form
         real(kind=rk), allocatable  :: stencil(:), stencil_tensor_2D(:,:), stencil_tensor_3D(:,:,:), stencil_RHS_tensor_2D(:,:), stencil_RHS_tensor_3D(:,:,:)
         real(kind=rk)  :: stencil_RHS
-        integer(kind=ik) :: stencil_size, stencil_RHS_size
+        integer(kind=ik) :: stencil_RHS_size
         logical :: use_tensor
 
 
@@ -42,17 +46,22 @@ module module_poisson
 !---------------------------------------------------------------------------------------------
 ! public parts of this module
     
-    PUBLIC :: GS_iteration_level, GS_iteration_ref, GS_compute_residual, GS_compute_Ax, CG_solve_poisson_level0, setup_Laplacian_stencils
+    PUBLIC :: GS_iteration_level, GS_iteration_ref, GS_compute_residual, GS_compute_Ax, CG_solve_poisson_level0, setup_Laplacian_stencils, multigrid_vcycle, multigrid_solve
 
 
 contains
 
+#include "multigrid_vcycle.f90"
+
     subroutine setup_Laplacian_stencils(params, g)
         implicit none
         !> parameter struct
         type (type_params), intent(inout)  :: params
         integer(kind=ik), intent(inout), optional :: g  ! ghost points, if present then we set it if we require more points
 
+        ! local variables for setup_FD2_stencil
+        integer(kind=ik) :: fd2_start, fd2_end
+
         ! prepare stencil
         if (allocated(stencil)) deallocate(stencil)
         if (allocated(stencil_tensor_2D)) deallocate(stencil_tensor_2D)
@@ -60,95 +69,19 @@ subroutine setup_Laplacian_stencils(params, g)
         if (allocated(stencil_RHS_tensor_2D)) deallocate(stencil_RHS_tensor_2D)
         if (allocated(stencil_RHS_tensor_3D)) deallocate(stencil_RHS_tensor_3D)
 
-        if (params%poisson_order == "FD_2nd_central") then
-            allocate(stencil(-1:1))
-            stencil = (/  1.0_rk, -2.0_rk,    1.0_rk /)
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 1
-            stencil_size = 1
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_4th_central") then
-            allocate(stencil(-2:2))
-            stencil = (/ -1.0_rk,  16.0_rk,  -30.0_rk,   16.0_rk,   -1.0_rk /) / 12.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 2
-            stencil_size = 2
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_central") then
-            allocate(stencil(-3:3))
-            stencil = (/  2.0_rk, -27.0_rk,   270.0_rk, -490.0_rk,   270.0_rk,  -27.0_rk,    2.0_rk/) / 180.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 3
-            stencil_size = 3
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_8th_central") then
-            allocate(stencil(-4:4))
-            stencil = (/ -9.0_rk,  128.0_rk, -1008.0_rk, 8064.0_rk, -14350.0_rk, 8064.0_rk, -1008.0_rk, 128.0_rk, -9.0_rk /) / 5040_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 4
-            stencil_size = 4
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_4th_comp_0_4") then
-            allocate(stencil(-4:4))
-            stencil = (/  -75.0_rk, 544.0_rk, -1776.0_rk, 3552.0_rk, -4490.0_rk, 3552.0_rk, -1776.0_rk, 544.0_rk, -75.0_rk /) / 144.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 4
-            stencil_size = 4
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_4th_comp_2_2") then
-            allocate(stencil(-4:4))
-            stencil = (/  1.0_rk, -16.0_rk, 64.0_rk, 16.0_rk, -130.0_rk, 16.0_rk, 64.0_rk, -16.0_rk, 1.0_rk /) / 144.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 4
-            stencil_size = 4
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_4th_comp_1_3") then
-            allocate(stencil(-4:4))
-            stencil = (/  3.0_rk, -8.0_rk, -24.0_rk, 264.0_rk, -470.0_rk, 264.0_rk, -24.0_rk, -8.0_rk, 3.0_rk /) / 144.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 4
-            stencil_size = 4
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_comp_3_3") then
-            allocate(stencil(-6:6))
-            stencil = (/  1.0_rk, -18.0_rk, 171.0_rk, -810.0_rk, 1935.0_rk, 828.0_rk, -4214.0_rk, 828.0_rk, 1935.0_rk, -810.0_rk, 171.0_rk, -18.0_rk, 1.0_rk /) / 3600.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 6
-            stencil_size = 6
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_comp_2_4") then
-            allocate(stencil(-6:6))
-            stencil = (/  2.0_rk, -40.0_rk, 217.0_rk, -520.0_rk, 270.0_rk, 4656.0_rk, -9170.0_rk, 4656.0_rk, 270.0_rk, -520.0_rk, 217.0_rk, -40.0_rk, 2.0_rk /) / 3600.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 6
-            stencil_size = 6
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_comp_1_5") then
-            allocate(stencil(-6:6))
-            stencil = (/  20.0_rk, 4.0_rk, -955.0_rk, 5300.0_rk, -15300.0_rk, 31560.0_rk, -41258.0_rk, 31560.0_rk, -15300.0_rk, 5300.0_rk, -955.0_rk, 4.0_rk, 20.0_rk /) / 3600.0_rk
-            stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 6
-            stencil_size = 6
-            stencil_RHS_size = 0
-            use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_comp_0_6") then
-            allocate(stencil(-6:6))
-            stencil = (/  -1470.0_rk, 14184.0_rk, -63495.0_rk, 176200.0_rk, -342450.0_rk, 501840.0_rk, -569618.0_rk, 501840.0_rk, -342450.0_rk, 176200.0_rk, -63495.0_rk, 14184.0_rk, -1470.0_rk /) / 3600.0_rk
+        ! check if we have a standard stencil (1D cross-shaped) or the special Mehrstellenverfahren
+        if (.not. params%poisson_order == "FD_6th_mehrstellen") then
+            ! use standard 1D stencils via module_operators setup_FD2_stencil
+            ! for Poisson solver, we want to use exact discretizations for composite stencils to preserve compatibility between operators
+            call setup_FD2_stencil(params%poisson_order, stencil, fd2_start, fd2_end, use_exact_discretization=.true.)
+            
+            ! set module variables for compatibility with existing code
             stencil_RHS = 1.0_rk
-            params%poisson_stencil_size = 6
-            stencil_size = 6
+            params%poisson_stencil_size = max(abs(fd2_start), abs(fd2_end))
             stencil_RHS_size = 0
             use_tensor = .false.
-        elseif (params%poisson_order == "FD_6th_mehrstellen") then
+        else
+            ! handle the special Mehrstellenverfahren case with tensor stencils
             if (params%dim == 2) then
                 allocate(stencil_tensor_2D(-1:1, -1:1))
                 allocate(stencil_RHS_tensor_2D(-2:2, -2:2))
@@ -201,11 +134,8 @@ subroutine setup_Laplacian_stencils(params, g)
                 0.0_rk,  0.0_rk,   0.0_rk,  0.0_rk,  0.0_rk /), (/5,5,5/) ) / 720.0_rk
             endif
             params%poisson_stencil_size = 1
-            stencil_size = 1
             stencil_RHS_size = 2
             use_tensor = .true.
-        else
-            call abort(250612, "I don't know about this laplacian discretization order: "//trim(adjustl(params%poisson_order))//" in GS_compute_residual")
         endif
 
         if (present(g)) then
@@ -320,19 +250,163 @@ subroutine GS_iteration(params, u, b, dx, sweep_forward, filter_offset)
         do ic = 1, size(u,4)
             if (.not. use_tensor) then
                 if (params%dim == 2) then
-                    do iy = is(2), ie(2), dir
-                        do ix = is(1), ie(1), dir
-                            u(ix,iy,1,ic) = (-sum(stencil * u(ix-a:ix+a,iy,1,ic)) - sum(stencil * u(ix,iy-a:iy+a,1,ic)) + b(ix,iy,1,ic)*dx(1)*dx(1) + params%dim*stencil(0)*u(ix,iy,1,ic)) / (stencil(0)*params%dim)
-                        enddo
-                    enddo
+                    select case(a)
+                    case(1)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-1)*u(ix-1,iy,1,ic) - stencil(1)*u(ix+1,iy,1,ic) &
+                                - stencil(-1)*u(ix,iy-1,1,ic) - stencil(1)*u(ix,iy+1,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(2)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) &
+                                - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(3)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) &
+                                - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(4)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-4)*u(ix-4,iy,1,ic) - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) - stencil(4)*u(ix+4,iy,1,ic) &
+                                - stencil(-4)*u(ix,iy-4,1,ic) - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) - stencil(4)*u(ix,iy+4,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(5)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-5)*u(ix-5,iy,1,ic) - stencil(-4)*u(ix-4,iy,1,ic) - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) &
+                                - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) - stencil(4)*u(ix+4,iy,1,ic) - stencil(5)*u(ix+5,iy,1,ic) &
+                                - stencil(-5)*u(ix,iy-5,1,ic) - stencil(-4)*u(ix,iy-4,1,ic) - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) &
+                                - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) - stencil(4)*u(ix,iy+4,1,ic) - stencil(5)*u(ix,iy+5,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(6)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-6)*u(ix-6,iy,1,ic) - stencil(-5)*u(ix-5,iy,1,ic) - stencil(-4)*u(ix-4,iy,1,ic) - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) &
+                                - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) - stencil(4)*u(ix+4,iy,1,ic) - stencil(5)*u(ix+5,iy,1,ic) - stencil(6)*u(ix+6,iy,1,ic) &
+                                - stencil(-6)*u(ix,iy-6,1,ic) - stencil(-5)*u(ix,iy-5,1,ic) - stencil(-4)*u(ix,iy-4,1,ic) - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) &
+                                - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) - stencil(4)*u(ix,iy+4,1,ic) - stencil(5)*u(ix,iy+5,1,ic) - stencil(6)*u(ix,iy+6,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(7)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-7)*u(ix-7,iy,1,ic) - stencil(-6)*u(ix-6,iy,1,ic) - stencil(-5)*u(ix-5,iy,1,ic) - stencil(-4)*u(ix-4,iy,1,ic) - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) &
+                                - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) - stencil(4)*u(ix+4,iy,1,ic) - stencil(5)*u(ix+5,iy,1,ic) - stencil(6)*u(ix+6,iy,1,ic) - stencil(7)*u(ix+7,iy,1,ic) &
+                                - stencil(-7)*u(ix,iy-7,1,ic) - stencil(-6)*u(ix,iy-6,1,ic) - stencil(-5)*u(ix,iy-5,1,ic) - stencil(-4)*u(ix,iy-4,1,ic) - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) &
+                                - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) - stencil(4)*u(ix,iy+4,1,ic) - stencil(5)*u(ix,iy+5,1,ic) - stencil(6)*u(ix,iy+6,1,ic) - stencil(7)*u(ix,iy+7,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case(8)
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,1,ic) = ( &
+                                - stencil(-8)*u(ix-8,iy,1,ic) - stencil(-7)*u(ix-7,iy,1,ic) - stencil(-6)*u(ix-6,iy,1,ic) - stencil(-5)*u(ix-5,iy,1,ic) - stencil(-4)*u(ix-4,iy,1,ic) - stencil(-3)*u(ix-3,iy,1,ic) - stencil(-2)*u(ix-2,iy,1,ic) - stencil(-1)*u(ix-1,iy,1,ic) &
+                                - stencil(1)*u(ix+1,iy,1,ic) - stencil(2)*u(ix+2,iy,1,ic) - stencil(3)*u(ix+3,iy,1,ic) - stencil(4)*u(ix+4,iy,1,ic) - stencil(5)*u(ix+5,iy,1,ic) - stencil(6)*u(ix+6,iy,1,ic) - stencil(7)*u(ix+7,iy,1,ic) - stencil(8)*u(ix+8,iy,1,ic) &
+                                - stencil(-8)*u(ix,iy-8,1,ic) - stencil(-7)*u(ix,iy-7,1,ic) - stencil(-6)*u(ix,iy-6,1,ic) - stencil(-5)*u(ix,iy-5,1,ic) - stencil(-4)*u(ix,iy-4,1,ic) - stencil(-3)*u(ix,iy-3,1,ic) - stencil(-2)*u(ix,iy-2,1,ic) - stencil(-1)*u(ix,iy-1,1,ic) &
+                                - stencil(1)*u(ix,iy+1,1,ic) - stencil(2)*u(ix,iy+2,1,ic) - stencil(3)*u(ix,iy+3,1,ic) - stencil(4)*u(ix,iy+4,1,ic) - stencil(5)*u(ix,iy+5,1,ic) - stencil(6)*u(ix,iy+6,1,ic) - stencil(7)*u(ix,iy+7,1,ic) - stencil(8)*u(ix,iy+8,1,ic) &
+                                + b(ix,iy,1,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    case default
+                        do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                                u(ix,iy,1,ic) = (-sum(stencil * u(ix-a:ix+a,iy,1,ic)) - sum(stencil * u(ix,iy-a:iy+a,1,ic)) + b(ix,iy,1,ic)*dx(1)*dx(1) + params%dim*stencil(0)*u(ix,iy,1,ic)) / (stencil(0)*params%dim)
+                        enddo; enddo
+                    end select
                 else
-                    do iz = is(3), ie(3), dir
-                        do iy = is(2), ie(2), dir
-                            do ix = is(1), ie(1), dir
-                                u(ix,iy,iz,ic) = (-sum(stencil * u(ix-a:ix+a,iy,iz,ic)) - sum(stencil * u(ix,iy-a:iy+a,iz,ic)) - sum(stencil * u(ix,iy,iz-a:iz+a,ic)) + b(ix,iy,iz,ic)*dx(1)*dx(1) + params%dim*stencil(0)*u(ix,iy,iz,ic)) / (stencil(0)*params%dim)
-                            enddo
-                        enddo
-                    enddo
+                    ! hardcoding of several stencil sizes, as this is critical for performance
+                    select case(a)
+                    case(1)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-1) * u(ix-1,iy,iz,ic) + stencil(1) * u(ix+1,iy,iz,ic)) &
+                                - (stencil(-1) * u(ix,iy-1,iz,ic) + stencil(1) * u(ix,iy+1,iz,ic)) &
+                                - (stencil(-1) * u(ix,iy,iz-1,ic) + stencil(1) * u(ix,iy,iz+1,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(2)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic)) &
+                                - (stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic)) &
+                                - (stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(3)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic)) &
+                                - (stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic)) &
+                                - (stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(4)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-4) * u(ix-4,iy,iz,ic) + stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + &
+                                   stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic) + stencil(4) * u(ix+4,iy,iz,ic)) &
+                                - (stencil(-4) * u(ix,iy-4,iz,ic) + stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + &
+                                   stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic) + stencil(4) * u(ix,iy+4,iz,ic)) &
+                                - (stencil(-4) * u(ix,iy,iz-4,ic) + stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + &
+                                   stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic) + stencil(4) * u(ix,iy,iz+4,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(5)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-5) * u(ix-5,iy,iz,ic) + stencil(-4) * u(ix-4,iy,iz,ic) + stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + &
+                                   stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic) + stencil(4) * u(ix+4,iy,iz,ic) + stencil(5) * u(ix+5,iy,iz,ic)) &
+                                - (stencil(-5) * u(ix,iy-5,iz,ic) + stencil(-4) * u(ix,iy-4,iz,ic) + stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + &
+                                   stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic) + stencil(4) * u(ix,iy+4,iz,ic) + stencil(5) * u(ix,iy+5,iz,ic)) &
+                                - (stencil(-5) * u(ix,iy,iz-5,ic) + stencil(-4) * u(ix,iy,iz-4,ic) + stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + &
+                                   stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic) + stencil(4) * u(ix,iy,iz+4,ic) + stencil(5) * u(ix,iy,iz+5,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(6)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-6) * u(ix-6,iy,iz,ic) + stencil(-5) * u(ix-5,iy,iz,ic) + stencil(-4) * u(ix-4,iy,iz,ic) + stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + &
+                                   stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic) + stencil(4) * u(ix+4,iy,iz,ic) + stencil(5) * u(ix+5,iy,iz,ic) + stencil(6) * u(ix+6,iy,iz,ic)) &
+                                - (stencil(-6) * u(ix,iy-6,iz,ic) + stencil(-5) * u(ix,iy-5,iz,ic) + stencil(-4) * u(ix,iy-4,iz,ic) + stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + &
+                                   stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic) + stencil(4) * u(ix,iy+4,iz,ic) + stencil(5) * u(ix,iy+5,iz,ic) + stencil(6) * u(ix,iy+6,iz,ic)) &
+                                - (stencil(-6) * u(ix,iy,iz-6,ic) + stencil(-5) * u(ix,iy,iz-5,ic) + stencil(-4) * u(ix,iy,iz-4,ic) + stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + &
+                                   stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic) + stencil(4) * u(ix,iy,iz+4,ic) + stencil(5) * u(ix,iy,iz+5,ic) + stencil(6) * u(ix,iy,iz+6,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(7)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-7) * u(ix-7,iy,iz,ic) + stencil(-6) * u(ix-6,iy,iz,ic) + stencil(-5) * u(ix-5,iy,iz,ic) + stencil(-4) * u(ix-4,iy,iz,ic) + stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + &
+                                   stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic) + stencil(4) * u(ix+4,iy,iz,ic) + stencil(5) * u(ix+5,iy,iz,ic) + stencil(6) * u(ix+6,iy,iz,ic) + stencil(7) * u(ix+7,iy,iz,ic)) &
+                                - (stencil(-7) * u(ix,iy-7,iz,ic) + stencil(-6) * u(ix,iy-6,iz,ic) + stencil(-5) * u(ix,iy-5,iz,ic) + stencil(-4) * u(ix,iy-4,iz,ic) + stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + &
+                                   stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic) + stencil(4) * u(ix,iy+4,iz,ic) + stencil(5) * u(ix,iy+5,iz,ic) + stencil(6) * u(ix,iy+6,iz,ic) + stencil(7) * u(ix,iy+7,iz,ic)) &
+                                - (stencil(-7) * u(ix,iy,iz-7,ic) + stencil(-6) * u(ix,iy,iz-6,ic) + stencil(-5) * u(ix,iy,iz-5,ic) + stencil(-4) * u(ix,iy,iz-4,ic) + stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + &
+                                   stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic) + stencil(4) * u(ix,iy,iz+4,ic) + stencil(5) * u(ix,iy,iz+5,ic) + stencil(6) * u(ix,iy,iz+6,ic) + stencil(7) * u(ix,iy,iz+7,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case(8)
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = ( &
+                                - (stencil(-8) * u(ix-8,iy,iz,ic) + stencil(-7) * u(ix-7,iy,iz,ic) + stencil(-6) * u(ix-6,iy,iz,ic) + stencil(-5) * u(ix-5,iy,iz,ic) + stencil(-4) * u(ix-4,iy,iz,ic) + stencil(-3) * u(ix-3,iy,iz,ic) + stencil(-2) * u(ix-2,iy,iz,ic) + stencil(-1) * u(ix-1,iy,iz,ic) + &
+                                   stencil(1) * u(ix+1,iy,iz,ic) + stencil(2) * u(ix+2,iy,iz,ic) + stencil(3) * u(ix+3,iy,iz,ic) + stencil(4) * u(ix+4,iy,iz,ic) + stencil(5) * u(ix+5,iy,iz,ic) + stencil(6) * u(ix+6,iy,iz,ic) + stencil(7) * u(ix+7,iy,iz,ic) + stencil(8) * u(ix+8,iy,iz,ic)) &
+                                - (stencil(-8) * u(ix,iy-8,iz,ic) + stencil(-7) * u(ix,iy-7,iz,ic) + stencil(-6) * u(ix,iy-6,iz,ic) + stencil(-5) * u(ix,iy-5,iz,ic) + stencil(-4) * u(ix,iy-4,iz,ic) + stencil(-3) * u(ix,iy-3,iz,ic) + stencil(-2) * u(ix,iy-2,iz,ic) + stencil(-1) * u(ix,iy-1,iz,ic) + &
+                                   stencil(1) * u(ix,iy+1,iz,ic) + stencil(2) * u(ix,iy+2,iz,ic) + stencil(3) * u(ix,iy+3,iz,ic) + stencil(4) * u(ix,iy+4,iz,ic) + stencil(5) * u(ix,iy+5,iz,ic) + stencil(6) * u(ix,iy+6,iz,ic) + stencil(7) * u(ix,iy+7,iz,ic) + stencil(8) * u(ix,iy+8,iz,ic)) &
+                                - (stencil(-8) * u(ix,iy,iz-8,ic) + stencil(-7) * u(ix,iy,iz-7,ic) + stencil(-6) * u(ix,iy,iz-6,ic) + stencil(-5) * u(ix,iy,iz-5,ic) + stencil(-4) * u(ix,iy,iz-4,ic) + stencil(-3) * u(ix,iy,iz-3,ic) + stencil(-2) * u(ix,iy,iz-2,ic) + stencil(-1) * u(ix,iy,iz-1,ic) + &
+                                   stencil(1) * u(ix,iy,iz+1,ic) + stencil(2) * u(ix,iy,iz+2,ic) + stencil(3) * u(ix,iy,iz+3,ic) + stencil(4) * u(ix,iy,iz+4,ic) + stencil(5) * u(ix,iy,iz+5,ic) + stencil(6) * u(ix,iy,iz+6,ic) + stencil(7) * u(ix,iy,iz+7,ic) + stencil(8) * u(ix,iy,iz+8,ic)) &
+                                + b(ix,iy,iz,ic)*dx(1)*dx(1)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    case default
+                        do iz = is(3), ie(3), dir; do iy = is(2), ie(2), dir; do ix = is(1), ie(1), dir
+                            u(ix,iy,iz,ic) = (-sum(stencil * u(ix-a:ix+a,iy,iz,ic)) - sum(stencil * u(ix,iy-a:iy+a,iz,ic)) - sum(stencil * u(ix,iy,iz-a:iz+a,ic)) + b(ix,iy,iz,ic)*dx(1)*dx(1) + params%dim*stencil(0)*u(ix,iy,iz,ic)) / (stencil(0)*params%dim)
+                        enddo; enddo; enddo
+                    end select
                 endif
             else
                 if (params%dim == 2) then
@@ -386,7 +460,7 @@ subroutine GS_compute_residual(params, u, b, r, dx, recompute_b)
                 if (params%dim == 2) then
                     do iy = params%g+1, params%g+params%bs(2)
                         do ix = params%g+1, params%g+params%bs(1)
-                            r(ix,iy,1,ic) = b(ix,iy,1,ic) + Ax_factor * (sum(stencil * u(ix-a:ix+a,iy,1,ic)) / (dx(1)*dx(1)) \
+                            r(ix,iy,1,ic) = b(ix,iy,1,ic) + Ax_factor * (sum(stencil * u(ix-a:ix+a,iy,1,ic)) / (dx(1)*dx(1)) &
                                 + sum(stencil * u(ix,iy-a:iy+a,1,ic))  / (dx(2)*dx(2)))
                         enddo
                     enddo
@@ -394,7 +468,7 @@ subroutine GS_compute_residual(params, u, b, r, dx, recompute_b)
                     do iz = params%g+1, params%g+params%bs(3)
                         do iy = params%g+1, params%g+params%bs(2)
                             do ix = params%g+1, params%g+params%bs(1)
-                                r(ix,iy,iz,ic) = b(ix,iy,iz,ic) + Ax_factor * (sum(stencil * u(ix-a:ix+a,iy,iz,ic)) / (dx(1)*dx(1)) \
+                                r(ix,iy,iz,ic) = b(ix,iy,iz,ic) + Ax_factor * (sum(stencil * u(ix-a:ix+a,iy,iz,ic)) / (dx(1)*dx(1)) &
                                     + sum(stencil * u(ix,iy-a:iy+a,iz,ic)) / (dx(2)*dx(2)) + sum(stencil * u(ix,iy,iz-a:iz+a,ic)) / (dx(3)*dx(3)))
                             enddo
                         enddo
@@ -513,7 +587,6 @@ subroutine GS_compute_Ax(params, u, ddu, dx, apply_B_RHS)
 
 
     subroutine CG_solve_poisson_level0(params, u, f, dx, tol, max_iter)
-        use module_params
         implicit none
 
         ! Arguments
@@ -589,7 +662,6 @@ end subroutine CG_solve_poisson_level0
 
     ! synch routine for block on lowest level
     subroutine sync_block_level0(params, u, sync_sides_only)
-        use module_params
         implicit none
 
         ! Arguments
diff --git a/LIB/MESH/multigrid_vcycle.f90 b/LIB/POISSON/multigrid_vcycle.f90
similarity index 94%
rename from LIB/MESH/multigrid_vcycle.f90
rename to LIB/POISSON/multigrid_vcycle.f90
index cf812cd7..0c0595c4 100644
--- a/LIB/MESH/multigrid_vcycle.f90
+++ b/LIB/POISSON/multigrid_vcycle.f90
@@ -9,7 +9,7 @@ subroutine multigrid_solve(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, init_0,
     logical, intent(in), optional      :: verbose
 
     integer(kind=ik)                   :: k_block, lgt_ID, hvy_id, ic, i_cycle
-    real(kind=rk)                      :: dx(1:3), x0(1:3), residual(1:4*size(hvy_sol,4))
+    real(kind=rk)                      :: dx(1:3), x0(1:3), residual(1:4*size(hvy_sol,4)), norm_sol(1:size(hvy_sol,4))
     real(kind=rk)                      :: t_block
     logical                            :: verbose_apply, init0
 
@@ -39,6 +39,9 @@ subroutine multigrid_solve(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, init_0,
         call toc( "Sync Layer", 10010, MPI_Wtime()-t_block )
     endif
 
+    ! compute norm of RHS for relative tolerances
+    call componentWiseNorm_tree(params, hvy_RHS(:,:,:,1:size(hvy_RHS,4),:), tree_ID, "Linfty", norm_sol(1:size(hvy_RHS,4)), threshold_state_vector=.false.)
+
     ! prepare full tree grid, this will be populated along the way
     ! blocks are practically empty, but we fill them along the way so ref flag will be set to 0 to allow synching
     call init_full_tree(params, tree_ID, set_ref=0)
@@ -63,11 +66,12 @@ subroutine multigrid_solve(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, init_0,
 
         ! choose between different end criteria:
         ! fixed_iterations - do a set number of V-cycles
-        ! tolerance - stop when residuals are below a certain threshold or maximum number of iterations is reached
+        ! tolerance - stop when absolute or relative residuals are below a certain threshold or maximum number of iterations is reached
         if (params%poisson_cycle_end_criteria == "fixed_iterations") then
             if (i_cycle >= params%poisson_cycle_it) exit
         elseif (params%poisson_cycle_end_criteria == "tolerance") then
-            if (all(residual(1:size(hvy_sol,4)) < params%poisson_cycle_tol)) exit
+            if (all(residual(1:size(hvy_sol,4)) < params%poisson_cycle_tol_abs)) exit
+            if (all(residual(1:size(hvy_sol,4))/norm_sol(1:size(hvy_sol,4)) < params%poisson_cycle_tol_rel) .and. all(norm_sol(1:size(hvy_sol,4)) > 1e-8)) exit
         else
             call abort(250903, "Don't know how to stop! Please choose between 'fixed_iterations' and 'tolerance', thank you :)")
         endif
@@ -161,6 +165,25 @@ subroutine multigrid_vcycle(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, verbose
     call sync_ghosts_tree(params, hvy_sol(:,:,:,1:nc,:), tree_ID)
     call toc( "Sync Layer", 10010, MPI_Wtime()-t_block )
 
+    ! JB Comment: I think this is not necessary, we do coarse extension but only keep the block values and not the decomposed ones, so we should reconstruct the original values later on without any problems
+    ! ! we need to preserve b
+    ! ! Usually we could restore it later with b = r + Ax, however, with coarse extension we have to alter the residual and would not get back the original b
+    !     ! downwards - no sweeps are done, we only restrict the residual down to the lowest level
+    ! ! compute the residual r = b - Ax to pass it downwards
+    ! t_block = MPI_Wtime()
+    ! do k_block = 1, hvy_n(tree_ID)
+    !     hvy_id = hvy_active(k_block, tree_ID)
+    !     call hvy2lgt( lgt_id, hvy_id, params%rank, params%number_blocks )
+
+    !     ! b only defined on leaf layer
+    !     if (.not. block_is_leaf(params, hvy_id)) cycle
+
+    !     ! store b
+    !     hvy_work(:,:,:,nc+1:2*nc,hvy_id) = hvy_RHS(:,:,:,1:nc,hvy_id)
+    ! enddo
+    ! call toc( "GS Downwards - Store b", 10020, MPI_Wtime()-t_block )
+
+
     ! downwards - no sweeps are done, we only restrict the residual down to the lowest level
     ! compute the residual r = b - Ax to pass it downwards
     t_block = MPI_Wtime()
@@ -199,6 +222,7 @@ subroutine multigrid_vcycle(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, verbose
 
     ! for all levels we are not solving Au = b, but rather Ae=r
     ! only on the last upwards step will we iterate on the original problem, so we need to backup u
+    ! starting from here, hvy_work contains the backed-up solution
     t_block = MPI_Wtime()
     do k_block = 1, hvy_n(tree_ID)
         hvy_id = hvy_active(k_block, tree_ID)
@@ -235,8 +259,10 @@ subroutine multigrid_vcycle(params, hvy_sol, hvy_RHS, hvy_work, tree_ID, verbose
 
             ! recompute RHS b = r + Ax, 
             call GS_compute_residual(params, hvy_work(:,:,:,1:nc,hvy_id), hvy_RHS(:,:,:,1:nc,hvy_id), hvy_RHS(:,:,:,1:nc,hvy_id), dx, recompute_b=.true.)
+            ! ! restore full RHS b
+            ! hvy_RHS(:,:,:,1:nc,hvy_id) = hvy_work(:,:,:,nc+1:2*nc,hvy_id)
     
-            ! add reconstructed solution to residual to previous solution
+            ! add previous solution to reconstructed solution
             hvy_sol(:,:,:,1:nc,hvy_id) = hvy_sol(:,:,:,1:nc,hvy_id) + hvy_work(:,:,:,1:nc,hvy_id)
         endif
     enddo
diff --git a/LIB/POSTPROCESSING/compute_vorticity_post.f90 b/LIB/POSTPROCESSING/compute_vorticity_post.f90
index 0faf37b4..8ee0cb65 100644
--- a/LIB/POSTPROCESSING/compute_vorticity_post.f90
+++ b/LIB/POSTPROCESSING/compute_vorticity_post.f90
@@ -97,35 +97,50 @@ subroutine compute_vorticity_post(params)
     ! is used in ghost nodes sync'ing
     if (order == "2") then
         params%order_discretization = "FD_2nd_central"
+        params%wavelet = "CDF20"  ! wavelet used for adaptive syncing
         params%g = 2_ik
     elseif (order == "4") then
         params%order_discretization = "FD_4th_central"
-        params%g = 2_ik
+        params%wavelet = "CDF40"  ! wavelet used for adaptive syncing
+        params%g = 3_ik
     elseif (order == "4op") then
         params%order_discretization = "FD_4th_central_optimized"
+        params%wavelet = "CDF40"  ! wavelet used for adaptive syncing
         params%g = 3_ik
     elseif (order == "4_comp_13") then
         params%order_discretization = "FD_4th_comp_1_3"
+        params%wavelet = "CDF40"  ! wavelet used for adaptive syncing
         params%g = 3_ik
+        if (operator == "--dissipation") params%g = 4_ik  ! dissipation uses L=DG with different stencil width
     elseif (order == "4_comp_04") then
         params%order_discretization = "FD_4th_comp_0_4"
+        params%wavelet = "CDF40"  ! wavelet used for adaptive syncing
         params%g = 4_ik
     elseif (order == "6") then
         params%order_discretization = "FD_6th_central"
+        params%wavelet = "CDF60"  ! wavelet used for adaptive syncing
         params%g = 3_ik
     elseif (order == "6_comp_24") then
         params%order_discretization = "FD_6th_comp_2_4"
+        params%wavelet = "CDF60"  ! wavelet used for adaptive syncing
         params%g = 4_ik
+        if (operator == "--dissipation") params%g = 6_ik  ! dissipation uses L=DG with different stencil width
     elseif (order == "6_comp_15") then
         params%order_discretization = "FD_6th_comp_1_5"
+        params%wavelet = "CDF60"  ! wavelet used for adaptive syncing
         params%g = 5_ik
+        if (operator == "--dissipation") params%g = 6_ik  ! dissipation uses L=DG with different stencil width
     elseif (order == "6_comp_06") then
         params%order_discretization = "FD_6th_comp_0_6"
+        params%wavelet = "CDF60"  ! wavelet used for adaptive syncing
         params%g = 6_ik
+    elseif (order == "8") then
+        params%order_discretization = "FD_8th_central"
+        params%wavelet = "CDF80"  ! wavelet used for adaptive syncing
+        params%g = 4_ik
     else
-        call abort(8765,"chosen discretization order invalid or not (yet) implemented. choose between 6 (FD_6th_central), 4 (FD_4th_central), 4op (FD_4th_central_optimized), 6_comp_24 (FD_6th_comp_2_4), 6_comp_15 (FD_6th_comp_1_5), 6_comp_06 (FD_6th_comp_0_6) and 2 (FD_2nd_central)")
+        call abort(8765,"chosen discretization order invalid or not (yet) implemented. choose between 2 (FD_2nd_central), 4 (FD_4th_central), 6 (FD_6th_central), 4op (FD_4th_central_optimized), 4_comp_13 (FD_4th_comp_1_3), 4_comp_04 (FD_4th_comp_0_4), 6_comp_24 (FD_6th_comp_2_4), 6_comp_15 (FD_6th_comp_1_5) and 6_comp_06 (FD_6th_comp_0_6)")
     end if
-    params%wavelet="CDF20"  ! wavelet is not used
 
     params%Jmax = tc_length
     params%n_eqn = params%dim
diff --git a/LIB/POSTPROCESSING/post_add_two_masks.f90 b/LIB/POSTPROCESSING/post_add_two_masks.f90
index efafec3d..550e7c4c 100644
--- a/LIB/POSTPROCESSING/post_add_two_masks.f90
+++ b/LIB/POSTPROCESSING/post_add_two_masks.f90
@@ -70,7 +70,7 @@ subroutine post_add_two_masks(params)
     end if
     params%domain_size = domain
     params%Jmax = max( tc_length2, tc_length1 )
-    params%Jmin = 1
+    params%Jmin = 0
     params%n_eqn = 1
 
     allocate(params%threshold_state_vector_component(params%n_eqn))
@@ -101,8 +101,8 @@ subroutine post_add_two_masks(params)
     hvy_n = 0
     tree_n = 0 ! reset number of trees in forest
 
-    call readHDF5vct_tree((/fname1/), params, hvy_block, tree_ID=1, verbosity=.false.)
-    call readHDF5vct_tree((/fname2/), params, hvy_block, tree_ID=2, verbosity=.false.)
+    call readHDF5vct_tree((/fname1/), params, hvy_block, tree_ID=1, verbosity=.true.)
+    call readHDF5vct_tree((/fname2/), params, hvy_block, tree_ID=2, verbosity=.true.)
 
     call createActiveSortedLists_forest(params)
 
@@ -128,10 +128,15 @@ subroutine post_add_two_masks(params)
         call store_ref_meshes( lgt_block_ref, lgt_active_ref, lgt_n_ref, tree_ID1=1, tree_ID2=2)
         ! before applying wavelet filters we need to sync
         call sync_ghosts_tree(params, hvy_block, tree_ID=1)
-        ! make grid1 equal to grid 1 (by only coarsening).
-	! Should refinement be required error will be thrown.
+        ! make grid1 equal to grid 1 (by coarsening and refining).
+        ! something is not working here yet but I don't have the brain for it currently
+        if (params%rank==0) write(*,'(A)') "Coarsening grid 1 to grid 2"
         call coarse_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref(:,2), lgt_n_ref(2), &
-        hvy_block, hvy_tmp, tree_ID=1, verbosity=.true.)
+        hvy_block, hvy_tmp, tree_ID=1, ref_can_be_finer=.true., verbosity=.true.)
+
+        if (params%rank==0) write(*,'(A)') "Refining grid 1 to grid 2"
+        call refine_tree_2_reference_mesh(params, lgt_block_ref, lgt_active_ref(:,2), lgt_n_ref(2), &
+        hvy_block, hvy_tmp, tree_ID=1, ref_can_be_coarser=.false., verbosity=.true.)
 
     case ("--test_operations")
         ! this tests inplace vs out of place operations
diff --git a/LIB/POSTPROCESSING/post_compression_unit_test.f90 b/LIB/POSTPROCESSING/post_compression_unit_test.f90
index da21f7e7..f9be14b9 100644
--- a/LIB/POSTPROCESSING/post_compression_unit_test.f90
+++ b/LIB/POSTPROCESSING/post_compression_unit_test.f90
@@ -133,7 +133,7 @@ subroutine post_compression_unit_test(params)
 
         open(14, file="header.txt", status='replace')
         write (14,'("Bs=",i3," Jmax=",i2," dim=",i1," eps-norm=",A," eps-normalized=",L1)') Bs(1), params%Jmax, params%dim, &
-        adjustl(trim(params%eps_norm)), params%eps_normalized
+        trim(adjustl(params%eps_norm)), params%eps_normalized
         close(14)
     endif
 
diff --git a/LIB/POSTPROCESSING/post_mean.f90 b/LIB/POSTPROCESSING/post_mean.f90
index 2de48c8c..a0a60a4a 100644
--- a/LIB/POSTPROCESSING/post_mean.f90
+++ b/LIB/POSTPROCESSING/post_mean.f90
@@ -1,6 +1,7 @@
 subroutine post_mean(params)
     use module_globals
     use module_params
+    use module_operators
     use module_mesh
     use mpi
     use module_forestMetaData
@@ -10,22 +11,11 @@ subroutine post_mean(params)
     type (type_params), intent(inout)       :: params                           !> parameter struct
 
     real(kind=rk), allocatable         :: hvy_block(:, :, :, :, :)
-    integer(kind=ik)                   :: tree_ID=1, hvy_id
-
-    integer(hsize_t), dimension(4)          :: size_field
-    integer(hid_t)                          :: file_id
-    integer(kind=ik)                        :: lgt_id, k, nz, iteration, dim, g
-    integer(kind=ik), dimension(3)          :: Bs
-    real(kind=rk), dimension(3)             :: x0, dx
-    real(kind=rk), dimension(3)             :: domain
-    real(kind=rk)                           :: time
-    integer(hsize_t), dimension(2)          :: dims_treecode
-    integer(kind=ik), allocatable           :: tree(:), sum_tree(:), blocks_per_rank(:)
-
-    real(kind=rk)    :: x,y,z
-    real(kind=rk)    :: maxi,mini,squari,meani,qi,inti
-    real(kind=rk)    :: maxl,minl,squarl,meanl,ql
-    integer(kind=ik) :: ix,iy,iz,mpicode, ioerr, rank, i, tc_length
+    integer(kind=ik)                   :: tree_ID=1
+    integer(kind=ik)                   :: rank, iteration
+    real(kind=rk)                      :: time
+    real(kind=rk)                      :: norms(1:5)
+    logical                            :: exist
 
     ! this routine works only on one tree
     allocate( hvy_n(1), lgt_n(1) )
@@ -36,8 +26,7 @@ subroutine post_mean(params)
     call get_command_argument(2,fname)
     if (fname =='--help' .or. fname=='--h') then
         if (rank==0) then
-            write(*,*) "WABBIT postprocessing: compute mean value of field. We also output the volume integral"
-            write(*,*) "(Lx*Ly*Lz*mean(field) to the file results.txt"
+            write(*,*) "WABBIT postprocessing: compute mean value of field. We output all values to a file."
             write(*,*) "mpi_command -n number_procs ./wabbit-post --mean filename.h5 result.txt"
         end if
         return
@@ -47,66 +36,45 @@ subroutine post_mean(params)
     call check_file_exists( fname )
 
     ! add some parameters from the file
-    call read_attributes(fname, lgt_n(tree_ID), time, iteration, domain, Bs, tc_length, dim, &
+    call read_attributes(fname, lgt_n(tree_ID), time, iteration, params%domain_size, params%Bs, params%Jmax, params%dim, &
     periodic_BC=params%periodic_BC, symmetry_BC=params%symmetry_BC)
 
-    params%Bs = Bs
     params%n_eqn = 1
     params%g = 2_ik
     params%order_predictor = "multiresolution_2nd"
-    params%Jmax = tc_length
-    params%domain_size(1) = domain(1)
-    params%domain_size(2) = domain(2)
-    params%domain_size(3) = domain(3)
     params%number_blocks = ceiling( real(lgt_n(tree_ID)) / real(params%number_procs) )
+    allocate(params%threshold_state_vector_component(1:params%n_eqn))
+    params%threshold_state_vector_component = 1
 
     call allocate_forest(params, hvy_block)
 
     ! read input data
     call readHDF5vct_tree( (/fname/), params, hvy_block, tree_ID)
 
-    ! compute an additional quantity that depends also on the position
-    ! (the others are translation invariant)
-    if (params%dim == 3) then
-        nz = Bs(3)
-    else
-        nz = 1
-    end if
-
-    meanl = 0.0_rk
-
-    g = params%g
-    do k = 1, hvy_n(tree_ID)
-        hvy_id = hvy_active(k,tree_ID)
-
-        call hvy2lgt(lgt_id, hvy_id, params%rank, params%number_blocks)
-        call get_block_spacing_origin( params, lgt_id, x0, dx )
-
-        if (params%dim == 3) then
-            meanl = meanl + sum( hvy_block(g+1:Bs(1)+g, g+1:Bs(2)+g, g+1:Bs(3)+g, 1, hvy_id))*dx(1)*dx(2)*dx(3)
-        else
-            meanl = meanl + sum( hvy_block(g+1:Bs(1)+g, g+1:Bs(2)+g, 1, 1, hvy_id))*dx(1)*dx(2)
-        endif
-    end do
-
-    call MPI_REDUCE(meanl,meani,1,MPI_DOUBLE_PRECISION,MPI_SUM,0,WABBIT_COMM,mpicode)
-
-    if (params%dim == 3) then
-        meani = meani / (params%domain_size(1)*params%domain_size(2)*params%domain_size(3))
-        inti = meani*product(domain)
-    else
-        meani = meani / (params%domain_size(1)*params%domain_size(2))
-        inti = meani*product(domain(1:2))
-    endif
+    ! use functions from module_operators
+    call componentWiseNorm_tree(params, hvy_block, tree_ID, "Mean", norms(1:1), threshold_state_vector=.false.)
+    call componentWiseNorm_tree(params, hvy_block, tree_ID, "L1", norms(2:2), threshold_state_vector=.false.)
+    call componentWiseNorm_tree(params, hvy_block, tree_ID, "L2", norms(3:3), threshold_state_vector=.false.)
+    call componentWiseNorm_tree(params, hvy_block, tree_ID, "Linfty", norms(4:4), threshold_state_vector=.false.)
+    call componentWiseNorm_tree(params, hvy_block, tree_ID, "H1", norms(5:5), threshold_state_vector=.false.)
 
     if (rank == 0) then
-        write(*,*) "Computed mean value is: ", meani
-        write(*,*) "Computed integral value is: ", inti
+        write(*,'(A, es16.8)') "Mean:        ", norms(1)
+        write(*,'(A, es16.8)') "L1 norm:     ", norms(2)
+        write(*,'(A, es16.8)') "L2 norm:     ", norms(3)
+        write(*,'(A, es16.8)') "LInfty norm: ", norms(4)
+        write(*,'(A, es16.8)') "H1 norm:     ", norms(5)
 
         ! write volume integral to disk
         call get_command_argument(3,fname_out)
-        open(14,file=fname_out, status='replace')
-        write(14,*) inti
-        close(14)
+        inquire ( file=fname_out, exist=exist )
+        if (exist) then
+            open(14,file=fname_out, status='replace')
+            write(14,'(A)') "# Mean, L1 norm, L2 norm, LInfty norm, H1 norm" 
+            write(14,'(5(es16.8, ", "))') norms(1), norms(2), norms(3), norms(4), norms(5)
+            close(14)
+        else
+            write(*,'(A)') "Output file "//trim(adjustl(fname_out))//" does not exist. Not writing output. Use e.g. touch"
+        endif
     endif
 end subroutine post_mean
diff --git a/LIB/POSTPROCESSING/proto_pressure_multigrid.f90 b/LIB/POSTPROCESSING/proto_pressure_multigrid.f90
index 555a12fc..ac505316 100644
--- a/LIB/POSTPROCESSING/proto_pressure_multigrid.f90
+++ b/LIB/POSTPROCESSING/proto_pressure_multigrid.f90
@@ -144,6 +144,7 @@ end subroutine compute_divergence_tree
     end select
     params%poisson_coarsest = "FFT"
     params%FFT_accuracy = "FD"  ! FD or spectral
+    params%poisson_Sync_it = 2
 
     Bs = params%Bs
 
@@ -160,7 +161,7 @@ end subroutine compute_divergence_tree
     ! allocate data
     call allocate_forest(params, hvy_block, hvy_tmp=hvy_tmp, hvy_work=hvy_work, nrhs_slots1=1, neqn_hvy_tmp=8)
 
-    ! read input data, if comparison file is given, we read it in as well
+    ! read input data
     if (exist_p) then
         call readHDF5vct_tree( (/file_ux, file_uy, file_uz, file_p/), params, hvy_block, tree_ID)
     else
@@ -549,6 +550,7 @@ end subroutine compute_projection
     call sync_ghosts_tree( params, hvy_block(:,:,:,params%dim+1:params%dim+1,:), tree_ID_flow )
     call compute_projection(params, hvy_block(:,:,:,1:params%dim,:), hvy_block(:,:,:,params%dim+1:params%dim+1,:), hvy_block(:,:,:,1:params%dim,:), order_disc_pressure, tree_ID_flow, 1.0_rk)
 
+    ! --- following block is for debugging the helmholtz projection ----
     ! call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
     ! write(fname, '(A, i12.12, A)') "ux_", int(100*1.0e6),".h5"
     ! call saveHDF5_tree(fname, 100.0_rk, 0, 1, params, hvy_block, tree_ID )
@@ -568,6 +570,7 @@ end subroutine compute_projection
     ! call saveHDF5_tree(fname, 100.0_rk, 0, 1, params, hvy_block, tree_ID )
     ! write(fname, '(A, i12.12, A)') "u2y_", int(100*1.0e6),".h5"
     ! call saveHDF5_tree(fname, 100.0_rk, 0, 2, params, hvy_block, tree_ID )
+    ! -----------------------------------------------------------------
 
     ! we have to recompute p
     call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
@@ -633,24 +636,34 @@ end subroutine compute_projection
         !***********************************************************************
         ! Timestep - Euler Explicit or RK2 / Heun's method
         !***********************************************************************
-        ! ! possibility to set different stencil for intermediate projections
-        ! params%poisson_order = "FD_6th_mehrstellen"
-        ! call setup_laplacian_stencils(params, params%g)
+        t4 = MPI_wtime()
 
         ! RK_generic
+        t3 = MPI_wtime()
         call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
+        call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
 
         ! caluclate timestep, here we use that of ACM and set the speed of sound to close to 0
         call calculate_time_step(params, time, iteration, hvy_block, dt, tree_ID)
 
         ! compute RHS for the first stage without pressure gradient
+        t3 = MPI_wtime()
         call compute_NSI_RHS(params, hvy_block(:,:,:,1:params%dim,:), hvy_mask, hvy_work(:,:,:,1:params%dim,:,1), order_disc_nonlinear, tree_ID_flow, nu, C_eta )
+        call toc( "timestep: compute_NSI_RHS", 22, MPI_wtime()-t3)
 
         ! compute RHS for the pressure Poisson equation, solve the Poisson equation and add pressure gradient
+        t3 = MPI_wtime()
         call sync_ghosts_tree( params, hvy_work(:,:,:,1:params%dim,:,1), tree_ID_flow )
+        call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+        t3 = MPI_wtime()
         call compute_divergence_tree(params, hvy_work(:,:,:,1:params%dim,:,1), hvy_tmp(:,:,:,1,:), order_disc_pressure, tree_ID_flow)
-        call multigrid_solve(params, hvy_tmp(:,:,:,4:4,:), hvy_tmp(:,:,:,1:1,:), hvy_tmp(:,:,:,3:size(hvy_tmp,4),:), tree_ID_flow, verbose=.false.)
-        call compute_projection(params, hvy_work(:,:,:,1:params%dim,:,1), hvy_tmp(:,:,:,4:4,:), hvy_work(:,:,:,1:params%dim,:,1), order_disc_pressure, tree_ID_flow, 1.0_rk)
+        call toc( "timestep: compute_divergence_tree", 23, MPI_wtime()-t3)
+        t3 = MPI_wtime()
+        call multigrid_solve(params, hvy_tmp(:,:,:,params%dim+1:params%dim+1,:), hvy_tmp(:,:,:,1:1,:), hvy_tmp(:,:,:,params%dim+2:size(hvy_tmp,4),:), tree_ID_flow, verbose=.false.)
+        call toc( "timestep: multigrid_solve", 24, MPI_wtime()-t3)
+        t3 = MPI_wtime()
+        call compute_projection(params, hvy_work(:,:,:,1:params%dim,:,1), hvy_tmp(:,:,:,params%dim+1:params%dim+1,:), hvy_work(:,:,:,1:params%dim,:,1), order_disc_pressure, tree_ID_flow, 1.0_rk)
+        call toc( "timestep: compute_projection", 25, MPI_wtime()-t3)
 
         ! compute k_1, k_2, .... (coefficients for final stage)
         do j = 2, size(params%butcher_tableau, 1) - 1
@@ -671,14 +684,26 @@ end subroutine compute_projection
                     + dt * params%butcher_tableau(j,l) * hvy_work(g(1)+1:g(1)+bs(1),g(2)+1:g(2)+bs(2),g(3)+1:g(3)+bs(3),1:params%dim,hvy_id,l-1)
                 end do
             end do
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_tmp(:,:,:,1:params%dim,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
             ! compute RHS for the current stage without pressure gradient
+            t3 = MPI_wtime()
             call compute_NSI_RHS(params, hvy_tmp(:,:,:,1:params%dim,:), hvy_mask, hvy_work(:,:,:,1:params%dim,:,j), order_disc_nonlinear, tree_ID_flow, nu, C_eta )
+            call toc( "timestep: compute_NSI_RHS", 22, MPI_wtime()-t3)
             ! compute RHS for the pressure Poisson equation, solve the Poisson equation and add pressure gradient
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_work(:,:,:,1:params%dim,:,j), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_divergence_tree(params, hvy_work(:,:,:,1:params%dim,:,j), hvy_tmp(:,:,:,1,:), order_disc_pressure, tree_ID_flow)
+            call toc( "timestep: compute_divergence_tree", 23, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call multigrid_solve(params, hvy_tmp(:,:,:,params%dim+1:params%dim+1,:), hvy_tmp(:,:,:,1:1,:), hvy_tmp(:,:,:,params%dim+2:size(hvy_tmp,4),:), tree_ID_flow, verbose=.false.)
+            call toc( "timestep: multigrid_solve", 24, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_projection(params, hvy_work(:,:,:,1:params%dim,:,j), hvy_tmp(:,:,:,params%dim+1:params%dim+1,:), hvy_work(:,:,:,1:params%dim,:,j), order_disc_pressure, tree_ID_flow, 1.0_rk)
+            call toc( "timestep: compute_projection", 25, MPI_wtime()-t3)
         enddo
 
         ! final stage (actual final update of state vector)
@@ -699,7 +724,9 @@ end subroutine compute_projection
                     + dt*params%butcher_tableau(size(params%butcher_tableau,1),j) * hvy_work(g(1)+1:g(1)+bs(1),g(2)+1:g(2)+bs(2),g(3)+1:g(3)+bs(3),1:params%dim,hvy_id,j-1)
             end do
         end do
+        t3 = MPI_wtime()
         call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
+        call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
 
         ! write(fname, '(A, i12.12, A)') "ux_", int(time*1.0e6),".h5"
         ! call saveHDF5_tree(fname, time, 0, 1, params, hvy_block, tree_ID )
@@ -729,39 +756,68 @@ end subroutine compute_projection
         ! let's do some regular projections - this can be changed by a parameter
         if (modulo(iteration, params%nprojection_NSI)==0) then
             ! Maybe u is not completely divergence free in the discrete sense, so we do one projection in order to get a divergence free velocity field
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_divergence_tree(params, hvy_block(:,:,:,1:params%dim,:), hvy_tmp(:,:,:,params%dim+2,:), order_disc_pressure, tree_ID_flow)
+            call toc( "timestep: compute_divergence_tree", 23, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_tmp(:,:,:,params%dim+2:params%dim+2,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call multigrid_solve(params, hvy_block(:,:,:,params%dim+1:params%dim+1,:), hvy_tmp(:,:,:,params%dim+2:params%dim+2,:), hvy_tmp(:,:,:,params%dim+3:size(hvy_tmp,4),:), tree_ID_flow, init_0=.true., verbose=.false.)
+            call toc( "timestep: multigrid_solve", 24, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_block(:,:,:,params%dim+1:params%dim+1,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_projection(params, hvy_block(:,:,:,1:params%dim,:), hvy_block(:,:,:,params%dim+1:params%dim+1,:), hvy_block(:,:,:,1:params%dim,:), order_disc_pressure, tree_ID_flow, 1.0_rk)
+            call toc( "timestep: compute_projection", 25, MPI_wtime()-t3)
         endif
 
         ! we reconstruct the pressure if we do statistics or save data to have it in respective accuracy
         if (it_is_time_to_save_data .or. (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk)) then
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_block(:,:,:,1:params%dim,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_NSI_RHS(params, hvy_block(:,:,:,1:params%dim,:), hvy_mask, hvy_tmp(:,:,:,1:params%dim,:), order_disc_nonlinear, tree_ID_flow, nu, C_eta)
+            call toc( "timestep: compute_NSI_RHS", 22, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_tmp(:,:,:,1:params%dim,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call compute_divergence_tree(params, hvy_tmp(:,:,:,1:params%dim,:), hvy_tmp(:,:,:,params%dim+2,:), order_disc_pressure, tree_ID_flow)
+            call toc( "timestep: compute_divergence_tree", 23, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call sync_ghosts_tree( params, hvy_tmp(:,:,:,params%dim+2:params%dim+2,:), tree_ID_flow )
+            call toc( "timestep: sync_ghost_tree", 21, MPI_wtime()-t3)
+            t3 = MPI_wtime()
             call multigrid_solve(params, hvy_tmp(:,:,:,params%dim+1:params%dim+1,:), hvy_tmp(:,:,:,params%dim+2:params%dim+2,:), hvy_tmp(:,:,:,params%dim+3:size(hvy_tmp,4),:), tree_ID_flow)
+            call toc( "timestep: multigrid_solve", 24, MPI_wtime()-t3)
             do k_block = 1, hvy_n(tree_ID)
                 hvy_id = hvy_active(k_block, tree_ID)
                 hvy_block(g(1)+1:g(1)+bs(1),g(2)+1:g(2)+bs(2),g(3)+1:g(3)+bs(3),params%dim+1,hvy_id) = hvy_tmp(g(1)+1:g(1)+bs(1),g(2)+1:g(2)+bs(2),g(3)+1:g(3)+bs(3),4,hvy_id)
             enddo
         endif
 
+        call toc( "TOPLEVEL: time stepper", 11, MPI_wtime()-t4)
+
         !*******************************************************************
         ! statistics
         !*******************************************************************
-        t_block = MPI_wtime()
-        if ( (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk) ) then
+        if ( (modulo(iteration, params%nsave_stats)==0).or.(abs(time - params%next_stats_time)<1e-12_rk) .or. abs(time - params%time_max) < 1e-12_rk ) then
+            t4 = MPI_wtime()
             ! we need to sync ghost nodes for some derived qtys, for sure
             call sync_ghosts_RHS_tree( params, hvy_block, tree_ID_flow )
 
             call statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tree_ID_flow)
+            call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t4)
+
+            ! update next stats time
+            if (abs(time - params%next_stats_time)<1e-12_rk) params%next_stats_time = params%next_stats_time + params%tsave_stats
         endif
-        call toc( "TOPLEVEL: statistics", 13, MPI_wtime()-t_block)
 
         !***********************************************************************
         ! Adapt mesh - (coarsening where possible)
diff --git a/LIB/TIME/calculate_time_step.f90 b/LIB/TIME/calculate_time_step.f90
index e589465d..91db2cd8 100644
--- a/LIB/TIME/calculate_time_step.f90
+++ b/LIB/TIME/calculate_time_step.f90
@@ -5,12 +5,12 @@ subroutine calculate_time_step( params, time, iteration, hvy_block, dt, tree_ID
 
     !--------------------------------------------------------------
     implicit none
-    type (type_params), intent(in):: params                    !< user defined parameter structure
-    real(kind=rk), intent(in)     :: time                      !< current time of the simulation
-    integer(kind=ik), intent(in)  :: iteration
-    real(kind=rk), intent(in)     :: hvy_block(:, :, :, :, :)  !< heavy data array contains the block data of the statevector
-    integer(kind=ik), intent(in)  :: tree_ID
-    real(kind=rk), intent(out)    :: dt                         !< time step dt
+    type (type_params), intent(inout) :: params                    !< user defined parameter structure
+    real(kind=rk), intent(in)         :: time                      !< current time of the simulation
+    integer(kind=ik), intent(in)      :: iteration
+    real(kind=rk), intent(in)         :: hvy_block(:, :, :, :, :)  !< heavy data array contains the block data of the statevector
+    integer(kind=ik), intent(in)      :: tree_ID
+    real(kind=rk), intent(out)        :: dt                         !< time step dt
     !--------------------------------------------------------------
     ! MPI error variable
     integer(kind=ik) :: ierr, Jmax, k, lgt_id, hvy_id
@@ -74,6 +74,12 @@ subroutine calculate_time_step( params, time, iteration, hvy_block, dt, tree_ID
         end if
     end if
 
+    if ( params%time_statistics) then
+        ! time step should hit time_statistics_start_time exactly
+        if ( time+dt > params%time_statistics_start_time .and. time<params%time_statistics_start_time ) then
+            dt = params%time_statistics_start_time - time
+        end if
+    end if
 
     ! do not jump past final time
     if (time + dt > params%time_max .and. time<=params%time_max) dt = params%time_max - time
diff --git a/LIB/TIME/statistics_wrapper.f90 b/LIB/TIME/statistics_wrapper.f90
index ab9e71a2..b46cdd1b 100644
--- a/LIB/TIME/statistics_wrapper.f90
+++ b/LIB/TIME/statistics_wrapper.f90
@@ -88,11 +88,11 @@ subroutine statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tr
 end subroutine statistics_wrapper
 
 
-subroutine time_statistics_wrapper(time, dt, time_start, params, hvy_block, hvy_tmp, hvy_mask, tree_ID)
+subroutine time_statistics_wrapper(time, dt, params, hvy_block, hvy_tmp, hvy_mask, tree_ID)
 
     implicit none
 
-    real(kind=rk), intent(in)           :: time, dt, time_start
+    real(kind=rk), intent(in)           :: time, dt
     type (type_params), intent(inout)   :: params                     !> user defined parameter structure
     real(kind=rk), intent(inout)        :: hvy_tmp(:, :, :, :, :)     !> heavy work data array - block data
     real(kind=rk), intent(inout)        :: hvy_block(:, :, :, :, :)   !> heavy data array - block data
@@ -121,7 +121,7 @@ subroutine time_statistics_wrapper(time, dt, time_start, params, hvy_block, hvy_
             hvy_id_mask = hvy_id
         endif
 
-        call TIME_STATISTICS_meta(params%physics_type, time, dt, time_start, hvy_block(:,:,:,:, hvy_id), params%g, x0, dx, &
+        call TIME_STATISTICS_meta(params%physics_type, time, dt, params%time_statistics_start_time, hvy_block(:,:,:,:, hvy_id), params%g, x0, dx, &
         hvy_tmp(:,:,:,:, hvy_id), hvy_mask(:,:,:,:, hvy_id_mask))
     enddo
 
diff --git a/LIB/TIMING/module_timing.f90 b/LIB/TIMING/module_timing.f90
index 02831783..1b1b0716 100644
--- a/LIB/TIMING/module_timing.f90
+++ b/LIB/TIMING/module_timing.f90
@@ -53,6 +53,7 @@ subroutine reset_all_timings()
     !   50-  55    createActiveSortedLists
     !   57-  59    updateMetadata_tree
     !   60-  63    synchronize_lgt_data
+    !   65-  69    createMask_tree
     !   70-  74    xfer_block_data
     !   80-  85    sync ghosts
     !   90-  92    balanceLoad_tree
@@ -64,7 +65,7 @@ subroutine reset_all_timings()
     !  150- 158    coarseExtensionUpdate_tree
     !  160- 165    executeCoarsening_tree
     !  200-1000 Other
-    !  250- 258    forest
+    !  250- 270    forest
     !  350- 354    module_MOR
     ! 1000-XXXX Miscellaneous
     ! 1001-1002    Commented block based toc in coarsening indicator
diff --git a/LIB/WAVELETS/module_wavelets.f90 b/LIB/WAVELETS/module_wavelets.f90
index b05e0ff9..de85f83c 100644
--- a/LIB/WAVELETS/module_wavelets.f90
+++ b/LIB/WAVELETS/module_wavelets.f90
@@ -1713,12 +1713,6 @@ subroutine WaveReconstruction_dim1( params, u_wc, SC_reconstruct, WC_reconstruct
         if (.not. allocated(params%HD)) call abort(1717229, "Wavelet setup not called?!")
         if (.not. allocated(params%GD)) call abort(1717231, "Wavelet setup not called?!")
 
-        ! For the reconstruction of the wavelets all points are shifted by one one the SC positions
-        ! therefore, if the first point in the ghost layer would be a WC, it is ignored
-        ! This is done to align the WC with the correct positions. However, as the left bound of the GR filter defines the minimum g,
-        ! we apply a trick here, to ignore the first point in order to reduce the ghost point size, because it is always zero
-        ! The respecting buffers are zero-padded in order to account for this shift.
-
         maxn = maxval((/ nx, ny, nz /))
         if (allocated(buffer1)) then
             if (size(buffer1, dim=1)<maxn) deallocate(buffer1)
diff --git a/LIB/fortran.mk b/LIB/fortran.mk
index 85922d97..a95185b0 100644
--- a/LIB/fortran.mk
+++ b/LIB/fortran.mk
@@ -211,10 +211,11 @@ $(OBJDIR)/module_ini_files_parser.o: module_ini_files_parser.f90 $(OBJDIR)/modul
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_params.o: module_params.f90 $(OBJDIR)/module_ini_files_parser_mpi.o $(OBJDIR)/module_globals.o $(OBJDIR)/module_t_files.o \
-	ini_file_to_params.f90 $(OBJDIR)/module_helpers.o $(OBJDIR)/module_t_files.o
+	ini_file_to_params.f90 $(OBJDIR)/module_helpers.o
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
-$(OBJDIR)/module_bridge_interface.o: module_bridge_interface.f90 $(OBJDIR)/module_treelib.o $(OBJDIR)/module_mesh.o
+$(OBJDIR)/module_bridge_interface.o: module_bridge_interface.f90 $(OBJDIR)/module_treelib.o $(OBJDIR)/module_mesh.o $(OBJDIR)/module_params.o \
+	$(OBJDIR)/module_forestMetaData.o
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_navier_stokes_params.o: module_navier_stokes_params.f90 $(OBJDIR)/module_globals.o\
@@ -252,12 +253,11 @@ $(OBJDIR)/module_shock.o: module_shock.f90 $(OBJDIR)/module_globals.o $(OBJDIR)/
 
 $(OBJDIR)/module_ACM.o: module_ACM.f90 rhs_ACM.f90 create_mask.f90 sponge.f90 2D_wingsection.f90 save_data_ACM.f90 \
 	$(OBJDIR)/module_ini_files_parser_mpi.o $(OBJDIR)/module_operators.o $(OBJDIR)/module_globals.o $(OBJDIR)/module_t_files.o \
-	$(OBJDIR)/module_helpers.o $(OBJDIR)/module_insects.o statistics_ACM.f90 time_statistics_ACM.f90 inicond_ACM.f90 boundcond_ACM.f90 filter_ACM.f90 $(OBJDIR)/module_params.o \
-	$(OBJDIR)/module_t_files.o $(OBJDIR)/module_timing.o
+	$(OBJDIR)/module_helpers.o $(OBJDIR)/module_insects.o statistics_ACM.f90 time_statistics_ACM.f90 inicond_ACM.f90 boundcond_ACM.f90 filter_ACM.f90 $(OBJDIR)/module_params.o $(OBJDIR)/module_timing.o
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_ConvDiff_new.o: module_ConvDiff_new.f90 rhs_convdiff.f90 statistics_convdiff.f90 time_statistics_convdiff.f90 \
-	$(OBJDIR)/module_ini_files_parser_mpi.o $(OBJDIR)/module_globals.o $(OBJDIR)/module_params.o $(OBJDIR)/module_operators.o
+	$(OBJDIR)/module_ini_files_parser_mpi.o $(OBJDIR)/module_globals.o $(OBJDIR)/module_params.o $(OBJDIR)/module_operators.o $(OBJDIR)/module_t_files.o
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_timing.o: module_timing.f90
@@ -274,7 +274,7 @@ $(OBJDIR)/module_fft.o: module_fft.f90 $(OBJDIR)/module_params.o $(OBJDIR)/modul
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_poisson.o: module_poisson.f90 $(OBJDIR)/module_params.o $(OBJDIR)/module_globals.o $(OBJDIR)/module_helpers.o $(OBJDIR)/module_forestMetaData.o \
-	$(OBJDIR)/module_treelib.o $(OBJDIR)/module_timing.o
+	$(OBJDIR)/module_treelib.o $(OBJDIR)/module_timing.o $(OBJDIR)/module_mesh.o $(OBJDIR)/module_fft.o $(OBJDIR)/module_mpi.o multigrid_vcycle.f90
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_physics_metamodule.o: module_physics_metamodule.f90 $(OBJDIR)/module_globals.o \
@@ -301,22 +301,20 @@ $(OBJDIR)/module_indicators.o: module_indicators.f90 $(OBJDIR)/module_params.o $
 	$(OBJDIR)/module_mpi.o $(OBJDIR)/module_wavelets.o refinementIndicator_tree.f90 coarseningIndicator_block.f90 threshold_block.f90
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
-$(OBJDIR)/module_helpers.o: module_helpers.f90 $(OBJDIR)/module_globals.o most_common_element.f90 $(OBJDIR)/module_ini_files_parser_mpi.o
+$(OBJDIR)/module_helpers.o: module_helpers.f90 $(OBJDIR)/module_globals.o most_common_element.f90 rotation_matrices.f90
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_mesh.o: module_mesh.f90 $(OBJDIR)/module_params.o $(OBJDIR)/module_timing.o $(OBJDIR)/module_wavelets.o \
 	$(OBJDIR)/module_mpi.o $(OBJDIR)/module_treelib.o $(OBJDIR)/module_physics_metamodule.o $(OBJDIR)/module_indicators.o \
-	$(OBJDIR)/module_helpers.o $(OBJDIR)/module_params.o $(OBJDIR)/module_forestMetaData.o $(OBJDIR)/module_poisson.o $(OBJDIR)/module_fft.o \
-	refineToEquidistant_tree.f90 \
+	$(OBJDIR)/module_helpers.o $(OBJDIR)/module_params.o $(OBJDIR)/module_forestMetaData.o \
 	InputOutput_Flusi.f90 InputOutput.f90 create_active_and_sorted_lists.f90 createMask_tree.f90 block_xfer_nonblocking.f90 \
 	updateNeighbors_tree.f90 find_neighbors.f90 doesBlockExist_tree.f90 refine_tree.f90 respectJmaxJmin_tree.f90 \
-	refinementExecute.f90 adapt_tree.f90 coarseningIndicator_tree.f90 ensureGradedness_tree.f90 \
+	refinementExecute.f90 adapt_tree.f90 coarseningIndicator_tree.f90 ensureGradedness_tree.f90 refineToEquidistant_tree.f90 \
 	ensure_completeness_block.f90 executeCoarsening_tree.f90 merge_blocks.f90 balanceLoad_tree.f90 \
 	treecode_to_sfc_id_2D.f90 treecode_to_sfc_id_3D.f90 treecode_to_hilbertcode_2D.f90 treecode_to_hilbertcode_3D.f90 get_block_spacing_origin.f90 \
 	find_family.f90 ActiveLevel_tree.f90 get_free_local_light_id.f90 quicksort.f90 updateMetadata_tree.f90 createEquidistantGrid_tree.f90 \
 	createRandomGrid_tree.f90 reset_tree.f90 allocate_forest.f90 write_block_distribution.f90 check_lgt_block_synchronization.f90 \
-	remove_nonperiodic_neighbors.f90 forest.f90 multigrid_vcycle.f90 \
-	securityZone_tree.f90 coarseExtensionUpdate_tree.f90 updateFamily_tree.f90
+	remove_nonperiodic_neighbors.f90 forest.f90 setInitialCondition_tree.f90 securityZone_tree.f90 coarseExtensionUpdate_tree.f90 updateFamily_tree.f90
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_unit_test.o: module_unit_test.f90 $(OBJDIR)/module_params.o $(OBJDIR)/module_mesh.o $(OBJDIR)/module_time_step.o \
@@ -340,12 +338,14 @@ $(OBJDIR)/module_forestMetaData.o: module_forestMetaData.f90 $(OBJDIR)/module_gl
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_saving.o: module_saving.f90 \
-	 $(OBJDIR)/module_physics_metamodule.o $(OBJDIR)/module_mesh.o \
-	save_data.f90
+	 $(OBJDIR)/module_physics_metamodule.o $(OBJDIR)/module_mesh.o $(OBJDIR)/module_params.o $(OBJDIR)/module_globals.o \
+	 $(OBJDIR)/module_timing.o $(OBJDIR)/module_mpi.o $(OBJDIR)/module_forestMetaData.o $(OBJDIR)/module_helpers.o \
+	 $(OBJDIR)/module_operators.o save_data.f90
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 $(OBJDIR)/module_MOR.o: module_MOR.f90  \
-	$(OBJDIR)/module_globals.o $(OBJDIR)/module_ini_files_parser.o $(OBJDIR)/module_mesh.o
+	$(OBJDIR)/module_globals.o $(OBJDIR)/module_ini_files_parser.o $(OBJDIR)/module_mesh.o $(OBJDIR)/module_params.o \
+	$(OBJDIR)/module_helpers.o $(OBJDIR)/module_forestMetaData.o
 	$(FC) $(FFLAGS) -c -o $@ $< $(LDFLAGS)
 
 # Compile remaining objects from Fortran files.
diff --git a/PARAMS_template.ini b/PARAMS_template.ini
index 53892f34..660928ff 100644
--- a/PARAMS_template.ini
+++ b/PARAMS_template.ini
@@ -278,8 +278,18 @@ N_time_statistics=1;
 ; quantities are:
 ;    phiX for convdiff
 ;    ux,uy,uz,p,umag,divu,vort,vorabs,dissipation, u[xyz]d[xyz], u[xyz]d[xyz]2, u[xyz]2
-; operators are int, avg/mean, var, minmax, var needs to be followed by avg/mean of the same quantity
+; operators are int, avg/mean, var, minmax, var needs to be followed by avg/mean of the same quantity at any point
+; we can also compute covariances: uxuv-cov, uxuz-cov, uyuz-cov, uxp-cov, uyp-cov, uzp-cov
+; covariances need to be followed by the avg/mean of the two quantities involved and any point
+; example for var / cov: ux-var uy-var uxuy-cov p-avg ux-avg uy-avg, avg just needs to be there somewhere behind the variance/covariance
 time_statistics_names=phi1-avg;
+; sometimes we want to restart simulations. In this case we want to read in the time_statistics
+; if this is set to 1, Physics::input_files has to include the time_statistics files as well
+read_from_files_time_statistics=0;
+; set a start time from when averaging is done.
+; If it is before the read-in time, then it is for weighting the read-in values.
+; If it is after the read-in time or 0.0, then the time_statistics accumulation start at that time.
+time_statistics_start_time=1.0;
 
 
 [ACM-new]
@@ -482,7 +492,7 @@ filter_freq=10;
 ; flag for penalization (0/1)
 penalization=1;
 ; WABBIT needs to know about the mask function (if penalization is used): does it contain
-; a time-dependent-part (e.g. moving obstacles, time-dependent forcing)? does it contain
+; a time-dependent-part (e.g. moving obstacles, time-dependent forcing, SPONGE)? does it contain
 ; a time-independent part (fixed walls, homogeneous forcing)? or both? WABBIT needs to know
 ; that since we try to create the time-independent mask function only once, but the time-dependent
 ; part of course in every time step.
diff --git a/PARAMS_template_ACM.ini b/PARAMS_template_ACM.ini
index d5bb7a4d..7675d0dc 100644
--- a/PARAMS_template_ACM.ini
+++ b/PARAMS_template_ACM.ini
@@ -49,7 +49,7 @@ number_ghost_nodes=;
 number_ghost_nodes_rhs=;
 ; number of equations / components of state vector. Note you have to properly
 ; adjust this value for the physics module that you use.
-; ACM: 3 (2D), 4 (3D) +  nscalars
+; ACM: 3 (2D), 4 (3D) +  nscalars + nTimeStatistics
 ; Convection: nscalars (2D /3D)
 number_equations=3;
 ; how many of the equations are used in the rhs evaluation? default=number_equations, can be reduced when using timestatistics
@@ -283,8 +283,19 @@ N_time_statistics=1;
 ; quantities are:
 ;    phiX for convdiff
 ;    ux,uy,uz,p,umag,divu,vort,vorabs,dissipation, u[xyz]d[xyz], u[xyz]d[xyz]2, u[xyz]2
-; operators are int, avg/mean, var, minmax, var needs to be followed by avg/mean of the same quantity
+; operators are int, avg/mean, var, minmax, var needs to be followed by avg/mean of the same quantity at any point
+; we can also compute covariances: uxuv-cov, uxuz-cov, uyuz-cov, uxp-cov, uyp-cov, uzp-cov
+; covariances need to be followed by the avg/mean of the two quantities involved and any point
+; example for var / cov: ux-var uy-var uxuy-cov p-avg ux-avg uy-avg, avg just needs to be there somewhere behind the variance/covariance
 time_statistics_names=ux-avg;
+; sometimes we want to restart simulations. In this case we want to read in the time_statistics
+; if this is set to 1, Physics::input_files has to include the time_statistics files as well
+read_from_files_time_statistics=0;
+; set a start time from when averaging is done.
+; If it is before the read-in time, then it is for weighting the read-in values.
+; If it is after the read-in time or 0.0, then the time_statistics accumulation start at that time.
+; However, it is advised that for adaptive simulations, the initial transient is computaded without time_statistics due to performance reasons.
+time_statistics_start_time=1.0;
 
 
 [ACM-new]
@@ -408,7 +419,7 @@ filter_freq=10;
 ; flag for penalization (0/1)
 penalization=1;
 ; WABBIT needs to know about the mask function (if penalization is used): does it contain
-; a time-dependent-part (e.g. moving obstacles, time-dependent forcing)? does it contain
+; a time-dependent-part (e.g. moving obstacles, time-dependent forcing, SPONGE)? does it contain
 ; a time-independent part (fixed walls, homogeneous forcing)? or both? WABBIT needs to know
 ; that since we try to create the time-independent mask function only once, but the time-dependent
 ; part of course in every time step.