Skip to content
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions components/eamxx/docs/user/diags/field_contraction.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,9 @@ number of bins `Y` as `_Y_bins`. All zonal averages are calculated
using the area fraction in each bin as the weight.

For 180 latitude bins, the bins are defined
as follows: [-90, -89), [-89, -88), ..., [89, 90).
as follows: [-90, -89), [-89, -88), ..., [89, 90].
For 90 latitude bins, the bins are defined as follows:
[-90, -88), [-88, -86), ..., [88, 90).
[-90, -88), [-88, -86), ..., [88, 90].
And so on...

| Reduction | Weight | Description |
Expand All @@ -99,7 +99,7 @@ fields:
# in this example, we use T_mid in units of K
- T_mid_horiz_avg # K
- T_mid_vert_avg_dp_weighted # K
- T_mid_vert_sum_dp_weighted # K * Pa * s / (m * m)
- T_mid_vert_sum_dp_weighted # K * Pa * s / (m * m)
- T_mid_vert_avg_dz_weighted # K
- T_mid_vert_sum_dz_weighted # K * m
- T_mid_vert_avg # K
Expand Down
11 changes: 6 additions & 5 deletions components/eamxx/src/diagnostics/tests/zonal_avg_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
namespace scream {

std::shared_ptr<GridsManager> create_gm(const ekat::Comm &comm, const int ngcols, const int nlevs) {

using vos_t = std::vector<std::string>;
ekat::ParameterList gm_params;
gm_params.set("grids_names", vos_t{"Point Grid"});
Expand Down Expand Up @@ -41,11 +40,11 @@ TEST_CASE("zonal_avg") {
constexpr int nlevs = 3;
constexpr int dim3 = 4;
const int ncols = 6;
const int nlats = 4;
const int nlats = 4; // needs to be <= ncols

const int ngcols = ncols * comm.size();
auto gm = create_gm(comm, ngcols, nlevs);
auto grid = gm->get_grid("Physics");
auto gm = create_gm(comm, ngcols, nlevs);
auto grid = gm->get_grid("Physics");

Field area = grid->get_geometry_data("area");
auto area_view_h = area.get_view<const Real *, Host>();
Expand All @@ -60,8 +59,10 @@ TEST_CASE("zonal_avg") {
lat_view_h(i) = sp(-90.0) + (i % nlats + sp(0.5)) * lat_delta;
zonal_areas[i % nlats] += area_view_h[i];
}
lat.sync_to_dev();
comm.all_reduce(zonal_areas.data(), zonal_areas.size(), MPI_SUM);
lat_view_h(0) = sp(-90.0); // move column to be directly at southern pole
lat_view_h(nlats-1) = sp(90.0); // move column to be directly at northern pole
lat.sync_to_dev();

// Input (randomized) qc
FieldLayout scalar1d_layout{{COL}, {ncols}};
Expand Down
175 changes: 121 additions & 54 deletions components/eamxx/src/diagnostics/zonal_avg.cpp
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
#include "diagnostics/zonal_avg.hpp"

#include "share/field/field_utils.hpp"

#include <ekat_math_utils.hpp>
#include <ekat_team_policy_utils.hpp>

namespace scream {

void ZonalAvgDiag::compute_zonal_sum(const Field &result, const Field &field, const Field &weight,
const Field &lat, const ekat::Comm *comm) {
auto result_layout = result.get_header().get_identifier().get_layout();
const int num_zonal_bins = result_layout.dim(0);
const int ncols = field.get_header().get_identifier().get_layout().dim(0);
const Real lat_delta = sp(180.0) / num_zonal_bins;
// Utility to compute the contraction of a field along its column dimension.
// This is equivalent to f_out = einsum('i,i...k->...k', weight, f_in).
// The implementation is such that:
// - all Field objects must be allocated
// - the first dimension for field and weight and second dimension for
// bin_to_cols is for the columns (COL)
// - the first dimension for result and bin_to_cols is for the zonal bins (CMP,"bin")
// - field and result must be the same dimension, up to 3
void compute_zonal_sum(const Field &result, const Field &field, const Field &weight,
const Field &bin_to_cols, const ekat::Comm *comm) {
auto result_layout = result.get_header().get_identifier().get_layout();
auto bin_to_cols_layout = bin_to_cols.get_header().get_identifier().get_layout();
const int num_zonal_bins = bin_to_cols_layout.dim(0);
const int max_ncols_per_bin = bin_to_cols_layout.dim(1);

auto weight_view = weight.get_view<const Real *>();
auto lat_view = lat.get_view<const Real *>();
auto bin_to_cols_view = bin_to_cols.get_view<const Int **>();
using KT = ekat::KokkosTypes<DefaultDevice>;
using TeamPolicy = Kokkos::TeamPolicy<Field::device_t::execution_space>;
using TeamMember = typename TeamPolicy::member_type;
Expand All @@ -23,67 +29,60 @@ void ZonalAvgDiag::compute_zonal_sum(const Field &result, const Field &field, co
case 1: {
auto field_view = field.get_view<const Real *>();
auto result_view = result.get_view<Real *>();
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins, ncols);
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins, max_ncols_per_bin);
Kokkos::parallel_for(
"compute_zonal_sum_" + field.name(), team_policy, KOKKOS_LAMBDA(const TeamMember &tm) {
const int lat_i = tm.league_rank();
const Real lat_lower = sp(-90.0) + lat_i * lat_delta;
const Real lat_upper = lat_lower + lat_delta;
"compute_zonal_sum_" + field.name(), team_policy,
KOKKOS_LAMBDA(const TeamMember &tm) {
const int bin_i = tm.league_rank();
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(tm, ncols),
[&](int i, Real &val) {
// TODO: check if tenary is ok here (if not, multiply by flag)
int flag = (lat_lower <= lat_view(i)) && (lat_view(i) < lat_upper);
val += flag ? weight_view(i) * field_view(i) : sp(0.0);
Kokkos::TeamVectorRange(tm, 1, 1+bin_to_cols_view(bin_i,0)),
[&](int lcol_j, Real &val) {
const int col_i = bin_to_cols_view(bin_i, lcol_j);
val += weight_view(col_i) * field_view(col_i);
},
result_view(lat_i));
result_view(bin_i));
});
} break;
case 2: {
const int d1 = result_layout.dim(1);
auto field_view = field.get_view<const Real **>();
auto result_view = result.get_view<Real **>();
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins * d1, ncols);
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins * d1, max_ncols_per_bin);
Kokkos::parallel_for(
"compute_zonal_sum_" + field.name(), team_policy, KOKKOS_LAMBDA(const TeamMember &tm) {
"compute_zonal_sum_" + field.name(), team_policy,
KOKKOS_LAMBDA(const TeamMember &tm) {
const int idx = tm.league_rank();
const int d1_i = idx / num_zonal_bins;
const int lat_i = idx % num_zonal_bins;
const Real lat_lower = sp(-90.0) + lat_i * lat_delta;
const Real lat_upper = lat_lower + lat_delta;
const int bin_i = idx % num_zonal_bins;
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(tm, ncols),
[&](int i, Real &val) {
int flag = (lat_lower <= lat_view(i)) && (lat_view(i) < lat_upper);
// TODO: check if tenary is ok here (if not, multiply by flag)
val += flag ? weight_view(i) * field_view(i, d1_i) : sp(0.0);
Kokkos::TeamVectorRange(tm, 1, 1+bin_to_cols_view(bin_i,0)),
[&](int lcol_j, Real &val) {
const int col_i = bin_to_cols_view(bin_i, lcol_j);
val += weight_view(col_i) * field_view(col_i, d1_i);
},
result_view(lat_i, d1_i));
result_view(bin_i, d1_i));
});
} break;
case 3: {
const int d1 = result_layout.dim(1);
const int d2 = result_layout.dim(2);
auto field_view = field.get_view<const Real ***>();
auto result_view = result.get_view<Real ***>();
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins * d1 * d2, ncols);
TeamPolicy team_policy = TPF::get_default_team_policy(num_zonal_bins * d1 * d2, max_ncols_per_bin);
Kokkos::parallel_for(
"compute_zonal_sum_" + field.name(), team_policy, KOKKOS_LAMBDA(const TeamMember &tm) {
const int idx = tm.league_rank();
const int d1_i = idx / (num_zonal_bins * d2);
const int idx2 = idx % (num_zonal_bins * d2);
const int d2_i = idx2 / num_zonal_bins;
const int lat_i = idx2 % num_zonal_bins;
const Real lat_lower = sp(-90.0) + lat_i * lat_delta;
const Real lat_upper = lat_lower + lat_delta;
const int bin_i = idx2 % num_zonal_bins;
Kokkos::parallel_reduce(
Kokkos::TeamVectorRange(tm, ncols),
[&](int i, Real &val) {
int flag = (lat_lower <= lat_view(i)) && (lat_view(i) < lat_upper);
// TODO: check if tenary is ok here (if not, multiply by flag)
val += flag ? weight_view(i) * field_view(i, d1_i, d2_i) : sp(0.0);
Kokkos::TeamVectorRange(tm, 1, 1+bin_to_cols_view(bin_i,0)),
[&](int lcol_j, Real &val) {
const int col_i = bin_to_cols_view(bin_i, lcol_j);
val += weight_view(col_i) * field_view(col_i, d1_i, d2_i);
},
result_view(lat_i, d1_i, d2_i));
result_view(bin_i, d1_i, d2_i));
});
} break;
default:
Expand Down Expand Up @@ -152,6 +151,75 @@ void ZonalAvgDiag::initialize_impl(const RunType /*run_type*/) {
m_diagnostic_output = Field(diagnostic_id);
m_diagnostic_output.allocate_view();

// allocate column counter
FieldLayout ncols_per_bin_layout({CMP}, {m_num_zonal_bins}, {"bin"});
FieldIdentifier ncols_per_bin_id("number of columns per bin",
ncols_per_bin_layout, FieldIdentifier::Units::nondimensional(),
field_id.get_grid_name(), DataType::IntType);
Field ncols_per_bin(ncols_per_bin_id);
ncols_per_bin.allocate_view();
ncols_per_bin.deep_copy(0);

// count how many columns are in each zonal bin
using KT = ekat::KokkosTypes<DefaultDevice>;
using TeamPolicy = Kokkos::TeamPolicy<Field::device_t::execution_space>;
using TeamMember = typename TeamPolicy::member_type;
using TPF = ekat::TeamPolicyFactory<typename KT::ExeSpace>;
const int ncols = field_layout.dim(COL);
const Real lat_delta = sp(180.0) / m_num_zonal_bins;
auto lat_view = m_lat.get_view<const Real *>();
auto ncols_per_bin_view = ncols_per_bin.get_view<Int *>();
const int num_zonal_bins = m_num_zonal_bins; // for use inside lambdas
TeamPolicy team_policy = TPF::get_default_team_policy(m_num_zonal_bins, ncols);
Kokkos::parallel_for("count_columns_per_zonal_bin_" + field.name(),
team_policy, KOKKOS_LAMBDA(const TeamMember &tm) {
const int bin_i = tm.league_rank();
const Real lat_lower = sp(-90.0) + bin_i * lat_delta;
const Real lat_upper = (bin_i < num_zonal_bins-1)
? lat_lower + lat_delta : sp(90.0 + 0.5*lat_delta);
Kokkos::parallel_reduce(Kokkos::TeamVectorRange(tm, ncols),
[&](int col_i, Int &val) {
if ((lat_lower <= lat_view(col_i)) && (lat_view(col_i) < lat_upper))
val++;
},
ncols_per_bin_view(bin_i));
});

// determine maximum number of columns per bin & allocate bin to column map
using RangePolicy = Kokkos::RangePolicy<Field::device_t::execution_space>;
Int max_ncols_per_bin = 0;
Kokkos::parallel_reduce(RangePolicy(0, m_num_zonal_bins),
KOKKOS_LAMBDA(int bin_i, Int &val) {
val = std::max(val, ncols_per_bin_view(bin_i));
},
Kokkos::Max<Int>(max_ncols_per_bin));
FieldLayout bin_to_cols_layout = ncols_per_bin_layout.append_dim({COL}, {1+max_ncols_per_bin});
FieldIdentifier bin_to_cols_id("columns in each zonal bin",
bin_to_cols_layout, FieldIdentifier::Units::nondimensional(),
field_id.get_grid_name(), DataType::IntType);
m_bin_to_cols = Field(bin_to_cols_id);
m_bin_to_cols.allocate_view();

// compute bin to column map, where the (i,j)-th entry is such that
// - for j=0, the entry is the number of columns in the i-th zonal bin
// - for j>0, the entry is a column index in the i-th zonal bin
auto bin_to_cols_view = m_bin_to_cols.get_view<Int **>();
Kokkos::parallel_for("assign_columns_to_zonal_bins_" + field.name(),
RangePolicy(0, m_num_zonal_bins), KOKKOS_LAMBDA(int bin_i) {
const Real lat_lower = sp(-90.0) + bin_i * lat_delta;
const Real lat_upper = (bin_i < num_zonal_bins-1)
? lat_lower + lat_delta : sp(90.0 + 0.5*lat_delta);
bin_to_cols_view(bin_i, 0) = 0;
for (int col_i=0; col_i < ncols; col_i++)
{
if ((lat_lower <= lat_view(col_i)) && (lat_view(col_i) < lat_upper))
{
bin_to_cols_view(bin_i, 0) += 1;
bin_to_cols_view(bin_i, bin_to_cols_view(bin_i,0)) = col_i;
}
}
Comment on lines +213 to +220
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this loop is what is causing the CUDA errors (I'll confirm this once I have access to an appropriate machine).

The loop is a plain old for loop because the process of building lists of column indices is inherently serial (i.e., a parallel for loop would run into many race conditions). Looking through the kokkos documentation, I believe this could be made into a parallel_for with the use of either

  1. atomic operations .. although there would technically need to be two atomic operations
  2. a View with an Atomic memory trait
    Any thoughts on one versus the other is greatly appreciated.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could probably accomplish this through a parallel_reduce(), but I would need to take a closer look at the structure to confirm. I'll take a look at this Monday, and review the PR as a whole.

Copy link
Contributor

@bartgol bartgol Aug 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is done during initialization, I am not too concerned about this. That said, you could split the work into 3 kernels:

  1. compute the bin index each col belongs to
pfor (policy, KOKKOS_LAMBDA (int i) {
  int ibin = ... // figure out the index
  col_bin(icol) = ibin;
});
  1. Run a || for over cols with a || reduce to compute the bins size. @tcclevenger prob knows how to use an array as accumulator. Something that looks like
preduce(policy,KOKKOS_LAMBDA(int icol, Array& local_sizes) {
  int ibin = col_bin(icol);
  ++local_sizes[ibin]'
},bin_sizes);
  1. Proceed with the gathering of indices:
pfor(policy,KOKKOS_LAMBDA(int ibin){
  for (int icol=0, k=0; icol<ncols; ++icol) {
    if (col_bin(icol)!=ibin) continue;
      bin_columns(bin,k) = icol;
      ++k;
  }
});

Between 2 and 3 you can also check that the max of bin_sizes is indeed less then max_ncols_per_bin. Your code may be attempting to access the view out of bounds if the max was not large enough.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This does not really make the code faster, but it removes the need for a max_ncols_per_bin, as you could create the diagnostic after step 2, once you know all the bins sizes.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your code may be attempting to access the view out of bounds if the max was not large enough.

This was more-or-less the issue: I was using std::max on device, which was resulting in the max being zero. This should be corrected now in 73dc4dc

This does not really make the code faster, but it removes the need for a max_ncols_per_bin, as you could create the diagnostic after step 2, once you know all the bins sizes.

This makes sense to me. The only reason I opted to keep the original approach is that I don't have experience using an array as an accumulator.

});

// allocate zonal area
const FieldIdentifier &area_id = m_scaled_area.get_header().get_identifier();
FieldLayout zonal_area_layout({CMP}, {m_num_zonal_bins}, {"bin"});
Expand All @@ -166,27 +234,26 @@ void ZonalAvgDiag::initialize_impl(const RunType /*run_type*/) {
Field ones(ones_id);
ones.allocate_view();
ones.deep_copy(1.0);
compute_zonal_sum(zonal_area, m_scaled_area, ones, m_lat, &m_comm);
compute_zonal_sum(zonal_area, m_scaled_area, ones, m_bin_to_cols, &m_comm);

// scale area by 1 / zonal area
using RangePolicy = Kokkos::RangePolicy<Field::device_t::execution_space>;
const Real lat_delta = sp(180.0) / m_num_zonal_bins;
const int ncols = field_layout.dim(0);
const int nbins = m_num_zonal_bins;
auto lat_view = m_lat.get_view<const Real *>();
auto zonal_area_view = zonal_area.get_view<const Real *>();
auto scaled_area_view = m_scaled_area.get_view<Real *>();
Kokkos::parallel_for(
"scale_area_by_zonal_area_" + field.name(), RangePolicy(0, ncols),
KOKKOS_LAMBDA(const int &i) {
const int lat_i = ekat::impl::min(static_cast<int>((lat_view(i) + sp(90.0)) / lat_delta),nbins-1);
scaled_area_view(i) /= zonal_area_view(lat_i);
Kokkos::parallel_for("scale_area_by_zonal_area_" + field.name(),
team_policy, KOKKOS_LAMBDA(const TeamMember &tm) {
const int bin_i = tm.league_rank();
Kokkos::parallel_for(
Kokkos::TeamVectorRange(tm, 1, 1+bin_to_cols_view(bin_i,0)),
[&](int lcol_j) {
const int col_i = bin_to_cols_view(bin_i, lcol_j);
scaled_area_view(col_i) /= zonal_area_view(bin_i);
});
});
}

void ZonalAvgDiag::compute_diagnostic_impl() {
const auto &field = get_fields_in().front();
compute_zonal_sum(m_diagnostic_output, field, m_scaled_area, m_lat, &m_comm);
compute_zonal_sum(m_diagnostic_output, field, m_scaled_area, m_bin_to_cols, &m_comm);
}

} // namespace scream
19 changes: 6 additions & 13 deletions components/eamxx/src/diagnostics/zonal_avg.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ namespace scream {
* This diagnostic will calculate area-weighted zonal averages of a field across
* the COL tag dimension producing an N dimensional field, where the COL tag
* dimension is replaced by a CMP tag dimension named "bin" that indicates which
* zonal band the average value corresponds to.
* zonal band the average value corresponds to. All bins are "closed" at the
* lower value and "open" at the upper value (lat_lower <= lat < lat_upper),
* with the exception of the "last" bin that is closed at both ends to capture
* any column that is centered at the northern pole (lat_lower <= lat <= 90).
*/

class ZonalAvgDiag : public AtmosphereDiagnostic {
Expand All @@ -30,24 +33,14 @@ class ZonalAvgDiag : public AtmosphereDiagnostic {
void initialize_impl(const RunType /*run_type*/);
void compute_diagnostic_impl();

// TODO: make it a local function in the cpp file
// Utility to compute the contraction of a field along its column dimension.
// This is equivalent to f_out = einsum('i,i...k->...k', weight, f_in).
// The implementation is such that:
// - all Field objects must be allocated
// - the first dimension for field, weight, and lat is for the columns (COL)
// - the first dimension for result is for the zonal bins (CMP,"bin")
// - field and result must be the same dimension, up to 3
// TODO: make it a local function in the cpp file
static void compute_zonal_sum(const Field &result, const Field &field, const Field &weight,
const Field &lat, const ekat::Comm *comm = nullptr);

protected:
std::string m_diag_name;
int m_num_zonal_bins;

Field m_lat;
Field m_scaled_area;
Field m_bin_to_cols;

};

} // namespace scream
Expand Down
Loading