diff --git a/CMakeLists.txt b/CMakeLists.txt index 80551ce9..a77ac543 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -481,6 +481,12 @@ endif() if(ENABLE_INTEGRATED_TESTS) enable_testing() + + # NOTE: This can also be done by a subdirectory (useful when we have multiple tests) + # Add includes for Catch2 + set(TEST_DIR "./test/include") + include_directories(${TEST_DIR}) + add_subdirectory(test/integrated) endif(ENABLE_INTEGRATED_TESTS) diff --git a/src/particle_operations/sort.h b/src/particle_operations/sort.h index 3f37f699..97a7a90d 100644 --- a/src/particle_operations/sort.h +++ b/src/particle_operations/sort.h @@ -12,8 +12,8 @@ struct min_max_functor { min_max_functor(const Kokkos::View& view_) : view(view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t& i, minmax_scalar& minmax) const { - if(view(i) < minmax.min_val && view(i) != 0) minmax.min_val = view(i); - if(view(i) > minmax.max_val && view(i) != 0) minmax.max_val = view(i); + if(view(i) < minmax.min_val) minmax.min_val = view(i); + if(view(i) > minmax.max_val) minmax.max_val = view(i); } }; @@ -23,8 +23,8 @@ struct min_max_functor_u64 { min_max_functor_u64(const Kokkos::View& view_) : view(view_) {} KOKKOS_INLINE_FUNCTION void operator()(const size_t& i, minmax_scalar& minmax) const { - if(view(i) < minmax.min_val && view(i) != 0) minmax.min_val = view(i); - if(view(i) > minmax.max_val && view(i) != 0) minmax.max_val = view(i); + if(view(i) < minmax.min_val) minmax.min_val = view(i); + if(view(i) > minmax.max_val) minmax.max_val = view(i); } }; @@ -201,21 +201,19 @@ struct DefaultSort { Kokkos::deep_copy(bin_counter, 0); // Count number of particles in each cell Kokkos::parallel_for("get max nppc", Kokkos::RangePolicy<>(0, np), KOKKOS_LAMBDA(const int i) { - Kokkos::atomic_increment(&(bin_counter(key_view(i)))); + Kokkos::atomic_inc(&(bin_counter(key_view(i)))); }); // Find the max and min number of particles per cell Kokkos::parallel_reduce("Get max/min nppc", Kokkos::RangePolicy<>(0,num_bins), min_max_functor(bin_counter), nppc_reducer); + const int chunk_size = tile_size*(nppc_result.max_val+1); // Reset bin_counter Kokkos::deep_copy(bin_counter, 0); // Update particle indices Kokkos::parallel_for("Update keys", Kokkos::RangePolicy<>(0, np), KOKKOS_LAMBDA(const int i) { - int count = Kokkos::atomic_fetch_add(&(bin_counter(key_view(i))), 1); - int chunk_size = tile_size*nppc_result.max_val; - int chunk = (key_view(i)-result.min_val)/tile_size; - int min_idx = result.min_val + chunk*tile_size; - int offset = count*nppc_result.max_val; - key_view(i) += chunk*chunk_size + offset - min_idx + 1; + const int count = Kokkos::atomic_fetch_add(&(bin_counter(key_view(i))), 1); + const int chunk_idx = (key_view(i)-(result.min_val))/tile_size; + key_view(i) += chunk_idx*chunk_size + count*tile_size - result.min_val; }); // Find smallest and largest index Kokkos::parallel_reduce("Get min/max bin", Kokkos::RangePolicy<>(0,particles_i.extent(0)), @@ -228,7 +226,7 @@ struct DefaultSort { Comparator comp(np, result.min_val, result.max_val); // Sort and create permutation View - int sort_within_bins = 0; + int sort_within_bins = 1; Kokkos::BinSort bin_sort(keys, 0, np, comp, sort_within_bins ); bin_sort.create_permute_vector(); diff --git a/src/species_advance/species_advance.h b/src/species_advance/species_advance.h index 80bfb63e..1db98364 100644 --- a/src/species_advance/species_advance.h +++ b/src/species_advance/species_advance.h @@ -159,6 +159,11 @@ class species_t { k_particle_movers_t::HostMirror k_pm_h; // kokkos particle movers on host k_particle_i_movers_t::HostMirror k_pm_i_h; // kokkos particle movers on host + k_counter_t num_temp_movers_d; + k_counter_t::HostMirror num_temp_movers_h; + k_particle_movers_t temp_movers; + k_particle_i_movers_t temp_movers_i; + // TODO: what is an iterator here?? k_counter_t k_nm_d; // nm iterator k_counter_t::HostMirror k_nm_h; @@ -227,6 +232,11 @@ class species_t { k_nm_h = Kokkos::create_mirror_view(k_nm_d); clean_up_from_count_h = Kokkos::create_mirror_view(clean_up_from_count); + +num_temp_movers_d = k_counter_t("Num temp movers"); +num_temp_movers_h = Kokkos::create_mirror_view(num_temp_movers_d); +temp_movers = k_particle_movers_t("Temp movers", n_particles); +temp_movers_i = k_particle_i_movers_t("Temp mover indices", n_particles); } /** @@ -442,14 +452,23 @@ move_p_kokkos( auto scatter_access = scatter_view.access(); q = qsp*p_w; + int ii = pii; + float r[3]; + r[0] = p_dx; + r[1] = p_dy; + r[2] = p_dz; //printf("in move %d \n", pi); for(;;) { - int ii = pii; - s_midx = p_dx; - s_midy = p_dy; - s_midz = p_dz; +// int ii = pii; +// s_midx = p_dx; +// s_midy = p_dy; +// s_midz = p_dz; + + s_midx = r[0]; + s_midy = r[1]; + s_midz = r[2]; s_dispx = pm->dispx; @@ -570,14 +589,23 @@ move_p_kokkos( //printf("pre axis %d x %e y %e z %e disp x %e y %e z %e\n", axis, p_dx, p_dy, p_dz, s_dispx, s_dispy, s_dispz); // Compute the new particle offset - p_dx += s_dispx+s_dispx; - p_dy += s_dispy+s_dispy; - p_dz += s_dispz+s_dispz; +// p_dx += s_dispx+s_dispx; +// p_dy += s_dispy+s_dispy; +// p_dz += s_dispz+s_dispz; + r[0] += s_dispx+s_dispx; + r[1] += s_dispy+s_dispy; + r[2] += s_dispz+s_dispz; // If an end streak, return success (should be ~50% of the time) //printf("axis %d x %e y %e z %e disp x %e y %e z %e\n", axis, p_dx, p_dy, p_dz, s_dispx, s_dispy, s_dispz); - if( axis==3 ) break; + if( axis==3 ) { + pii = ii; + p_dx = r[0]; + p_dy = r[1]; + p_dz = r[2]; + break; + } // Determine if the particle crossed into a local cell or if it // hit a boundary and convert the coordinate system accordingly. @@ -587,7 +615,8 @@ move_p_kokkos( // +/-1 _exactly_ for the particle. v0 = s_dir[axis]; - k_particles(pi, particle_var::dx + axis) = v0; // Avoid roundoff fiascos--put the particle + //k_particles(pi, particle_var::dx + axis) = v0; // Avoid roundoff fiascos--put the particle + r[axis] = v0; // Avoid roundoff fiascos--put the particle // _exactly_ on the boundary. face = axis; if( v0>0 ) face += 3; @@ -617,16 +646,22 @@ move_p_kokkos( // Cannot handle the boundary condition here. Save the updated // particle position, face it hit and update the remaining // displacement in the particle mover. - pii = 8*pii + face; + //pii = 8*pii + face; + pii = 8*ii + face; + p_dx = r[0]; + p_dy = r[1]; + p_dz = r[2]; return 1; // Return "mover still in use" - } + } // Crossed into a normal voxel. Update the voxel index, convert the // particle coordinate system and keep moving the particle. - pii = neighbor - rangel; + //pii = neighbor - rangel; + ii = neighbor - rangel; /**/ // Note: neighbor - rangel < 2^31 / 6 - k_particles(pi, particle_var::dx + axis) = -v0; // Convert coordinate system + //k_particles(pi, particle_var::dx + axis) = -v0; // Convert coordinate system + r[axis] = -v0; // Convert coordinate system } #undef p_dx #undef p_dy diff --git a/src/species_advance/standard/advance_p.cc b/src/species_advance/standard/advance_p.cc index d8e94aa0..4067dd0d 100644 --- a/src/species_advance/standard/advance_p.cc +++ b/src/species_advance/standard/advance_p.cc @@ -335,31 +335,32 @@ void load_interpolators( void advance_p_kokkos_unified( - k_particles_t& k_particles, - k_particles_i_t& k_particles_i, - k_particle_copy_t& k_particle_copy, - k_particle_i_copy_t& k_particle_i_copy, - k_particle_movers_t& k_particle_movers, - k_particle_i_movers_t& k_particle_movers_i, - k_field_sa_t k_f_sa, - k_interpolator_t& k_interp, - //k_particle_movers_t k_local_particle_movers, - k_counter_t& k_nm, - k_neighbor_t& k_neighbors, - field_array_t* RESTRICT fa, - const grid_t *g, - const float qdt_2mc, - const float cdt_dx, - const float cdt_dy, - const float cdt_dz, - const float qsp, - const int np, - const int max_nm, - const int nx, - const int ny, - const int nz) + species_t* sp, + field_array_t* fa, + interpolator_array_t* ia) { - + const grid_t* g = sp->g; + k_particles_t k_particles = sp->k_p_d; + k_particles_i_t k_particles_i = sp->k_p_i_d; + k_particle_copy_t k_particle_copy = sp->k_pc_d; + k_particle_i_copy_t k_particle_i_copy = sp->k_pc_i_d; + k_particle_movers_t k_particle_movers = sp->k_pm_d; + k_particle_i_movers_t k_particle_movers_i = sp->k_pm_i_d; + k_counter_t k_nm = sp->k_nm_d; + k_neighbor_t k_neighbors = g->k_neighbor_d; + k_field_sa_t k_f_sa = fa->k_field_sa_d; + k_interpolator_t k_interp = ia->k_i_d; + const float qsp = sp->q; + const float qdt_2mc = (qsp*g->dt)/(2*sp->m*g->cvac); + const float cdt_dx = g->cvac*g->dt*g->rdx; + const float cdt_dy = g->cvac*g->dt*g->rdy; + const float cdt_dz = g->cvac*g->dt*g->rdz; + const int np = sp->np; + const int max_nm = sp->max_nm; + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + constexpr float one = 1.; constexpr float one_third = 1./3.; constexpr float two_fifteenths = 2./15.; @@ -775,38 +776,54 @@ advance_p_kokkos_unified( void advance_p_kokkos_gpu( - k_particles_t& k_particles, - k_particles_i_t& k_particles_i, - k_particle_copy_t& k_particle_copy, - k_particle_i_copy_t& k_particle_i_copy, - k_particle_movers_t& k_particle_movers, - k_particle_i_movers_t& k_particle_movers_i, - k_field_sa_t k_f_sa, - k_interpolator_t& k_interp, - k_counter_t& k_nm, - k_neighbor_t& k_neighbors, - field_array_t* RESTRICT fa, - const grid_t *g, - const float qdt_2mc, - const float cdt_dx, - const float cdt_dy, - const float cdt_dz, - const float qsp, - const int np, - const int max_nm, - const int nx, - const int ny, - const int nz) + species_t* sp, + field_array_t* fa, + interpolator_array_t* ia) { + const grid_t* g = sp->g; + k_particles_t k_particles = sp->k_p_d; + k_particles_i_t k_particles_i = sp->k_p_i_d; + k_particle_copy_t k_particle_copy = sp->k_pc_d; + k_particle_i_copy_t k_particle_i_copy = sp->k_pc_i_d; + k_particle_movers_t k_particle_movers = sp->k_pm_d; + k_particle_i_movers_t k_particle_movers_i = sp->k_pm_i_d; + k_counter_t k_nm = sp->k_nm_d; + k_neighbor_t k_neighbors = g->k_neighbor_d; + k_field_sa_t k_f_sa = fa->k_field_sa_d; + k_interpolator_t k_interp = ia->k_i_d; + const float qsp = sp->q; + const float qdt_2mc = (qsp*g->dt)/(2*sp->m*g->cvac); + const float cdt_dx = g->cvac*g->dt*g->rdx; + const float cdt_dy = g->cvac*g->dt*g->rdy; + const float cdt_dz = g->cvac*g->dt*g->rdz; + const int np = sp->np; + const int max_nm = sp->max_nm; + const int nx = g->nx; + const int ny = g->ny; + const int nz = g->nz; + float cx = 0.25 * g->rdy * g->rdz / g->dt; + float cy = 0.25 * g->rdz * g->rdx / g->dt; + float cz = 0.25 * g->rdx * g->rdy / g->dt; constexpr float one = 1.; constexpr float one_third = 1./3.; constexpr float two_fifteenths = 2./15.; + +// k_counter_t num_temp_movers_d("Num temp movers"); +// k_counter_t::HostMirror num_temp_movers_h = Kokkos::create_mirror_view(num_temp_movers_d); +// Kokkos::deep_copy(num_temp_movers_d, 0); +// k_particle_movers_t temp_movers("Temp movers", np); +// k_particle_i_movers_t temp_movers_i("Temp mover indices", np); +// Kokkos::View > atomic_nm = num_temp_movers_d; + auto num_temp_movers_d = sp->num_temp_movers_d; + auto num_temp_movers_h = sp->num_temp_movers_h; + auto temp_movers = sp->temp_movers; + auto temp_movers_i = sp->temp_movers_i; + Kokkos::deep_copy(num_temp_movers_d, 0); + k_field_t k_field = fa->k_f_d; k_field_sa_t k_f_sv = Kokkos::Experimental::create_scatter_view<>(k_field); - float cx = 0.25 * g->rdy * g->rdz / g->dt; - float cy = 0.25 * g->rdz * g->rdx / g->dt; - float cz = 0.25 * g->rdx * g->rdy / g->dt; + Kokkos::View> k_interp_const = k_interp; // Process particles for this pipeline @@ -819,28 +836,28 @@ advance_p_kokkos_gpu( #define p_w k_particles(p_index, particle_var::w) #define pii k_particles_i(p_index) - #define f_cbx k_interp(ii, interpolator_var::cbx) - #define f_cby k_interp(ii, interpolator_var::cby) - #define f_cbz k_interp(ii, interpolator_var::cbz) - #define f_ex k_interp(ii, interpolator_var::ex) - #define f_ey k_interp(ii, interpolator_var::ey) - #define f_ez k_interp(ii, interpolator_var::ez) + #define f_cbx k_interp_const(ii, interpolator_var::cbx) + #define f_cby k_interp_const(ii, interpolator_var::cby) + #define f_cbz k_interp_const(ii, interpolator_var::cbz) + #define f_ex k_interp_const(ii, interpolator_var::ex) + #define f_ey k_interp_const(ii, interpolator_var::ey) + #define f_ez k_interp_const(ii, interpolator_var::ez) - #define f_dexdy k_interp(ii, interpolator_var::dexdy) - #define f_dexdz k_interp(ii, interpolator_var::dexdz) + #define f_dexdy k_interp_const(ii, interpolator_var::dexdy) + #define f_dexdz k_interp_const(ii, interpolator_var::dexdz) - #define f_d2exdydz k_interp(ii, interpolator_var::d2exdydz) - #define f_deydx k_interp(ii, interpolator_var::deydx) - #define f_deydz k_interp(ii, interpolator_var::deydz) + #define f_d2exdydz k_interp_const(ii, interpolator_var::d2exdydz) + #define f_deydx k_interp_const(ii, interpolator_var::deydx) + #define f_deydz k_interp_const(ii, interpolator_var::deydz) - #define f_d2eydzdx k_interp(ii, interpolator_var::d2eydzdx) - #define f_dezdx k_interp(ii, interpolator_var::dezdx) - #define f_dezdy k_interp(ii, interpolator_var::dezdy) + #define f_d2eydzdx k_interp_const(ii, interpolator_var::d2eydzdx) + #define f_dezdx k_interp_const(ii, interpolator_var::dezdx) + #define f_dezdy k_interp_const(ii, interpolator_var::dezdy) - #define f_d2ezdxdy k_interp(ii, interpolator_var::d2ezdxdy) - #define f_dcbxdx k_interp(ii, interpolator_var::dcbxdx) - #define f_dcbydy k_interp(ii, interpolator_var::dcbydy) - #define f_dcbzdz k_interp(ii, interpolator_var::dcbzdz) + #define f_d2ezdxdy k_interp_const(ii, interpolator_var::d2ezdxdy) + #define f_dcbxdx k_interp_const(ii, interpolator_var::dcbxdx) + #define f_dcbydy k_interp_const(ii, interpolator_var::dcbydy) + #define f_dcbzdz k_interp_const(ii, interpolator_var::dcbzdz) // copy local memmbers from grid //auto nfaces_per_voxel = 6; @@ -871,7 +888,7 @@ advance_p_kokkos_gpu( #endif float v0, v1, v2, v3, v4, v5; - auto k_field_scatter_access = k_f_sv.access(); + auto field_sa = k_f_sv.access(); float dx = p_dx; // Load position float dy = p_dy; @@ -990,21 +1007,21 @@ advance_p_kokkos_gpu( int i2 = VOXEL(xi,yi,zi+1,nx,ny,nz); int i3 = VOXEL(xi,yi+1,zi+1,nx,ny,nz); ACCUMULATE_J( x,y,z ); - contribute_current(team_member, k_field_scatter_access, i0, i1, i2, i3, + contribute_current(team_member, field_sa, i0, i1, i2, i3, field_var::jfx, cx*v0, cx*v1, cx*v2, cx*v3); i1 = VOXEL(xi,yi,zi+1,nx,ny,nz); i2 = VOXEL(xi+1,yi,zi,nx,ny,nz); i3 = VOXEL(xi+1,yi,zi+1,nx,ny,nz); ACCUMULATE_J( y,z,x ); - contribute_current(team_member, k_field_scatter_access, i0, i1, i2, i3, + contribute_current(team_member, field_sa, i0, i1, i2, i3, field_var::jfy, cy*v0, cy*v1, cy*v2, cy*v3); i1 = VOXEL(xi+1,yi,zi,nx,ny,nz); i2 = VOXEL(xi,yi+1,zi,nx,ny,nz); i3 = VOXEL(xi+1,yi+1,zi,nx,ny,nz); ACCUMULATE_J( z,x,y ); - contribute_current(team_member, k_field_scatter_access, i0, i1, i2, i3, + contribute_current(team_member, field_sa, i0, i1, i2, i3, field_var::jfz, cz*v0, cz*v1, cz*v2, cz*v3); } else { #endif @@ -1015,101 +1032,131 @@ advance_p_kokkos_gpu( int yi = iii/(nx+2); int xi = iii - yi*(nx+2); ACCUMULATE_J( x,y,z ); - //Kokkos::atomic_add(&k_field(ii, field_var::jfx), cx*v0); - //Kokkos::atomic_add(&k_field(VOXEL(xi,yi+1,zi,nx,ny,nz), field_var::jfx), cx*v1); - //Kokkos::atomic_add(&k_field(VOXEL(xi,yi,zi+1,nx,ny,nz), field_var::jfx), cx*v2); - //Kokkos::atomic_add(&k_field(VOXEL(xi,yi+1,zi+1,nx,ny,nz), field_var::jfx), cx*v3); - k_field_scatter_access(ii, field_var::jfx) += cx*v0; - k_field_scatter_access(VOXEL(xi,yi+1,zi,nx,ny,nz), field_var::jfx) += cx*v1; - k_field_scatter_access(VOXEL(xi,yi,zi+1,nx,ny,nz), field_var::jfx) += cx*v2; - k_field_scatter_access(VOXEL(xi,yi+1,zi+1,nx,ny,nz), field_var::jfx) += cx*v3; + field_sa(ii, field_var::jfx) += cx*v0; + field_sa(VOXEL(xi,yi+1,zi, nx,ny,nz), field_var::jfx) += cx*v1; + field_sa(VOXEL(xi,yi, zi+1,nx,ny,nz), field_var::jfx) += cx*v2; + field_sa(VOXEL(xi,yi+1,zi+1,nx,ny,nz), field_var::jfx) += cx*v3; ACCUMULATE_J( y,z,x ); - //Kokkos::atomic_add(&k_field(ii, field_var::jfy), cy*v0); - //Kokkos::atomic_add(&k_field(VOXEL(xi,yi,zi+1,nx,ny,nz), field_var::jfy), cy*v1); - //Kokkos::atomic_add(&k_field(VOXEL(xi+1,yi,zi,nx,ny,nz), field_var::jfy), cy*v2); - //Kokkos::atomic_add(&k_field(VOXEL(xi+1,yi,zi+1,nx,ny,nz), field_var::jfy), cy*v3); - k_field_scatter_access(ii, field_var::jfy) += cy*v0; - k_field_scatter_access(VOXEL(xi,yi,zi+1,nx,ny,nz), field_var::jfy) += cy*v1; - k_field_scatter_access(VOXEL(xi+1,yi,zi,nx,ny,nz), field_var::jfy) += cy*v2; - k_field_scatter_access(VOXEL(xi+1,yi,zi+1,nx,ny,nz), field_var::jfy) += cy*v3; + field_sa(ii, field_var::jfy) += cy*v0; + field_sa(VOXEL(xi, yi,zi+1,nx,ny,nz), field_var::jfy) += cy*v1; + field_sa(VOXEL(xi+1,yi,zi, nx,ny,nz), field_var::jfy) += cy*v2; + field_sa(VOXEL(xi+1,yi,zi+1,nx,ny,nz), field_var::jfy) += cy*v3; ACCUMULATE_J( z,x,y ); - //Kokkos::atomic_add(&k_field(ii, field_var::jfz), cz*v0); - //Kokkos::atomic_add(&k_field(VOXEL(xi+1,yi,zi,nx,ny,nz), field_var::jfz), cz*v1); - //Kokkos::atomic_add(&k_field(VOXEL(xi,yi+1,zi,nx,ny,nz), field_var::jfz), cz*v2); - //Kokkos::atomic_add(&k_field(VOXEL(xi+1,yi+1,zi,nx,ny,nz), field_var::jfz), cz*v3); - k_field_scatter_access(ii, field_var::jfz) += cz*v0; - k_field_scatter_access(VOXEL(xi+1,yi,zi,nx,ny,nz), field_var::jfz) += cz*v1; - k_field_scatter_access(VOXEL(xi,yi+1,zi,nx,ny,nz), field_var::jfz) += cz*v2; - k_field_scatter_access(VOXEL(xi+1,yi+1,zi,nx,ny,nz), field_var::jfz) += cz*v3; + field_sa(ii, field_var::jfz) += cz*v0; + field_sa(VOXEL(xi+1,yi, zi,nx,ny,nz), field_var::jfz) += cz*v1; + field_sa(VOXEL(xi, yi+1,zi,nx,ny,nz), field_var::jfz) += cz*v2; + field_sa(VOXEL(xi+1,yi+1,zi,nx,ny,nz), field_var::jfz) += cz*v3; #ifdef VPIC_ENABLE_TEAM_REDUCTION } #endif # undef ACCUMULATE_J } else { - DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); - local_pm->dispx = ux; - local_pm->dispy = uy; - local_pm->dispz = uz; - local_pm->i = p_index; - - //printf("Calling move_p index %d dx %e y %e z %e ux %e uy %e yz %e \n", p_index, ux, uy, uz, p_ux, p_uy, p_uz); - if( move_p_kokkos( k_particles, k_particles_i, local_pm, // Unlikely - k_f_sv, g, k_neighbors, rangel, rangeh, qsp, cx, cy, cz, nx, ny, nz ) ) - { - if( k_nm(0) < max_nm ) - { - const int nm = Kokkos::atomic_fetch_add( &k_nm(0), 1 ); - if (nm >= max_nm) Kokkos::abort("overran max_nm"); - - k_particle_movers(nm, particle_mover_var::dispx) = local_pm->dispx; - k_particle_movers(nm, particle_mover_var::dispy) = local_pm->dispy; - k_particle_movers(nm, particle_mover_var::dispz) = local_pm->dispz; - k_particle_movers_i(nm) = local_pm->i; - - // Keep existing mover structure, but also copy the particle data so we have a reduced set to move to host - k_particle_copy(nm, particle_var::dx) = p_dx; - k_particle_copy(nm, particle_var::dy) = p_dy; - k_particle_copy(nm, particle_var::dz) = p_dz; - k_particle_copy(nm, particle_var::ux) = p_ux; - k_particle_copy(nm, particle_var::uy) = p_uy; - k_particle_copy(nm, particle_var::uz) = p_uz; - k_particle_copy(nm, particle_var::w) = p_w; - k_particle_i_copy(nm) = pii; - - // Tag this one as having left - //k_particles(p_index, particle_var::pi) = 999999; - - // Copy local local_pm back - //local_pm_dispx = local_pm->dispx; - //local_pm_dispy = local_pm->dispy; - //local_pm_dispz = local_pm->dispz; - //local_pm_i = local_pm->i; - //printf("rank copying %d to nm %d \n", local_pm_i, nm); - //copy_local_to_pm(nm); - } - } + int idx = Kokkos::atomic_fetch_add(&num_temp_movers_d(0), 1); + //int idx = atomic_nm(0)++; + //int idx = num_temp_movers_d(0)++; + temp_movers(idx, 0) = ux; + temp_movers(idx, 1) = uy; + temp_movers(idx, 2) = uz; + temp_movers_i(idx) = p_index; + +// DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); +// local_pm->dispx = ux; +// local_pm->dispy = uy; +// local_pm->dispz = uz; +// local_pm->i = p_index; +// +// //printf("Calling move_p index %d dx %e y %e z %e ux %e uy %e yz %e \n", p_index, ux, uy, uz, p_ux, p_uy, p_uz); +// if( move_p_kokkos( k_particles, k_particles_i, local_pm, // Unlikely +// k_f_sv, g, k_neighbors, rangel, rangeh, qsp, cx, cy, cz, nx, ny, nz ) ) +// { +// if( k_nm(0) < max_nm ) +// { +// const int nm = Kokkos::atomic_fetch_add( &k_nm(0), 1 ); +// if (nm >= max_nm) Kokkos::abort("overran max_nm"); +// +// k_particle_movers(nm, particle_mover_var::dispx) = local_pm->dispx; +// k_particle_movers(nm, particle_mover_var::dispy) = local_pm->dispy; +// k_particle_movers(nm, particle_mover_var::dispz) = local_pm->dispz; +// k_particle_movers_i(nm) = local_pm->i; +// +// // Keep existing mover structure, but also copy the particle data so we have a reduced set to move to host +// k_particle_copy(nm, particle_var::dx) = p_dx; +// k_particle_copy(nm, particle_var::dy) = p_dy; +// k_particle_copy(nm, particle_var::dz) = p_dz; +// k_particle_copy(nm, particle_var::ux) = p_ux; +// k_particle_copy(nm, particle_var::uy) = p_uy; +// k_particle_copy(nm, particle_var::uz) = p_uz; +// k_particle_copy(nm, particle_var::w) = p_w; +// k_particle_i_copy(nm) = pii; +// +// // Tag this one as having left +// //k_particles(p_index, particle_var::pi) = 999999; +// +// // Copy local local_pm back +// //local_pm_dispx = local_pm->dispx; +// //local_pm_dispy = local_pm->dispy; +// //local_pm_dispz = local_pm->dispz; +// //local_pm_i = local_pm->i; +// //printf("rank copying %d to nm %d \n", local_pm_i, nm); +// //copy_local_to_pm(nm); +// } +// } } #ifdef VPIC_ENABLE_HIERARCHICAL } }); #endif }); + Kokkos::deep_copy(num_temp_movers_h, num_temp_movers_d); + Kokkos::parallel_for("advance_p_move_p", Kokkos::RangePolicy<>(0, num_temp_movers_h(0)), KOKKOS_LAMBDA(const int idx) { + DECLARE_ALIGNED_ARRAY( particle_mover_t, 16, local_pm, 1 ); + local_pm->dispx = temp_movers(idx, 0); + local_pm->dispy = temp_movers(idx, 1); + local_pm->dispz = temp_movers(idx, 2); + local_pm->i = temp_movers_i(idx); + const int p_index = temp_movers_i(idx); + + //printf("Calling move_p index %d dx %e y %e z %e ux %e uy %e yz %e \n", p_index, ux, uy, uz, p_ux, p_uy, p_uz); + if( move_p_kokkos( k_particles, k_particles_i, local_pm, // Unlikely + k_f_sv, g, k_neighbors, rangel, rangeh, qsp, cx, cy, cz, nx, ny, nz ) ) + { + if( k_nm(0) < max_nm ) + { + const int nm = Kokkos::atomic_fetch_add( &k_nm(0), 1 ); + if (nm >= max_nm) Kokkos::abort("overran max_nm"); + + k_particle_movers(nm, particle_mover_var::dispx) = local_pm->dispx; + k_particle_movers(nm, particle_mover_var::dispy) = local_pm->dispy; + k_particle_movers(nm, particle_mover_var::dispz) = local_pm->dispz; + k_particle_movers_i(nm) = local_pm->i; + + // Keep existing mover structure, but also copy the particle data so we have a reduced set to move to host + k_particle_copy(nm, particle_var::dx) = p_dx; + k_particle_copy(nm, particle_var::dy) = p_dy; + k_particle_copy(nm, particle_var::dz) = p_dz; + k_particle_copy(nm, particle_var::ux) = p_ux; + k_particle_copy(nm, particle_var::uy) = p_uy; + k_particle_copy(nm, particle_var::uz) = p_uz; + k_particle_copy(nm, particle_var::w) = p_w; + k_particle_i_copy(nm) = pii; + + // Tag this one as having left + //k_particles(p_index, particle_var::pi) = 999999; + + // Copy local local_pm back + //local_pm_dispx = local_pm->dispx; + //local_pm_dispy = local_pm->dispy; + //local_pm_dispz = local_pm->dispz; + //local_pm_i = local_pm->i; + //printf("rank copying %d to nm %d \n", local_pm_i, nm); + //copy_local_to_pm(nm); + } + } + }); Kokkos::Experimental::contribute(k_field, k_f_sv); - - - // TODO: abstract this manual data copy - //Kokkos::deep_copy(h_nm, k_nm); - - //args->seg[pipeline_rank].pm = pm; - //args->seg[pipeline_rank].max_nm = max_nm; - //args->seg[pipeline_rank].nm = h_nm(0); - //args->seg[pipeline_rank].n_ignored = 0; // TODO: update this - //delete(k_local_particle_movers_p); - //return h_nm(0); - } void @@ -1121,25 +1168,24 @@ advance_p( /**/ species_t * RESTRICT sp, //DECLARE_ALIGNED_ARRAY( particle_mover_seg_t, 128, seg, MAX_PIPELINE+1 ); //int rank; + // Check args if( !sp ) { - ERROR(( "Bad args" )); + ERROR(( "Bad args: advance_p: sp" )); + } + if( !fa ) + { + ERROR(( "Bad args: advance_p: fa" )); } if( !ia ) { - ERROR(( "Bad args" )); + ERROR(( "Bad args: advance_p: ia" )); } if( sp->g!=ia->g ) { - ERROR(( "Bad args" )); + ERROR(( "Bad args: advance_p: sp->g != ia->g" )); } - - float qdt_2mc = (sp->q*sp->g->dt)/(2*sp->m*sp->g->cvac); - float cdt_dx = sp->g->cvac*sp->g->dt*sp->g->rdx; - float cdt_dy = sp->g->cvac*sp->g->dt*sp->g->rdy; - float cdt_dz = sp->g->cvac*sp->g->dt*sp->g->rdz; - #ifdef USE_GPU // Use the gpu kernel for slightly better performance #define ADVANCE_P advance_p_kokkos_gpu @@ -1147,31 +1193,11 @@ advance_p( /**/ species_t * RESTRICT sp, // Portable kernel with additional vectorization options #define ADVANCE_P advance_p_kokkos_unified #endif + KOKKOS_TIC(); - ADVANCE_P( - sp->k_p_d, - sp->k_p_i_d, - sp->k_pc_d, - sp->k_pc_i_d, - sp->k_pm_d, - sp->k_pm_i_d, - fa->k_field_sa_d, - ia->k_i_d, - sp->k_nm_d, - sp->g->k_neighbor_d, - fa, - sp->g, - qdt_2mc, - cdt_dx, - cdt_dy, - cdt_dz, - sp->q, - sp->np, - sp->max_nm, - sp->g->nx, - sp->g->ny, - sp->g->nz - ); + Kokkos::fence(); + ADVANCE_P(sp, fa, ia); + Kokkos::fence(); KOKKOS_TOC( advance_p, 1); KOKKOS_TIC(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 99875ff8..5965f758 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -1,3 +1,4 @@ add_subdirectory(particle_push) add_subdirectory(energy_comparison) add_subdirectory(legacy_comparison) +add_subdirectory(particle_sorting) diff --git a/test/unit/particle_sorting/CMakeLists.txt b/test/unit/particle_sorting/CMakeLists.txt new file mode 100644 index 00000000..a3bf1f73 --- /dev/null +++ b/test/unit/particle_sorting/CMakeLists.txt @@ -0,0 +1,3 @@ +add_executable(particle_sorting sort_tests.cc) +target_link_libraries(particle_sorting vpic Kokkos::kokkos) +add_test(NAME particle_sorting COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} particle_sorting ${MPIEXEC_POSTFLAGS}) diff --git a/test/unit/particle_sorting/sort_tests.cc b/test/unit/particle_sorting/sort_tests.cc new file mode 100644 index 00000000..fbafd0cc --- /dev/null +++ b/test/unit/particle_sorting/sort_tests.cc @@ -0,0 +1,126 @@ +#define CATCH_CONFIG_RUNNER // Use our own main +#include "catch.hpp" +#include "Kokkos_Core.hpp" +#include "src/particle_operations/sort.h" +#include "src/vpic/vpic.h" +#include +#include +#include +#include + +TEST_CASE( "Verify ParticleSorter reorders particles correctly", "[ParticleSorter]" ) +{ + ParticleSorter<> sorter; + + const int tilesize = 7; + const int nx = 3; + const int ny = 5; + const int nz = 13; + const int nppc = 11; + const int num_part = nppc*nx*ny*nz; + + k_particles_t part("Particles", num_part); + k_particles_i_t part_i("Particle cell indices", num_part); + k_particles_t::HostMirror part_h = Kokkos::create_mirror_view(part); + k_particles_i_t::HostMirror part_i_h = Kokkos::create_mirror_view(part_i); + + // Initialize cell indices using stdlib for correctness + std::vector cell_indices; + for(int i=0; i part_policy(0,num_part); + Kokkos::parallel_for("Load indices", part_policy, KOKKOS_LAMBDA(const int i) { + part_i_h(i) = cell_indices[i]; + printf("%d ", part_i_h(i)); + }); + printf("\n"); + + Kokkos::deep_copy(part_i, part_i_h); + Kokkos::fence(); + + // Test different sort functions + SECTION( "Standard sort test" ) { + sorter.standard_sort(part, part_i, num_part, nx*ny*nz); + + Kokkos::deep_copy(part_i_h, part_i); + + // Verify particles are in standard order + for(int i=0; i