Skip to content

Commit 23d1c35

Browse files
authored
Replace blockDim.x with compile time constant when possible (#4198)
This improves performance for some kernels.
1 parent bef725e commit 23d1c35

File tree

5 files changed

+34
-37
lines changed

5 files changed

+34
-37
lines changed

Src/AmrCore/AMReX_TagBox.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
472472
{
473473
int bid = blockIdx.x;
474474
int tid = threadIdx.x;
475-
int icell = blockDim.x*blockIdx.x+threadIdx.x;
475+
int icell = block_size*blockIdx.x+threadIdx.x;
476476

477477
int t = 0;
478478
if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -558,7 +558,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
558558
{
559559
int bid = blockIdx.x;
560560
int tid = threadIdx.x;
561-
int icell = blockDim.x*blockIdx.x+threadIdx.x;
561+
int icell = block_size*blockIdx.x+threadIdx.x;
562562

563563
Gpu::SharedMemory<unsigned int> gsm;
564564
unsigned int * shared_counter = gsm.dataPtr();

Src/Base/AMReX_GpuLaunchFunctsG.H

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -806,7 +806,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
806806
if (tid < nleft) {
807807
detail::call_f_scalar_handler(f, tid+start_idx,
808808
Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
809-
(std::uint64_t)blockDim.x)));
809+
(std::uint64_t)MT)));
810810
}
811811
});
812812
}
@@ -829,7 +829,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
829829
auto iv = indexer.intVect(icell);
830830
detail::call_f_intvect_handler(f, iv,
831831
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
832-
(std::uint64_t)blockDim.x)));
832+
(std::uint64_t)MT)));
833833
}
834834
});
835835
}
@@ -852,7 +852,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
852852
auto iv = indexer.intVect(icell);
853853
detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
854854
Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
855-
(std::uint64_t)blockDim.x)));
855+
(std::uint64_t)MT)));
856856
}
857857
});
858858
}
@@ -870,9 +870,9 @@ ParallelForRNG (T n, L const& f) noexcept
870870
amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
871871
ec.numThreads, 0, Gpu::gpuStream(),
872872
[=] AMREX_GPU_DEVICE () noexcept {
873-
Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
873+
Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
874874
RandomEngine engine{&(rand_state[tid])};
875-
for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
875+
for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
876876
f(T(i),engine);
877877
}
878878
});
@@ -892,9 +892,9 @@ ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
892892
amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
893893
ec.numThreads, 0, Gpu::gpuStream(),
894894
[=] AMREX_GPU_DEVICE () noexcept {
895-
auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
895+
auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
896896
RandomEngine engine{&(rand_state[tid])};
897-
for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
897+
for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
898898
auto iv = indexer.intVect(icell);
899899
detail::call_f_intvect_engine(f, iv, engine);
900900
}
@@ -915,9 +915,9 @@ ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
915915
amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
916916
ec.numThreads, 0, Gpu::gpuStream(),
917917
[=] AMREX_GPU_DEVICE () noexcept {
918-
auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
918+
auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
919919
RandomEngine engine{&(rand_state[tid])};
920-
for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
920+
for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
921921
auto iv = indexer.intVect(icell);
922922
detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
923923
}
@@ -938,7 +938,7 @@ ParallelFor (Gpu::KernelInfo const&,
938938
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
939939
[=] AMREX_GPU_DEVICE () noexcept {
940940
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
941-
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
941+
for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
942942
icell < ncells; icell += stride) {
943943
if (icell < indexer1.numPts()) {
944944
auto iv = indexer1.intVect(icell);
@@ -967,7 +967,7 @@ ParallelFor (Gpu::KernelInfo const&,
967967
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
968968
[=] AMREX_GPU_DEVICE () noexcept {
969969
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
970-
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
970+
for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
971971
icell < ncells; icell += stride) {
972972
if (icell < indexer1.numPts()) {
973973
auto iv = indexer1.intVect(icell);
@@ -1001,7 +1001,7 @@ ParallelFor (Gpu::KernelInfo const&,
10011001
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
10021002
[=] AMREX_GPU_DEVICE () noexcept {
10031003
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
1004-
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
1004+
for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
10051005
icell < ncells; icell += stride) {
10061006
if (icell < indexer1.numPts()) {
10071007
auto iv = indexer1.intVect(icell);
@@ -1034,7 +1034,7 @@ ParallelFor (Gpu::KernelInfo const&,
10341034
AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
10351035
[=] AMREX_GPU_DEVICE () noexcept {
10361036
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
1037-
for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
1037+
for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
10381038
icell < ncells; icell += stride) {
10391039
if (icell < indexer1.numPts()) {
10401040
auto iv = indexer1.intVect(icell);

Src/Base/AMReX_MultiFabUtil.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -855,10 +855,10 @@ namespace amrex
855855
{
856856
#ifdef AMREX_USE_SYCL
857857
int i1d = h.blockIdx() / n2dblocks;
858-
int i2d = h.threadIdx() + h.blockDim()*(h.blockIdx()-i1d*n2dblocks);
858+
int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks);
859859
#else
860860
int i1d = blockIdx.x / n2dblocks;
861-
int i2d = threadIdx.x + blockDim.x*(blockIdx.x-i1d*n2dblocks);
861+
int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks);
862862
#endif
863863
int i2dy = i2d / n2dx;
864864
int i2dx = i2d - i2dy*n2dx;

Src/Base/AMReX_Reduce.H

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,6 @@ public:
516516
{
517517
Dim1 blockIdx {gh.blockIdx()};
518518
Dim1 threadIdx{gh.threadIdx()};
519-
Dim1 blockDim {gh.blockDim()};
520519
Dim1 gridDim {gh.gridDim()};
521520
#else
522521
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -529,7 +528,7 @@ public:
529528
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
530529
dst = r;
531530
}
532-
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
531+
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
533532
icell < ncells; icell += stride) {
534533
int k = icell / lenxy;
535534
int j = (icell - k*lenxy) / lenx;
@@ -575,7 +574,6 @@ public:
575574
{
576575
Dim1 blockIdx {gh.blockIdx()};
577576
Dim1 threadIdx{gh.threadIdx()};
578-
Dim1 blockDim {gh.blockDim()};
579577
Dim1 gridDim {gh.gridDim()};
580578
#else
581579
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -588,7 +586,7 @@ public:
588586
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
589587
dst = r;
590588
}
591-
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
589+
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
592590
icell < ncells; icell += stride) {
593591
int k = icell / lenxy;
594592
int j = (icell - k*lenxy) / lenx;
@@ -632,7 +630,6 @@ public:
632630
{
633631
Dim1 blockIdx {gh.blockIdx()};
634632
Dim1 threadIdx{gh.threadIdx()};
635-
Dim1 blockDim {gh.blockDim()};
636633
Dim1 gridDim {gh.gridDim()};
637634
#else
638635
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -645,7 +642,7 @@ public:
645642
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
646643
dst = r;
647644
}
648-
for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
645+
for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
649646
i < n; i += stride) {
650647
auto pr = f(i);
651648
Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -728,7 +725,7 @@ public:
728725
ReduceTuple dst = r;
729726
for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
730727
auto dp_stream = dp+istream*maxblocks;
731-
for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
728+
for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
732729
i < nblocks[istream]; i += stride) {
733730
Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
734731
}
@@ -871,7 +868,7 @@ bool AnyOf (N n, T const* v, P const& pred)
871868
if (!(*has_any))
872869
{
873870
int r = false;
874-
for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
871+
for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
875872
i < n && !r; i += stride)
876873
{
877874
r = pred(v[i]) ? 1 : 0;
@@ -892,7 +889,7 @@ bool AnyOf (N n, T const* v, P const& pred)
892889
if (!has_any)
893890
{
894891
int r = false;
895-
for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
892+
for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
896893
i < n && !r; i += stride)
897894
{
898895
r = pred(v[i]) ? 1 : 0;
@@ -932,7 +929,7 @@ bool AnyOf (Box const& box, P const& pred)
932929
if (!(*has_any))
933930
{
934931
int r = false;
935-
for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
932+
for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
936933
icell < ncells && !r; icell += stride) {
937934
int k = icell / lenxy;
938935
int j = (icell - k*lenxy) / lenx;
@@ -958,7 +955,7 @@ bool AnyOf (Box const& box, P const& pred)
958955
if (!has_any)
959956
{
960957
int r = false;
961-
for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
958+
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
962959
icell < ncells && !r; icell += stride) {
963960
int k = icell / lenxy;
964961
int j = (icell - k*lenxy) / lenx;

Src/Base/AMReX_Scan.H

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -676,7 +676,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
676676
{
677677
auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
678678
auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
679-
const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
679+
const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
680680
if (gid == 0) { scan_bid.reset(); }
681681
scan_tile_state.initialize_prefix(gid, nblocks);
682682
});
@@ -755,7 +755,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
755755
rocprim::plus<T>());
756756
}
757757
if (totalsum_p) {
758-
if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
758+
if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
759759
T tsum = data[nelms_per_thread-1];
760760
AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
761761
*totalsum_p = tsum;
@@ -768,7 +768,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
768768
BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
769769

770770
for (int i = 0; i < nelms_per_thread; ++i) {
771-
N offset = ibegin + i*blockDim.x + threadIdx.x;
771+
N offset = ibegin + i*nthreads + threadIdx.x;
772772
if (offset < iend) { fout(offset, data[i]); }
773773
}
774774
});
@@ -888,7 +888,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
888888
BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
889889
}
890890
if (totalsum_p) {
891-
if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
891+
if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
892892
T tsum = data[nelms_per_thread-1];
893893
AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
894894
*totalsum_p = tsum;
@@ -901,7 +901,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
901901
BlockExchange(temp_storage.exchange).BlockedToStriped(data);
902902

903903
for (int i = 0; i < nelms_per_thread; ++i) {
904-
N offset = ibegin + i*blockDim.x + threadIdx.x;
904+
N offset = ibegin + i*nthreads + threadIdx.x;
905905
if (offset < iend) { fout(offset, data[i]); }
906906
}
907907
});
@@ -962,7 +962,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
962962
{
963963
int lane = threadIdx.x % Gpu::Device::warp_size;
964964
int warp = threadIdx.x / Gpu::Device::warp_size;
965-
int nwarps = blockDim.x / Gpu::Device::warp_size;
965+
int nwarps = nthreads / Gpu::Device::warp_size;
966966

967967
amrex::Gpu::SharedMemory<T> gsm;
968968
T* shared = gsm.dataPtr();
@@ -999,7 +999,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
999999
T sum_prev_chunk = 0; // inclusive sum from previous chunks.
10001000
T tmp_out[nchunks]; // block-wide inclusive sum for chunks
10011001
for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1002-
N offset = ibegin + ichunk*blockDim.x;
1002+
N offset = ibegin + ichunk*nthreads;
10031003
if (offset >= iend) { break; }
10041004

10051005
offset += threadIdx.x;
@@ -1074,7 +1074,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
10741074

10751075
if (virtual_block_id == 0) {
10761076
for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1077-
N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
1077+
N offset = ibegin + ichunk*nthreads + threadIdx.x;
10781078
if (offset >= iend) { break; }
10791079
fout(offset, tmp_out[ichunk]);
10801080
if (offset == n-1) {
@@ -1136,7 +1136,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
11361136
T exclusive_prefix = shared[0];
11371137

11381138
for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
1139-
N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
1139+
N offset = ibegin + ichunk*nthreads + threadIdx.x;
11401140
if (offset >= iend) { break; }
11411141
T t = tmp_out[ichunk] + exclusive_prefix;
11421142
fout(offset, t);

0 commit comments

Comments
 (0)