Replace blockDim.x with compile-time constant when possible (#4198)

WeiqunZhang · web-flow · commit 23d1c35c6eba · 2024-11-07T19:13:57.000-08:00
This improves performance for some kernels.
diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
@@ -472,7 +472,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         {
             int bid = blockIdx.x;
             int tid = threadIdx.x;
-            int icell = blockDim.x*blockIdx.x+threadIdx.x;
+            int icell = block_size*blockIdx.x+threadIdx.x;
 
             int t = 0;
             if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -558,7 +558,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             {
                 int bid = blockIdx.x;
                 int tid = threadIdx.x;
-                int icell = blockDim.x*blockIdx.x+threadIdx.x;
+                int icell = block_size*blockIdx.x+threadIdx.x;
 
                 Gpu::SharedMemory<unsigned int> gsm;
                 unsigned int * shared_counter = gsm.dataPtr();
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -806,7 +806,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
             if (tid < nleft) {
                 detail::call_f_scalar_handler(f, tid+start_idx,
                     Gpu::Handler(amrex::min((std::uint64_t(nleft-tid)+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -829,7 +829,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
                 auto iv = indexer.intVect(icell);
                 detail::call_f_intvect_handler(f, iv,
                     Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -852,7 +852,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
                 auto iv = indexer.intVect(icell);
                 detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
                     Gpu::Handler(amrex::min((indexer.numPts()-icell+(std::uint64_t)threadIdx.x),
-                    (std::uint64_t)blockDim.x)));
+                    (std::uint64_t)MT)));
             }
         });
     }
@@ -870,9 +870,9 @@ ParallelForRNG (T n, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        Long tid = Long(blockDim.x)*blockIdx.x+threadIdx.x;
+        Long tid = Long(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (Long i = tid, stride = Long(blockDim.x)*gridDim.x; i < Long(n); i += stride) {
+        for (Long i = tid, stride = Long(AMREX_GPU_MAX_THREADS)*gridDim.x; i < Long(n); i += stride) {
             f(T(i),engine);
         }
     });
@@ -892,9 +892,9 @@ ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_engine(f, iv, engine);
         }
@@ -915,9 +915,9 @@ ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
                         amrex::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()),
                         ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
-        auto const tid = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
+        auto const tid = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x;
         RandomEngine engine{&(rand_state[tid])};
-        for (std::uint64_t icell = tid, stride = std::uint64_t(blockDim.x)*gridDim.x; icell < indexer.numPts(); icell += stride) {
+        for (std::uint64_t icell = tid, stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; icell < indexer.numPts(); icell += stride) {
             auto iv = indexer.intVect(icell);
             detail::call_f_intvect_ncomp_engine(f, iv, ncomp, engine);
         }
@@ -938,7 +938,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -967,7 +967,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1001,7 +1001,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
@@ -1034,7 +1034,7 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_LAUNCH_KERNEL(MT, ec.numBlocks, ec.numThreads, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
-        for (std::uint64_t icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x, stride = std::uint64_t(blockDim.x)*gridDim.x;
+        for (std::uint64_t icell = std::uint64_t(MT)*blockIdx.x+threadIdx.x, stride = std::uint64_t(MT)*gridDim.x;
              icell < ncells; icell += stride) {
             if (icell < indexer1.numPts()) {
                 auto iv = indexer1.intVect(icell);
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -855,10 +855,10 @@ namespace amrex
                 {
 #ifdef AMREX_USE_SYCL
                     int i1d = h.blockIdx() / n2dblocks;
-                    int i2d = h.threadIdx() + h.blockDim()*(h.blockIdx()-i1d*n2dblocks);
+                    int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks);
 #else
                     int i1d = blockIdx.x / n2dblocks;
-                    int i2d = threadIdx.x + blockDim.x*(blockIdx.x-i1d*n2dblocks);
+                    int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks);
 #endif
                     int i2dy = i2d / n2dx;
                     int i2dx = i2d - i2dy*n2dx;
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
@@ -516,7 +516,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -529,7 +528,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -575,7 +574,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -588,7 +586,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -632,7 +630,6 @@ public:
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
-            Dim1 blockDim {gh.blockDim()};
             Dim1 gridDim  {gh.gridDim()};
 #else
         amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
@@ -645,7 +642,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -728,7 +725,7 @@ public:
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
@@ -871,7 +868,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -892,7 +889,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -932,7 +929,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = gh.blockDim()*gh.blockIdx()+gh.threadIdx(), stride = gh.blockDim()*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
@@ -958,7 +955,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (int icell = blockDim.x*blockIdx.x+threadIdx.x, stride = blockDim.x*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k =  icell /   lenxy;
                 int j = (icell - k*lenxy) /   lenx;
diff --git a/Src/Base/AMReX_Scan.H b/Src/Base/AMReX_Scan.H
@@ -676,7 +676,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
     {
         auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
         auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
-        const unsigned int gid = blockIdx.x*blockDim.x + threadIdx.x;
+        const unsigned int gid = blockIdx.x*nthreads + threadIdx.x;
         if (gid == 0) { scan_bid.reset(); }
         scan_tile_state.initialize_prefix(gid, nblocks);
     });
@@ -755,7 +755,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
                                            rocprim::plus<T>());
             }
             if (totalsum_p) {
-                if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
+                if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
                     T tsum = data[nelms_per_thread-1];
                     AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
                     *totalsum_p = tsum;
@@ -768,7 +768,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         BlockExchange().blocked_to_striped(data, data, temp_storage.exchange);
 
         for (int i = 0; i < nelms_per_thread; ++i) {
-            N offset = ibegin + i*blockDim.x + threadIdx.x;
+            N offset = ibegin + i*nthreads + threadIdx.x;
             if (offset < iend) { fout(offset, data[i]); }
         }
     });
@@ -888,7 +888,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
                 BlockScan(temp_storage.scan_storeage.scan).InclusiveSum(data, data, prefix_op);
             }
             if (totalsum_p) {
-                if (iend == n && threadIdx.x == blockDim.x-1) { // last thread of last block
+                if (iend == n && threadIdx.x == nthreads-1) { // last thread of last block
                     T tsum = data[nelms_per_thread-1];
                     AMREX_IF_CONSTEXPR(is_exclusive) { tsum += last; }
                     *totalsum_p = tsum;
@@ -901,7 +901,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         BlockExchange(temp_storage.exchange).BlockedToStriped(data);
 
         for (int i = 0; i < nelms_per_thread; ++i) {
-            N offset = ibegin + i*blockDim.x + threadIdx.x;
+            N offset = ibegin + i*nthreads + threadIdx.x;
             if (offset < iend) { fout(offset, data[i]); }
         }
     });
@@ -962,7 +962,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
     {
         int lane = threadIdx.x % Gpu::Device::warp_size;
         int warp = threadIdx.x / Gpu::Device::warp_size;
-        int nwarps = blockDim.x / Gpu::Device::warp_size;
+        int nwarps = nthreads / Gpu::Device::warp_size;
 
         amrex::Gpu::SharedMemory<T> gsm;
         T* shared = gsm.dataPtr();
@@ -999,7 +999,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
         T sum_prev_chunk = 0; // inclusive sum from previous chunks.
         T tmp_out[nchunks]; // block-wide inclusive sum for chunks
         for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-            N offset = ibegin + ichunk*blockDim.x;
+            N offset = ibegin + ichunk*nthreads;
             if (offset >= iend) { break; }
 
             offset += threadIdx.x;
@@ -1074,7 +1074,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
 
         if (virtual_block_id == 0) {
             for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-                N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
+                N offset = ibegin + ichunk*nthreads + threadIdx.x;
                 if (offset >= iend) { break; }
                 fout(offset, tmp_out[ichunk]);
                 if (offset == n-1) {
@@ -1136,7 +1136,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
             T exclusive_prefix = shared[0];
 
             for (int ichunk = 0; ichunk < nchunks; ++ichunk) {
-                N offset = ibegin + ichunk*blockDim.x + threadIdx.x;
+                N offset = ibegin + ichunk*nthreads + threadIdx.x;
                 if (offset >= iend) { break; }
                 T t = tmp_out[ichunk] + exclusive_prefix;
                 fout(offset, t);