@@ -806,7 +806,7 @@ ParallelFor (Gpu::KernelInfo const&, T n, L const& f) noexcept
806
806
if (tid < nleft) {
807
807
detail::call_f_scalar_handler (f, tid+start_idx,
808
808
Gpu::Handler (amrex::min ((std::uint64_t (nleft-tid)+(std::uint64_t )threadIdx.x ),
809
- (std::uint64_t )blockDim. x )));
809
+ (std::uint64_t )MT )));
810
810
}
811
811
});
812
812
}
@@ -829,7 +829,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, L const& f) noexcept
829
829
auto iv = indexer.intVect (icell);
830
830
detail::call_f_intvect_handler (f, iv,
831
831
Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
832
- (std::uint64_t )blockDim. x )));
832
+ (std::uint64_t )MT )));
833
833
}
834
834
});
835
835
}
@@ -852,7 +852,7 @@ ParallelFor (Gpu::KernelInfo const&, BoxND<dim> const& box, T ncomp, L const& f)
852
852
auto iv = indexer.intVect (icell);
853
853
detail::call_f_intvect_ncomp_handler (f, iv, ncomp,
854
854
Gpu::Handler (amrex::min ((indexer.numPts ()-icell+(std::uint64_t )threadIdx.x ),
855
- (std::uint64_t )blockDim. x )));
855
+ (std::uint64_t )MT )));
856
856
}
857
857
});
858
858
}
@@ -870,9 +870,9 @@ ParallelForRNG (T n, L const& f) noexcept
870
870
amrex::min (ec.numBlocks .x , Gpu::Device::maxBlocksPerLaunch ()),
871
871
ec.numThreads , 0 , Gpu::gpuStream (),
872
872
[=] AMREX_GPU_DEVICE () noexcept {
873
- Long tid = Long (blockDim. x )*blockIdx.x +threadIdx.x ;
873
+ Long tid = Long (AMREX_GPU_MAX_THREADS )*blockIdx.x +threadIdx.x ;
874
874
RandomEngine engine{&(rand_state[tid])};
875
- for (Long i = tid, stride = Long (blockDim. x )*gridDim.x ; i < Long (n); i += stride) {
875
+ for (Long i = tid, stride = Long (AMREX_GPU_MAX_THREADS )*gridDim.x ; i < Long (n); i += stride) {
876
876
f (T (i),engine);
877
877
}
878
878
});
@@ -892,9 +892,9 @@ ParallelForRNG (BoxND<dim> const& box, L const& f) noexcept
892
892
amrex::min (ec.numBlocks .x , Gpu::Device::maxBlocksPerLaunch ()),
893
893
ec.numThreads , 0 , Gpu::gpuStream (),
894
894
[=] AMREX_GPU_DEVICE () noexcept {
895
- auto const tid = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x ;
895
+ auto const tid = std::uint64_t (AMREX_GPU_MAX_THREADS )*blockIdx.x +threadIdx.x ;
896
896
RandomEngine engine{&(rand_state[tid])};
897
- for (std::uint64_t icell = tid, stride = std::uint64_t (blockDim. x )*gridDim.x ; icell < indexer.numPts (); icell += stride) {
897
+ for (std::uint64_t icell = tid, stride = std::uint64_t (AMREX_GPU_MAX_THREADS )*gridDim.x ; icell < indexer.numPts (); icell += stride) {
898
898
auto iv = indexer.intVect (icell);
899
899
detail::call_f_intvect_engine (f, iv, engine);
900
900
}
@@ -915,9 +915,9 @@ ParallelForRNG (BoxND<dim> const& box, T ncomp, L const& f) noexcept
915
915
amrex::min (ec.numBlocks .x , Gpu::Device::maxBlocksPerLaunch ()),
916
916
ec.numThreads , 0 , Gpu::gpuStream (),
917
917
[=] AMREX_GPU_DEVICE () noexcept {
918
- auto const tid = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x ;
918
+ auto const tid = std::uint64_t (AMREX_GPU_MAX_THREADS )*blockIdx.x +threadIdx.x ;
919
919
RandomEngine engine{&(rand_state[tid])};
920
- for (std::uint64_t icell = tid, stride = std::uint64_t (blockDim. x )*gridDim.x ; icell < indexer.numPts (); icell += stride) {
920
+ for (std::uint64_t icell = tid, stride = std::uint64_t (AMREX_GPU_MAX_THREADS )*gridDim.x ; icell < indexer.numPts (); icell += stride) {
921
921
auto iv = indexer.intVect (icell);
922
922
detail::call_f_intvect_ncomp_engine (f, iv, ncomp, engine);
923
923
}
@@ -938,7 +938,7 @@ ParallelFor (Gpu::KernelInfo const&,
938
938
AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
939
939
[=] AMREX_GPU_DEVICE () noexcept {
940
940
auto const ncells = std::max (indexer1.numPts (), indexer2.numPts ());
941
- for (std::uint64_t icell = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim. x )*gridDim.x ;
941
+ for (std::uint64_t icell = std::uint64_t (MT )*blockIdx.x +threadIdx.x , stride = std::uint64_t (MT )*gridDim.x ;
942
942
icell < ncells; icell += stride) {
943
943
if (icell < indexer1.numPts ()) {
944
944
auto iv = indexer1.intVect (icell);
@@ -967,7 +967,7 @@ ParallelFor (Gpu::KernelInfo const&,
967
967
AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
968
968
[=] AMREX_GPU_DEVICE () noexcept {
969
969
auto const ncells = std::max ({indexer1.numPts (), indexer2.numPts (), indexer3.numPts ()});
970
- for (std::uint64_t icell = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim. x )*gridDim.x ;
970
+ for (std::uint64_t icell = std::uint64_t (MT )*blockIdx.x +threadIdx.x , stride = std::uint64_t (MT )*gridDim.x ;
971
971
icell < ncells; icell += stride) {
972
972
if (icell < indexer1.numPts ()) {
973
973
auto iv = indexer1.intVect (icell);
@@ -1001,7 +1001,7 @@ ParallelFor (Gpu::KernelInfo const&,
1001
1001
AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
1002
1002
[=] AMREX_GPU_DEVICE () noexcept {
1003
1003
auto const ncells = std::max (indexer1.numPts (), indexer2.numPts ());
1004
- for (std::uint64_t icell = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim. x )*gridDim.x ;
1004
+ for (std::uint64_t icell = std::uint64_t (MT )*blockIdx.x +threadIdx.x , stride = std::uint64_t (MT )*gridDim.x ;
1005
1005
icell < ncells; icell += stride) {
1006
1006
if (icell < indexer1.numPts ()) {
1007
1007
auto iv = indexer1.intVect (icell);
@@ -1034,7 +1034,7 @@ ParallelFor (Gpu::KernelInfo const&,
1034
1034
AMREX_LAUNCH_KERNEL (MT, ec.numBlocks , ec.numThreads , 0 , Gpu::gpuStream (),
1035
1035
[=] AMREX_GPU_DEVICE () noexcept {
1036
1036
auto const ncells = std::max ({indexer1.numPts (), indexer2.numPts (), indexer3.numPts ()});
1037
- for (std::uint64_t icell = std::uint64_t (blockDim. x )*blockIdx.x +threadIdx.x , stride = std::uint64_t (blockDim. x )*gridDim.x ;
1037
+ for (std::uint64_t icell = std::uint64_t (MT )*blockIdx.x +threadIdx.x , stride = std::uint64_t (MT )*gridDim.x ;
1038
1038
icell < ncells; icell += stride) {
1039
1039
if (icell < indexer1.numPts ()) {
1040
1040
auto iv = indexer1.intVect (icell);
0 commit comments