Fix VHELOOP to ensure a minimum value of 1

hyoon1 · hyoon1 · commit a7ddaa9b926a · 2025-05-07T23:45:24.000-04:00
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu
@@ -1839,9 +1839,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
       VTOKENS_PER_LANE,
       CONTIGUOUS_KV_ELEMS_16B_LOAD);  // optimized for 16B fetches; assumes
                                       // minimum block size is 16
-  constexpr int VHELOOP =
-      HEAD_SIZE / 16 / NWARPS;  // head_size distributed across warps; each wmma
-                                // instr works on 16 head elements
+  constexpr int VHELOOP = DIVIDE_ROUND_UP(
+      (HEAD_SIZE / 16), NWARPS);  // head_size distributed across warps; each
+                                  // wmma instr works on 16 head elements
 
   int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];
 
@@ -2612,9 +2612,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
       VTOKENS_PER_LANE,
       CONTIGUOUS_KV_ELEMS_16B_LOAD);  // optimized for 16B fetches; assumes
                                       // minimum block size is 16
-  constexpr int VHELOOP =
-      HEAD_SIZE / 16 / NWARPS;  // head_size distributed across warps; each wmma
-                                // instr works on 16 head elements
+  constexpr int VHELOOP = DIVIDE_ROUND_UP(
+      (HEAD_SIZE / 16), NWARPS);  // head_size distributed across warps; each
+                                  // wmma instr works on 16 head elements
 
   int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];