Skip to content

Commit a7ddaa9

Browse files
committed
fix vheloop to ensure minimum value of 1
Signed-off-by: Hosang Yoon <hosang.yoon@amd.com>
1 parent 36dc4b4 commit a7ddaa9

File tree

1 file changed

+6
-6
lines changed

1 file changed

+6
-6
lines changed

csrc/rocm/attention.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1839,9 +1839,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
18391839
VTOKENS_PER_LANE,
18401840
CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes
18411841
// minimum block size is 16
1842-
constexpr int VHELOOP =
1843-
HEAD_SIZE / 16 / NWARPS; // head_size distributed across warps; each wmma
1844-
// instr works on 16 head elements
1842+
constexpr int VHELOOP = DIVIDE_ROUND_UP(
1843+
(HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each
1844+
// wmma instr works on 16 head elements
18451845

18461846
int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];
18471847

@@ -2612,9 +2612,9 @@ __launch_bounds__(NUM_THREADS, 3) void paged_attention_ll4mi_QKV_mfma16_kernel(
26122612
VTOKENS_PER_LANE,
26132613
CONTIGUOUS_KV_ELEMS_16B_LOAD); // optimized for 16B fetches; assumes
26142614
// minimum block size is 16
2615-
constexpr int VHELOOP =
2616-
HEAD_SIZE / 16 / NWARPS; // head_size distributed across warps; each wmma
2617-
// instr works on 16 head elements
2615+
constexpr int VHELOOP = DIVIDE_ROUND_UP(
2616+
(HEAD_SIZE / 16), NWARPS); // head_size distributed across warps; each
2617+
// wmma instr works on 16 head elements
26182618

26192619
int vphysical_block_number[VTLOOP][VBLOCKS_PER_LANE];
26202620

0 commit comments

Comments
 (0)