Skip to content

Commit ca1a49d

Browse files
committed
Remove the convert-layout op for any layout if the user is a tt.store with a block pointer. It is always lower cost to store the value directly to the memory referred to by the block pointer, without a layout conversion.
Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
1 parent ed40670 commit ca1a49d

File tree

2 files changed

+24
-11
lines changed

2 files changed

+24
-11
lines changed

test/TritonIntelGPU/backward_combine_dpas_dot_layout.mlir

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -261,3 +261,27 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
261261
tt.return
262262
}
263263
}
264+
265+
// -----
266+
267+
// COM: Case 5:
268+
// COM: Checks that block encoding has been forwarded to the store op
269+
// COM: and the ttg.convert_layout operation has been removed
270+
// CHECK: #[[BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
271+
#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [1, 0]}>
272+
#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 4], order = [1, 0]}>
273+
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 16 : i32, "ttig.support_sg_2d_block"} {
274+
tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
275+
%c8_i32 = arith.constant 8 : i32
276+
%c64_i64 = arith.constant 64 : i64
277+
%c1_i64 = arith.constant 1 : i64
278+
%c256_i64 = arith.constant 256 : i64
279+
%cst = arith.constant dense<0.000000e+00> : tensor<64x256xf16, #blocked>
280+
%25 = ttg.convert_layout %cst : tensor<64x256xf16, #blocked> -> tensor<64x256xf16, #blocked1>
281+
// CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #[[BLOCKED]]>>
282+
%27 = tt.make_tensor_ptr %arg2, [%c256_i64, %c256_i64], [%c64_i64, %c1_i64], [%c8_i32, %c8_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #blocked1>>
283+
// CHECK: tt.store {{.*}}, {{.*}} {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #[[BLOCKED]]>>
284+
tt.store %27, %25 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<64x256xf16, #blocked1>>
285+
tt.return
286+
}
287+
}

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -826,20 +826,9 @@ bool LayoutPropagation::rewriteTensorPtrStoreOp(StoreOp storeOp) {
826826
tensorType.getElementType(), encoding);
827827
newPtrType = PointerType::get(tmpType, ptrType.getAddressSpace());
828828
} else {
829-
Attribute convertOpDstEncoding = convertOp.getType().getEncoding();
830829
RankedTensorType convertOpSrcType = convertOp.getSrc().getType();
831-
if (((!convertOpDstEncoding) ||
832-
isa<ttgi::DpasEncodingAttr>(convertOpDstEncoding)) ||
833-
(!convertOpSrcType ||
834-
!isa<ttgi::DpasEncodingAttr>(convertOpSrcType.getEncoding())))
835-
return false;
836830

837831
auto ptrType = cast<PointerType>(makeTensorPtrOp.getType());
838-
auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
839-
// If the output type of the MakeTensorPtrOp already has a
840-
// DPAS encoding, we do not forward the previous DPAS encoding.
841-
if (isa<ttgi::DpasEncodingAttr>(tensorType.getEncoding()))
842-
return false;
843832

844833
newPtrType = PointerType::get(convertOpSrcType, ptrType.getAddressSpace());
845834

0 commit comments

Comments
 (0)