Propagate layout to tt.advance

alexbaden · alexbaden · commit a915a9001831 · 2025-06-25T13:03:24.000Z
diff --git a/test/TritonIntelGPU/optimize-block-io-encoding.mlir b/test/TritonIntelGPU/optimize-block-io-encoding.mlir
@@ -60,7 +60,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.th
 
 // -----
 
-// COM: Test while loop / tt.advance before tt.load (TODO)
+// COM: Test while loop / nested tt.advance
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [32, 1], order = [1, 0]}>
 #mma = #ttig.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
 // CHECK-DAG: #[[$BLOCKED:.+]] = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 4], warpsPerCTA = [32, 1], order = [1, 0]}>
@@ -99,8 +99,8 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.th
         // CHECK: tt.dot {{.*}} : tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>> * tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>> -> tensor<256x256xf32, #[[$DPAS]]>
         %5 = tt.dot %4, %cstB, %cst, inputPrecision = tf32 : tensor<256x32xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>> * tensor<32x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<256x256xf32, #mma>
         %6 = ttg.convert_layout %5 : tensor<256x256xf32, #mma> -> tensor<256x256xf32, #blocked1>
-        // COM: TODO: support nested tt.advance
-        // %3 = tt.advance %a_ptr_crt, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #blocked1>>
+        // CHECK: tt.advance {{.*}} : <tensor<256x32xf16, #[[$SUBGROUP_2D_BLOCK]]>>
+        %7 = tt.advance %a_ptr_crt, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #blocked1>>
 
         // CHECK: scf.yield {{.*}} : !tt.ptr<tensor<256x32xf16, #[[$SUBGROUP_2D_BLOCK]]>>
         scf.yield %a_ptr_crt : !tt.ptr<tensor<256x32xf16, #blocked1>>
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeBlockIOEncoding.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeBlockIOEncoding.cpp
@@ -117,6 +117,12 @@ void rewriteTensorLayoutsForOp(Attribute encoding, Operation *op) {
       } else if (isa<scf::YieldOp>(op)) {
         auto vals = getTiedArgs(op->getParentOp(), use.getOperandNumber());
         updateEncoding(vals, EncodingInfo{encoding});
+      } else if (isa<AdvanceOp>(op)) {
+        // The operand will be updated when the MakeTensorPtr op result is
+        // updated. Make sure the result type matches.
+        for (auto result : op->getResults())
+          if (auto desc = dyn_cast<TypedValue<PointerType>>(result))
+            updateEncoding(desc, EncodingInfo{encoding});
       }
     }
 

Original file line number	Diff line number	Diff line change
`@@ -117,6 +117,12 @@ void rewriteTensorLayoutsForOp(Attribute encoding, Operation *op) {`
`117`	`117`	`} else if (isa<scf::YieldOp>(op)) {`
`118`	`118`	`auto vals = getTiedArgs(op->getParentOp(), use.getOperandNumber());`
`119`	`119`	`updateEncoding(vals, EncodingInfo{encoding});`
	`120`	`+ } else if (isa<AdvanceOp>(op)) {`
	`121`	`+ // The operand will be updated when the MakeTensorPtr op result is`
	`122`	`+ // updated. Make sure the result type matches.`
	`123`	`+ for (auto result : op->getResults())`
	`124`	`+ if (auto desc = dyn_cast<TypedValue<PointerType>>(result))`
	`125`	`+ updateEncoding(desc, EncodingInfo{encoding});`
`120`	`126`	`}`
`121`	`127`	`}`
`122`	`128`