From e0d6f022b32c23a1e4e75885ed7e08a2e806ea4b Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Fri, 7 Mar 2025 12:47:52 -0600
Subject: [PATCH 1/6] Initial commit (does not build)

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 217 +++++++++++++++++++++++
 1 file changed, 217 insertions(+)
diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index 3c971354783a..0b80ee1ffd03 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -1616,6 +1616,223 @@ class ConvertAtenAdaptivePoolOp : public OpConversionPattern<OpTy> {
 };
 } // namespace
 
+namespace {
+template <typename OpTy, typename PoolingOpTy, int Dim>
+class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
+public:
+  using OpConversionPattern<OpTy>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
+      return failure();
+
+    Location loc = op->getLoc();
+    const TypeConverter *typeConverter = this->getTypeConverter();
+    Value result = op.getResult();
+
+    uint64_t pooledHeight =
+        cast<ConstantIntOp>(op.getPooledHeight().getDefiningOp()).getValue();
+    uint64_t pooledWidth =
+        cast<ConstantIntOp>(op.getPooledWidth().getDefiningOp()).getValue();
+    uint64_t samplingRatio =
+        cast<ConstantIntOp>(op.getSamplingRatio().getDefiningOp()).getValue();
+    Value pooledH = op.getPooledHeight();
+    Value pooledW = op.getPooledWidth();
+    Value spatialScaleVal = op.getSpatialScale();
+    llvm::APFloat spatialScale =
+        cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
+    Value rois = op.getRois();
+    Value input = op.getInput();
+    // RankedTensorType inputType = input.getType();
+    Value offset =
+        rewriter.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.0));
+    Type resultType = cast<RankedTensorType>(result.getType());
+    Type resultElementType = resultType.getElementType();
+    if (!op.getAligned()) {
+      offset = rewriter.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.5));
+    }
+
+    Value lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
+    Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
+    Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+    SmallVector<Value> finalOutputShape = {ub0, ub1, pooledH, pooledW};
+    Value finalOutputTensor = rewriter.create<tensor::EmptyOp>(
+        loc, getAsOpFoldResult(finalOutputShape), resultElementType);
+    auto forLoop = rewriter.create<scf::ForOp>(
+        loc, lb, ub0, step, ValueRange{},
+        [&](OpBuilder &b1, Location loc, Value iv0, ValueRange args) {
+          auto forLoop = b1.create<scf::ForOp>(
+              loc, lb, ub1, step, ValueRange{},
+              [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
+                // Step 1: Extract bounds for region of interest (roi)
+                OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
+                OpFoldResult oneAttr = b.getI64IntegerAttr(1);
+                OpFoldResult twoAttr = b.getI64IntegerAttr(2);
+                OpFoldResult threeAttr = b.getI64IntegerAttr(3);
+                OpFoldResult fourAttr = b.getI64IntegerAttr(4);
+                OpFoldResult fiveAttr = b.getI64IntegerAttr(5);
+                // SmallVector<Value> offsetVals{iv0, zeroAttr};
+                // SmallVector<OpFoldResult> sizeVals{oneAttr, fiveAttr};
+                SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
+                                                     oneAttr};
+                // Value extractRoiBounds = b.create<tensor::ExtractSliceOp>(
+                //     loc, rois, offsetVals, sizeVals, strideVals);
+                Value lowY = b.create<tensor::ExtractOp>(
+                    loc, rois, ValueRange{iv0, oneAttr});
+                Value lowX = b.create<tensor::ExtractOp>(
+                    loc, rois, ValueRange{iv0, twoAttr});
+                Value highY = b.create<tensor::ExtractOp>(
+                    loc, rois, ValueRange{iv0, threeAttr});
+                Value highX = b.create<tensor::ExtractOp>(
+                    loc, rois, ValueRange{iv0, fourAttr});
+
+                lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
+                lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
+                highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
+                highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
+
+                lowY = b.create<arith::SubFOp>(loc, lowY, offset);
+                lowX = b.create<arith::SubFOp>(loc, lowX, offset);
+                highY = b.create<arith::SubFOp>(loc, highY, offset);
+                highX = b.create<arith::SubFOp>(loc, highX, offset);
+
+                // Step 2: Extract region of interest using bounds
+                Value lowY_int = b.create<math::FloorOp>(loc, lowY);
+                Value lowX_int = b.create<math::FloorOp>(loc, lowX);
+                Value highY_int = b.create<math::CeilOp>(loc, highY);
+                Value highX_int = b.create<math::CeilOp>(loc, highX);
+                lowY_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowY_int);
+                lowX_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowX_int);
+                highY_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highY_int);
+                highX_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highX_int);
+
+                Value roiHeight =
+                    b.create<arith::SubIOp>(loc, highY_int, lowY_int);
+                Value roiWidth =
+                    b.create<arith::SubIOp>(loc, highX_int, lowX_int);
+
+                SmallVector<Value> roiOffsetVals{zeroAttr, iv1, lowY_int,
+                                                 lowX_int};
+                SmallVector<Value> roiSizeVals{oneAttr, oneAttr, roiHeight,
+                                               roiWidth};
+
+                Value extractRoi = b.create<tensor::ExtractSliceOp>(
+                    loc, input, roiOffsetVals, roiSizeVals, strideVals);
+
+                // Step 3: Perform bilinear interpolation over roi
+                Value roiBinH = b.create<arith::SubOp>(loc, highY, lowY);
+                Value roiBinW = b.create<arith::SubOp>(loc, highX, lowX);
+                Value scaleH = b.create<arith::DivOp>(loc, roiBinH, pooledH);
+                Value scaleW = b.create<arith::DivOp>(loc, roiBinW, pooledW);
+                scaleH = b.create<arith::CeilOp>(loc, scaleH);
+                scaleW = b.create<arith::CeilOp>(loc, scaleW);
+                scaleH = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleH);
+                scaleW = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleW);
+
+                Value roiSampleHeight =
+                    b.create<arith::MulIOp>(loc, pooledH, scaleH);
+                Value roiSampleWidth =
+                    b.create<arith::MulIOp>(loc, pooledW, scaleW);
+
+                SmallVector<Value> outputSizeIntValues = {roiSampleHeight,
+                                                          roiSampleWidth};
+                SmallVector<Value> dims =
+                    getTensorSizesUntilDim(b, loc, extractRoi, 1);
+                for (unsigned i = 2; i < inputRank; i++) {
+                  dims.push_back(
+                      castIntToIndex(b, loc, outputSizeIntValues[i - 2]));
+                }
+                SmallVector<Value> inputSizes;
+                auto inputType = cast<RankedTensorType>(extractRoi.getType());
+                auto inputRank = inputType.getRank();
+                for (unsigned i = 2; i < inputRank; i++) {
+                  Value inputSize = getDimOp(b, loc, extractRoi, i);
+                  inputSizes.push_back(b.create<arith::IndexCastOp>(
+                      loc, b.getIntegerType(64), roiSizeVals[i]));
+                }
+                Value outTensor = b.create<tensor::EmptyOp>(
+                    loc, getAsOpFoldResult(dims), inputType.getElementType());
+                AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
+                SmallVector<utils::IteratorType> iteratorTypes(
+                    inputRank, utils::IteratorType::parallel);
+                Value bilinearInterpolatedRoi =
+                    b.create<linalg::GenericOp>(
+                         loc, outTensor.getType(), ValueRange{}, outTensor,
+                         /*indexingMaps=*/idMap,
+                         /*iteratorTypes=*/iteratorTypes,
+                         [&](OpBuilder &b, Location loc, ValueRange args) {
+                           Value retVal = bilinearInterpolate(
+                               b, op, loc, outputSizeIntValues, extractRoi,
+                               inputSizes, ValueRange{}, "bilinear");
+                           b.create<linalg::YieldOp>(loc, retVal);
+                         })
+                        .getResult(0);
+
+                // Step 4: Sum pool over interpolated values
+                Value sumPool, paddedInput;
+                SmallVector<Value> kernelSizeIntValues = {oneAttr, oneAttr,
+                                                          scaleH, scaleW};
+                SmallVector<Value, 2> strideInts = {scaleH, scaleW};
+                SmallVector<Value, 2> paddingInts = {zeroAttr, zeroAttr};
+                SmallVector<Value, 2> dilationInts(oneAttr, 2);
+                SmallVector<Value, 4> outTensorShape;
+                if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
+                        op, b, self, /*supportNonFPInput=*/true, false,
+                        /*dimensionality=*/2, kernelSizeIntValues, strideInts,
+                        paddingInts, dilationInts,
+                        b.getZeroAttr(resultElementType), outTensorShape,
+                        paddedInput, sumPool)))
+                  return b.notifyMatchFailure(op, "unable to compute sumpool");
+
+                // Step 5: elementwise division by number of sampling points
+                // to compute avg pool
+                Value outputTensor = b.create<tensor::EmptyOp>(
+                    loc, getAsOpFoldResult(outTensorShape), resultElementType);
+                Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
+                Value avgPool =
+                    b.create<linalg::GenericOp>(
+                         loc, outputTensor.getType(), sumPool, outputTensor,
+                         /*indexingMaps=*/indexingMapsAvg,
+                         /*iteratorTypes=*/iteratorTypesAvg,
+                         [&](OpBuilder &b, Location loc, ValueRange args) {
+                           Value avg;
+                           if (isa<mlir::IntegerType>(resultElementType))
+                             avg = b.create<arith::DivSIOp>(loc, args[0],
+                                                            divisor);
+                           else if (isa<mlir::FloatType>(resultElementType))
+                             avg =
+                                 b.create<arith::DivFOp>(loc, args[0], divisor);
+                           b.create<linalg::YieldOp>(loc, avg);
+                         })
+                        .getResult(0);
+
+                SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
+                SmallVector<OpFoldResult> finalOffsets = {
+                    getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
+                    zeroAttr};
+                SmallVector<OpFoldResult> finalSizes = {
+                    oneAttr, oneAttr, getAsOpFoldResult(pooledH),
+                    getAsOpFoldResult(pooledW)};
+                SmallVector<OpFoldResult> diagStrides(inputRank, oneAttr);
+                finalOutputTensor = b.create<tensor::InsertSliceOp>(
+                    loc, finalOutputTensor, avgPool, finalOffsets, finalSizes,
+                    finalStrides);
+              });
+        });
+
+    Type resultType = typeConverter->convertType(op.getType());
+    b.replaceOp(op, finalOutputTensor);
+    return success();
+  }
+};
+} // namespace
+
 void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target) {

From 31140fb5760076bcee6fbea66ca6414684877e7c Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Mon, 10 Mar 2025 01:06:38 -0500
Subject: [PATCH 2/6] Add reworked ConvertRoiAlignOp, SCF/Math includes, and pattern registration

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 412 +++++++++++++++++++++++
 1 file changed, 412 insertions(+)

diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index 0b80ee1ffd03..ce41c25ddfc3 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -18,6 +18,8 @@
 #include "torch-mlir/Conversion/Utils/Utils.h"
 #include "torch-mlir/Dialect/Torch/IR/TorchOps.h"
 #include "torch-mlir/Dialect/Torch/Utils/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Math/IR/Math.h"
 #include <optional>
 
 using namespace mlir;
@@ -1833,6 +1835,414 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
 };
 } // namespace
 
+namespace {
+template <typename OpTy>
+class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
+public:
+  using OpConversionPattern<OpTy>::OpConversionPattern;
+
+  static SmallVector<Value> coordinateTransform(
+      OpBuilder &b, OpTy op, Location loc, SmallVector<Value> outputSizes,
+      Value input, SmallVector<Value> inputSizes,
+      SmallVector<Value> scaleValues, std::string coordStr,
+      bool alignCornersBool, SmallVector<Value> indices, bool clip) {
+
+    unsigned dimOffset = 2;
+    auto inputType = cast<RankedTensorType>(input.getType());
+    auto inputRank = inputType.getRank();
+
+    Value cstOneFloat =
+        b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(1.0));
+    Value cstHalf = b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.5));
+    Value zero = b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.0));
+
+    SmallVector<Value> proj;
+    for (unsigned i = 0; i < inputRank - dimOffset; i++) {
+      // length_original
+      Value inputFP =
+          b.create<arith::SIToFPOp>(loc, b.getF32Type(), inputSizes[i]);
+      // length_resized
+      Value outputSizeFP =
+          b.create<arith::SIToFPOp>(loc, b.getF32Type(), outputSizes[i]);
+      // scale = length_resized/length_original
+      Value scale;
+      if (alignCornersBool) {
+        // x_original = x_resized * (length_original - 1) / (length_resized - 1)
+        Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
+        Value outputSizeSubOne =
+            b.create<arith::SubFOp>(loc, outputSizeFP, cstOneFloat);
+        Value cmp = b.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UEQ,
+                                            outputSizeSubOne, zero);
+        scale = b.create<arith::DivFOp>(loc, inputSubOne, outputSizeSubOne);
+        scale = b.create<arith::SelectOp>(loc, cmp, zero, scale);
+        coordStr = "_align_corners";
+      } else if (scaleValues.empty())
+        scale = b.create<arith::DivFOp>(loc, outputSizeFP, inputFP);
+      else
+        scale = scaleValues[i];
+      // y_resized
+      Value outInt = b.create<arith::IndexCastOp>(loc, b.getI64Type(),
+                                                  indices[i + dimOffset]);
+      Value outFP = b.create<arith::SIToFPOp>(loc, b.getF32Type(), outInt);
+      Value preClip;
+      if (coordStr == "_align_corners") {
+        preClip = b.create<arith::MulFOp>(loc, outFP, scale);
+      }
+      if (coordStr == "_asymmetric") {
+        preClip = b.create<arith::DivFOp>(loc, outFP, scale);
+      }
+      if (coordStr == "_pytorch_half_pixel" || coordStr == "" ||
+          coordStr == "_half_pixel_symmetric") {
+        // half-pixel modes
+        // y_resized + 0.5
+        Value outPlusHalf = b.create<arith::AddFOp>(loc, outFP, cstHalf);
+        // (y_resized + 0.5) / scale
+        Value outDivScale = b.create<arith::DivFOp>(loc, outPlusHalf, scale);
+        // _ - 0.5
+        preClip = b.create<arith::SubFOp>(loc, outDivScale, cstHalf);
+      }
+      // for half_pixel_symmetric, need to compute offset from raw scales
+      if (coordStr == "_half_pixel_symmetric" && !scaleValues.empty()) {
+        Value outputSizeFromScale =
+            b.create<arith::MulFOp>(loc, inputFP, scale);
+        Value adjustment =
+            b.create<arith::DivFOp>(loc, outputSizeFP, outputSizeFromScale);
+        Value cstTwo = b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(2.0));
+        Value center = b.create<arith::DivFOp>(loc, inputFP, cstTwo);
+        Value oneMAdjustment =
+            b.create<arith::SubFOp>(loc, cstOneFloat, adjustment);
+        Value offset = b.create<arith::MulFOp>(loc, center, oneMAdjustment);
+        preClip = b.create<arith::AddFOp>(loc, offset, preClip);
+      }
+      // for pytorch half pixel , special case for length_resized == 1:
+      if (coordStr == "_pytorch_half_pixel") {
+        Value cmp = b.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UEQ,
+                                            outputSizeFP, cstOneFloat);
+        preClip = b.create<arith::SelectOp>(loc, cmp, zero, preClip);
+      }
+      if (clip) {
+        // preClip is the fp position inside the input image to extract from.
+        // clip to [0,inf)
+        Value max = b.create<arith::MaximumFOp>(loc, preClip, zero);
+        Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
+        // clip to [0,length_original - 1].
+        // proj is properly within the input image.
+        proj.push_back(b.create<arith::MinimumFOp>(loc, max, inputSubOne));
+      } else {
+        proj.push_back(preClip);
+      }
+    }
+    return proj;
+  }
+
+  static Value bilinearInterpolate(OpBuilder &b, OpTy op, Location loc,
+                                   SmallVector<Value> outputSizes, Value input,
+                                   SmallVector<Value> inputSizes,
+                                   SmallVector<Value> scaleValues,
+                                   std::string coordStr) {
+    unsigned dimOffset = 2;
+    auto inputType = cast<RankedTensorType>(input.getType());
+    auto inputRank = inputType.getRank();
+
+    Value cstOneFloat =
+        b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(1.0));
+
+    SmallVector<Value> indices;
+    for (unsigned i = 0; i < inputRank; i++) {
+      indices.push_back(b.create<linalg::IndexOp>(loc, i));
+    }
+
+    SmallVector<Value> proj, high, low, highFP, lowFP;
+    proj = coordinateTransform(b, op, loc, outputSizes, input, inputSizes,
+                               scaleValues, coordStr, false, indices, true);
+    for (unsigned i = 0; i < inputRank - dimOffset; i++) {
+      // length_original
+      Value inputFP =
+          b.create<arith::SIToFPOp>(loc, b.getF32Type(), inputSizes[i]);
+      Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
+
+      // for bilinear interpolation, we look for the nearest indices below and
+      // above proj
+      lowFP.push_back(b.create<math::FloorOp>(loc, proj[i]));
+      Value projPlusOne = b.create<arith::AddFOp>(loc, cstOneFloat, proj[i]);
+      highFP.push_back(b.create<math::FloorOp>(loc, projPlusOne));
+
+      Value lowInt = b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowFP[i]);
+      low.push_back(
+          b.create<arith::IndexCastOp>(loc, b.getIndexType(), lowInt));
+
+      // highFP could be out-of-bounds, so make sure to clip it down before
+      // extracting. If highFP actually gets clipped here, then high[i] will
+      // extract at the last pixel, but will treat it as if it were extracted
+      // from one further position when computing the interpolation weights.
+      Value highExtract =
+          b.create<arith::MinimumFOp>(loc, projPlusOne, inputSubOne);
+      highExtract = b.create<arith::FPToSIOp>(loc, b.getI64Type(), highExtract);
+      high.push_back(
+          b.create<arith::IndexCastOp>(loc, b.getIndexType(), highExtract));
+    }
+
+    indices[dimOffset] = low[0];
+    indices[dimOffset + 1] = low[1];
+    Value p00 = b.create<tensor::ExtractOp>(loc, input, indices);
+
+    indices[dimOffset] = low[0];
+    indices[dimOffset + 1] = high[1];
+    Value p01 = b.create<tensor::ExtractOp>(loc, input, indices);
+
+    indices[dimOffset] = high[0];
+    indices[dimOffset + 1] = low[1];
+    Value p10 = b.create<tensor::ExtractOp>(loc, input, indices);
+
+    indices[dimOffset] = high[0];
+    indices[dimOffset + 1] = high[1];
+    Value p11 = b.create<tensor::ExtractOp>(loc, input, indices);
+
+    // Let Aij := area rect((yProj,xProj) <-> (y_i*,x_j*)),
+    // where i* = i+1 mod 2 and x_0 = xLow, x_1 = xHigh etc.
+    // We interpolate via the weighted average of pij by weights Aij
+    // the formula is retval = Sum(pij*Aij for i and j in range(2))
+    // Note: we do not need to divide by total rect area == 1
+
+    // lengths : Aij == dyi*dxj
+    Value dy0 = b.create<arith::SubFOp>(loc, highFP[0], proj[0]);
+    Value dy1 = b.create<arith::SubFOp>(loc, proj[0], lowFP[0]);
+    Value dx0 = b.create<arith::SubFOp>(loc, highFP[1], proj[1]);
+    Value dx1 = b.create<arith::SubFOp>(loc, proj[1], lowFP[1]);
+
+    // left = A00*p00 + A01*p01 = dy0(dx0p00 + dx1p01)
+    Value dx0p00 = b.create<arith::MulFOp>(loc, dx0, p00);
+    Value dx1p01 = b.create<arith::MulFOp>(loc, dx1, p01);
+    Value sum = b.create<arith::AddFOp>(loc, dx0p00, dx1p01);
+    Value left = b.create<arith::MulFOp>(loc, dy0, sum);
+    // right = A10*p10 + A11*p11 = dy1(dx0p10 + dx1p11)
+    Value dx0p10 = b.create<arith::MulFOp>(loc, dx0, p10);
+    Value dx1p11 = b.create<arith::MulFOp>(loc, dx1, p11);
+    sum = b.create<arith::AddFOp>(loc, dx0p10, dx1p11);
+    Value right = b.create<arith::MulFOp>(loc, dy1, sum);
+
+    return b.create<arith::AddFOp>(loc, left, right);
+  }
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
+      return failure();
+
+    Location loc = op->getLoc();
+    Value result = op.getResult();
+
+    uint64_t samplingRatio =
+        cast<ConstantIntOp>(op.getSamplingRatio().getDefiningOp()).getValue();
+    int64_t samplingRatioInt = static_cast<int64_t>(samplingRatio);
+    Value pooledH = op.getPooledHeight();
+    Value pooledW = op.getPooledWidth();
+    Value spatialScaleVal = op.getSpatialScale();
+    llvm::APFloat spatialScale =
+        cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
+    Value rois = op.getRois();
+    Value input = op.getInput();
+    unsigned inputRank = cast<RankedTensorType>(input.getType()).getRank();
+    Value offset =
+        rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(0.0));
+    RankedTensorType resultType = cast<RankedTensorType>(result.getType());
+    Type resultElementType = resultType.getElementType();
+    if (!op.getAligned()) {
+      offset = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getF32FloatAttr(0.5));
+    }
+
+    Value lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+    Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
+    Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
+    Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+    SmallVector<Value> finalOutputShape = {ub0, ub1, pooledH, pooledW};
+    Value finalOutputTensor = rewriter.create<tensor::EmptyOp>(
+        loc, getAsOpFoldResult(finalOutputShape), resultElementType);
+    rewriter.create<scf::ForOp>(
+        loc, lb, ub0, step, ValueRange{},
+        [&](OpBuilder &b, Location loc, Value iv0, ValueRange args) {
+          b.create<scf::ForOp>(
+              loc, lb, ub1, step, ValueRange{},
+              [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
+                // Step 1: Extract bounds for region of interest (roi)
+                OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
+                OpFoldResult oneAttr = b.getI64IntegerAttr(1);
+
+                Value cstZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+                Value cstOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+                Value cstTwo = rewriter.create<arith::ConstantIndexOp>(loc, 2);
+                Value cstThree =
+                    rewriter.create<arith::ConstantIndexOp>(loc, 3);
+                Value cstFour = rewriter.create<arith::ConstantIndexOp>(loc, 4);
+                SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
+                                                     oneAttr};
+                SmallVector<Value> lowYIndices = {iv0, cstOne};
+                Value lowY =
+                    b.create<tensor::ExtractOp>(loc, rois, lowYIndices);
+                SmallVector<Value> lowXIndices = {iv0, cstTwo};
+                Value lowX =
+                    b.create<tensor::ExtractOp>(loc, rois, lowXIndices);
+                SmallVector<Value> highYIndices = {iv0, cstThree};
+                Value highY =
+                    b.create<tensor::ExtractOp>(loc, rois, highYIndices);
+                SmallVector<Value> highXIndices = {iv0, cstFour};
+                Value highX =
+                    b.create<tensor::ExtractOp>(loc, rois, highXIndices);
+
+                lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
+                lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
+                highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
+                highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
+
+                lowY = b.create<arith::SubFOp>(loc, lowY, offset);
+                lowX = b.create<arith::SubFOp>(loc, lowX, offset);
+                highY = b.create<arith::SubFOp>(loc, highY, offset);
+                highX = b.create<arith::SubFOp>(loc, highX, offset);
+
+                // Step 2: Extract region of interest using bounds
+                Value lowY_int = b.create<math::FloorOp>(loc, lowY);
+                Value lowX_int = b.create<math::FloorOp>(loc, lowX);
+                Value highY_int = b.create<math::CeilOp>(loc, highY);
+                Value highX_int = b.create<math::CeilOp>(loc, highX);
+                lowY_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowY_int);
+                lowX_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowX_int);
+                highY_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highY_int);
+                highX_int =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highX_int);
+
+                Value roiHeight =
+                    b.create<arith::SubIOp>(loc, highY_int, lowY_int);
+                Value roiWidth =
+                    b.create<arith::SubIOp>(loc, highX_int, lowX_int);
+
+                SmallVector<OpFoldResult> roiOffsetVals = {
+                    getAsOpFoldResult(cstZero), getAsOpFoldResult(iv1),
+                    getAsOpFoldResult(lowY_int), getAsOpFoldResult(lowX_int)};
+                SmallVector<Value> roiSizeVals = {cstOne, cstOne, roiHeight,
+                                                  roiWidth};
+
+                Value extractRoi = b.create<tensor::ExtractSliceOp>(
+                    loc, input, ValueRange{cstZero, iv1, lowY_int, lowX_int},
+                    ValueRange{cstOne, cstOne, roiHeight, roiWidth},
+                    ValueRange{cstOne, cstOne, cstOne, cstOne});
+
+                // Step 3: Perform bilinear interpolation over roi
+                Value roiBinH = b.create<arith::SubFOp>(loc, highY, lowY);
+                Value roiBinW = b.create<arith::SubFOp>(loc, highX, lowX);
+                Value scaleH = b.create<arith::DivFOp>(loc, roiBinH, pooledH);
+                Value scaleW = b.create<arith::DivFOp>(loc, roiBinW, pooledW);
+                scaleH = b.create<math::CeilOp>(loc, scaleH);
+                scaleW = b.create<math::CeilOp>(loc, scaleW);
+                scaleH = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleH);
+                scaleW = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleW);
+                if (samplingRatio > 0) {
+                  scaleH = b.create<arith::ConstantOp>(
+                      loc, rewriter.getI64IntegerAttr(samplingRatio));
+                  scaleW = b.create<arith::ConstantOp>(
+                      loc, rewriter.getI64IntegerAttr(samplingRatio));
+                }
+
+                Value roiSampleHeight =
+                    b.create<arith::MulIOp>(loc, pooledH, scaleH);
+                Value roiSampleWidth =
+                    b.create<arith::MulIOp>(loc, pooledW, scaleW);
+
+                SmallVector<Value> outputSizeIntValues = {roiSampleHeight,
+                                                          roiSampleWidth};
+                SmallVector<Value> dims =
+                    getTensorSizesUntilDim(b, loc, extractRoi, 1);
+                for (unsigned i = 2; i < inputRank; i++) {
+                  dims.push_back(
+                      castIntToIndex(b, loc, outputSizeIntValues[i - 2]));
+                }
+                SmallVector<Value> inputSizes;
+                auto inputType = cast<RankedTensorType>(extractRoi.getType());
+                auto inputRank = inputType.getRank();
+                for (unsigned i = 2; i < inputRank; i++) {
+                  inputSizes.push_back(b.create<arith::IndexCastOp>(
+                      loc, b.getIntegerType(64), roiSizeVals[i]));
+                }
+                Value outTensor = b.create<tensor::EmptyOp>(
+                    loc, getAsOpFoldResult(dims), inputType.getElementType());
+                AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
+                SmallVector<utils::IteratorType> iteratorTypes(
+                    inputRank, utils::IteratorType::parallel);
+                Value bilinearInterpolatedRoi =
+                    b.create<linalg::GenericOp>(
+                         loc, outTensor.getType(), ValueRange{}, outTensor,
+                         /*indexingMaps=*/idMap,
+                         /*iteratorTypes=*/iteratorTypes,
+                         [&](OpBuilder &b, Location loc, ValueRange args) {
+                           Value retVal = bilinearInterpolate(
+                               b, op, loc, outputSizeIntValues, extractRoi,
+                               inputSizes, ValueRange{}, "bilinear");
+                           b.create<linalg::YieldOp>(loc, retVal);
+                         })
+                        .getResult(0);
+
+                // Step 4: Sum pool over interpolated values
+                Value sumPool, paddedInput;
+                SmallVector<Value> kernelSizeIntValues = {cstOne, cstOne,
+                                                          scaleH, scaleW};
+                SmallVector<int64_t, 2> strideInts = {samplingRatioInt,
+                                                      samplingRatioInt};
+                SmallVector<int64_t, 2> paddingInts = {0, 0};
+                SmallVector<int64_t, 2> dilationInts(2, 1);
+                SmallVector<Value, 4> outTensorShape;
+                if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
+                        op, rewriter, bilinearInterpolatedRoi,
+                        /*supportNonFPInput=*/true, false,
+                        /*dimensionality=*/2, kernelSizeIntValues, strideInts,
+                        paddingInts, dilationInts,
+                        b.getZeroAttr(resultElementType), outTensorShape,
+                        paddedInput, sumPool)))
+                  op.emitError("unable to compute sumpool");
+
+                // Step 5: elementwise division by number of sampling points
+                // to compute avg pool
+                Value outputTensor = b.create<tensor::EmptyOp>(
+                    loc, getAsOpFoldResult(outTensorShape), resultElementType);
+                Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
+                Value avgPool =
+                    b.create<linalg::GenericOp>(
+                         loc, outputTensor.getType(), sumPool, outputTensor,
+                         /*indexingMaps=*/idMap,
+                         /*iteratorTypes=*/iteratorTypes,
+                         [&](OpBuilder &b, Location loc, ValueRange args) {
+                           Value avg;
+                           if (isa<mlir::IntegerType>(resultElementType))
+                             avg = b.create<arith::DivSIOp>(loc, args[0],
+                                                            divisor);
+                           else if (isa<mlir::FloatType>(resultElementType))
+                             avg =
+                                 b.create<arith::DivFOp>(loc, args[0], divisor);
+                           b.create<linalg::YieldOp>(loc, avg);
+                         })
+                        .getResult(0);
+
+                SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
+                SmallVector<OpFoldResult> finalOffsets = {
+                    getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
+                    zeroAttr};
+                SmallVector<OpFoldResult> finalSizes = {
+                    oneAttr, oneAttr, getAsOpFoldResult(pooledH),
+                    getAsOpFoldResult(pooledW)};
+                SmallVector<OpFoldResult> diagStrides(inputRank, oneAttr);
+                finalOutputTensor = b.create<tensor::InsertSliceOp>(
+                    loc, finalOutputTensor, avgPool, finalOffsets, finalSizes,
+                    finalStrides);
+              });
+        });
+    rewriter.replaceOp(op, finalOutputTensor);
+    return success();
+  }
+};
+} // namespace
+
 void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
     TypeConverter &typeConverter, RewritePatternSet &patterns,
     ConversionTarget &target) {
@@ -1882,4 +2292,6 @@ void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
       typeConverter, context);
   patterns.add<ConvertAtenAdaptivePoolOp<AtenAdaptiveMaxPool3dOp>>(
       typeConverter, context);
+  patterns.add<ConvertRoiAlignOp<Torch::TorchvisionRoiAlignOp>>(typeConverter,
+                                                                context);
 }

From 059c443ebdea9d232b612730c1f21fc0d0e11bea Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Mon, 10 Mar 2025 18:10:25 -0500
Subject: [PATCH 3/6] Add temporary debug statements (remove later)

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 299 ++++-------------------
 1 file changed, 51 insertions(+), 248 deletions(-)

diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index ce41c25ddfc3..ea9c2e07e5fd 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -20,6 +20,7 @@
 #include "torch-mlir/Dialect/Torch/Utils/Utils.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "llvm/Support/Debug.h"
 #include <optional>
 
 using namespace mlir;
@@ -1616,229 +1617,9 @@ class ConvertAtenAdaptivePoolOp : public OpConversionPattern<OpTy> {
     return success();
   }
 };
-} // namespace
-
-namespace {
-template <typename OpTy, typename PoolingOpTy, int Dim>
-class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
-public:
-  using OpConversionPattern<OpTy>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
-      return failure();
-
-    Location loc = op->getLoc();
-    const TypeConverter *typeConverter = this->getTypeConverter();
-    Value result = op.getResult();
-
-    uint64_t pooledHeight =
-        cast<ConstantIntOp>(op.getPooledHeight().getDefiningOp()).getValue();
-    uint64_t pooledWidth =
-        cast<ConstantIntOp>(op.getPooledWidth().getDefiningOp()).getValue();
-    uint64_t samplingRatio =
-        cast<ConstantIntOp>(op.getSamplingRatio().getDefiningOp()).getValue();
-    Value pooledH = op.getPooledHeight();
-    Value pooledW = op.getPooledWidth();
-    Value spatialScaleVal = op.getSpatialScale();
-    llvm::APFloat spatialScale =
-        cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
-    Value rois = op.getRois();
-    Value input = op.getInput();
-    // RankedTensorType inputType = input.getType();
-    Value offset =
-        rewriter.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.0));
-    Type resultType = cast<RankedTensorType>(result.getType());
-    Type resultElementType = resultType.getElementType();
-    if (!op.getAligned()) {
-      offset = rewriter.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.5));
-    }
-
-    Value lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-    Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
-    Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
-    Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    SmallVector<Value> finalOutputShape = {ub0, ub1, pooledH, pooledW};
-    Value finalOutputTensor = rewriter.create<tensor::EmptyOp>(
-        loc, getAsOpFoldResult(finalOutputShape), resultElementType);
-    auto forLoop = rewriter.create<scf::ForOp>(
-        loc, lb, ub0, step, ValueRange{},
-        [&](OpBuilder &b1, Location loc, Value iv0, ValueRange args) {
-          auto forLoop = b1.create<scf::ForOp>(
-              loc, lb, ub1, step, ValueRange{},
-              [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
-                // Step 1: Extract bounds for region of interest (roi)
-                OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
-                OpFoldResult oneAttr = b.getI64IntegerAttr(1);
-                OpFoldResult twoAttr = b.getI64IntegerAttr(2);
-                OpFoldResult threeAttr = b.getI64IntegerAttr(3);
-                OpFoldResult fourAttr = b.getI64IntegerAttr(4);
-                OpFoldResult fiveAttr = b.getI64IntegerAttr(5);
-                // SmallVector<Value> offsetVals{iv0, zeroAttr};
-                // SmallVector<OpFoldResult> sizeVals{oneAttr, fiveAttr};
-                SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
-                                                     oneAttr};
-                // Value extractRoiBounds = b.create<tensor::ExtractSliceOp>(
-                //     loc, rois, offsetVals, sizeVals, strideVals);
-                Value lowY = b.create<tensor::ExtractOp>(
-                    loc, rois, ValueRange{iv0, oneAttr});
-                Value lowX = b.create<tensor::ExtractOp>(
-                    loc, rois, ValueRange{iv0, twoAttr});
-                Value highY = b.create<tensor::ExtractOp>(
-                    loc, rois, ValueRange{iv0, threeAttr});
-                Value highX = b.create<tensor::ExtractOp>(
-                    loc, rois, ValueRange{iv0, fourAttr});
-
-                lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
-                lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
-                highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
-                highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
-
-                lowY = b.create<arith::SubFOp>(loc, lowY, offset);
-                lowX = b.create<arith::SubFOp>(loc, lowX, offset);
-                highY = b.create<arith::SubFOp>(loc, highY, offset);
-                highX = b.create<arith::SubFOp>(loc, highX, offset);
-
-                // Step 2: Extract region of interest using bounds
-                Value lowY_int = b.create<math::FloorOp>(loc, lowY);
-                Value lowX_int = b.create<math::FloorOp>(loc, lowX);
-                Value highY_int = b.create<math::CeilOp>(loc, highY);
-                Value highX_int = b.create<math::CeilOp>(loc, highX);
-                lowY_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowY_int);
-                lowX_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowX_int);
-                highY_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highY_int);
-                highX_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highX_int);
-
-                Value roiHeight =
-                    b.create<arith::SubIOp>(loc, highY_int, lowY_int);
-                Value roiWidth =
-                    b.create<arith::SubIOp>(loc, highX_int, lowX_int);
-
-                SmallVector<Value> roiOffsetVals{zeroAttr, iv1, lowY_int,
-                                                 lowX_int};
-                SmallVector<Value> roiSizeVals{oneAttr, oneAttr, roiHeight,
-                                               roiWidth};
-
-                Value extractRoi = b.create<tensor::ExtractSliceOp>(
-                    loc, input, roiOffsetVals, roiSizeVals, strideVals);
-
-                // Step 3: Perform bilinear interpolation over roi
-                Value roiBinH = b.create<arith::SubOp>(loc, highY, lowY);
-                Value roiBinW = b.create<arith::SubOp>(loc, highX, lowX);
-                Value scaleH = b.create<arith::DivOp>(loc, roiBinH, pooledH);
-                Value scaleW = b.create<arith::DivOp>(loc, roiBinW, pooledW);
-                scaleH = b.create<arith::CeilOp>(loc, scaleH);
-                scaleW = b.create<arith::CeilOp>(loc, scaleW);
-                scaleH = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleH);
-                scaleW = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleW);
-
-                Value roiSampleHeight =
-                    b.create<arith::MulIOp>(loc, pooledH, scaleH);
-                Value roiSampleWidth =
-                    b.create<arith::MulIOp>(loc, pooledW, scaleW);
-
-                SmallVector<Value> outputSizeIntValues = {roiSampleHeight,
-                                                          roiSampleWidth};
-                SmallVector<Value> dims =
-                    getTensorSizesUntilDim(b, loc, extractRoi, 1);
-                for (unsigned i = 2; i < inputRank; i++) {
-                  dims.push_back(
-                      castIntToIndex(b, loc, outputSizeIntValues[i - 2]));
-                }
-                SmallVector<Value> inputSizes;
-                auto inputType = cast<RankedTensorType>(extractRoi.getType());
-                auto inputRank = inputType.getRank();
-                for (unsigned i = 2; i < inputRank; i++) {
-                  Value inputSize = getDimOp(b, loc, extractRoi, i);
-                  inputSizes.push_back(b.create<arith::IndexCastOp>(
-                      loc, b.getIntegerType(64), roiSizeVals[i]));
-                }
-                Value outTensor = b.create<tensor::EmptyOp>(
-                    loc, getAsOpFoldResult(dims), inputType.getElementType());
-                AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
-                SmallVector<utils::IteratorType> iteratorTypes(
-                    inputRank, utils::IteratorType::parallel);
-                Value bilinearInterpolatedRoi =
-                    b.create<linalg::GenericOp>(
-                         loc, outTensor.getType(), ValueRange{}, outTensor,
-                         /*indexingMaps=*/idMap,
-                         /*iteratorTypes=*/iteratorTypes,
-                         [&](OpBuilder &b, Location loc, ValueRange args) {
-                           Value retVal = bilinearInterpolate(
-                               b, op, loc, outputSizeIntValues, extractRoi,
-                               inputSizes, ValueRange{}, "bilinear");
-                           b.create<linalg::YieldOp>(loc, retVal);
-                         })
-                        .getResult(0);
-
-                // Step 4: Sum pool over interpolated values
-                Value sumPool, paddedInput;
-                SmallVector<Value> kernelSizeIntValues = {oneAttr, oneAttr,
-                                                          scaleH, scaleW};
-                SmallVector<Value, 2> strideInts = {scaleH, scaleW};
-                SmallVector<Value, 2> paddingInts = {zeroAttr, zeroAttr};
-                SmallVector<Value, 2> dilationInts(oneAttr, 2);
-                SmallVector<Value, 4> outTensorShape;
-                if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
-                        op, b, self, /*supportNonFPInput=*/true, false,
-                        /*dimensionality=*/2, kernelSizeIntValues, strideInts,
-                        paddingInts, dilationInts,
-                        b.getZeroAttr(resultElementType), outTensorShape,
-                        paddedInput, sumPool)))
-                  return b.notifyMatchFailure(op, "unable to compute sumpool");
-
-                // Step 5: elementwise division by number of sampling points
-                // to compute avg pool
-                Value outputTensor = b.create<tensor::EmptyOp>(
-                    loc, getAsOpFoldResult(outTensorShape), resultElementType);
-                Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
-                Value avgPool =
-                    b.create<linalg::GenericOp>(
-                         loc, outputTensor.getType(), sumPool, outputTensor,
-                         /*indexingMaps=*/indexingMapsAvg,
-                         /*iteratorTypes=*/iteratorTypesAvg,
-                         [&](OpBuilder &b, Location loc, ValueRange args) {
-                           Value avg;
-                           if (isa<mlir::IntegerType>(resultElementType))
-                             avg = b.create<arith::DivSIOp>(loc, args[0],
-                                                            divisor);
-                           else if (isa<mlir::FloatType>(resultElementType))
-                             avg =
-                                 b.create<arith::DivFOp>(loc, args[0], divisor);
-                           b.create<linalg::YieldOp>(loc, avg);
-                         })
-                        .getResult(0);
-
-                SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
-                SmallVector<OpFoldResult> finalOffsets = {
-                    getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
-                    zeroAttr};
-                SmallVector<OpFoldResult> finalSizes = {
-                    oneAttr, oneAttr, getAsOpFoldResult(pooledH),
-                    getAsOpFoldResult(pooledW)};
-                SmallVector<OpFoldResult> diagStrides(inputRank, oneAttr);
-                finalOutputTensor = b.create<tensor::InsertSliceOp>(
-                    loc, finalOutputTensor, avgPool, finalOffsets, finalSizes,
-                    finalStrides);
-              });
-        });
-
-    Type resultType = typeConverter->convertType(op.getType());
-    b.replaceOp(op, finalOutputTensor);
-    return success();
-  }
-};
-} // namespace
 
-namespace {
 template <typename OpTy>
-class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
-public:
+struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
   using OpConversionPattern<OpTy>::OpConversionPattern;
 
   static SmallVector<Value> coordinateTransform(
@@ -2001,7 +1782,7 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
     // Let Aij := area rect((yProj,xProj) <-> (y_i*,x_j*)),
     // where i* = i+1 mod 2 and x_0 = xLow, x_1 = xHigh etc.
     // We interpolate via the weighted average of pij by weights Aij
-    // the formula is retval = Sum(pij*Aij for i and j in range(2))
+    // the formula is retval = Sum(pij*Aij for i and j in range(2)).
     // Note: we do not need to divide by total rect area == 1
 
     // lengths : Aij == dyi*dxj
@@ -2042,16 +1823,28 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
         cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
     Value rois = op.getRois();
     Value input = op.getInput();
-    unsigned inputRank = cast<RankedTensorType>(input.getType()).getRank();
+    RankedTensorType inputType = dyn_cast_or_null<RankedTensorType>(
+      this->getTypeConverter()->convertType(input.getType()));
+    llvm::dbgs() << "input";
+    if (inputType == nullptr) {
+      op.emitError("Cannot determine input shape");
+    }
+    
+    unsigned inputRank = inputType.getRank();
     Value offset =
         rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(0.0));
-    RankedTensorType resultType = cast<RankedTensorType>(result.getType());
+    RankedTensorType resultType = dyn_cast_or_null<RankedTensorType>(
+          this->getTypeConverter()->convertType(result.getType()));
+    if (resultType == nullptr) {
+      op.emitError("Cannot determine result shape");
+    }
+    llvm::dbgs() << "that\n";
     Type resultElementType = resultType.getElementType();
     if (!op.getAligned()) {
       offset = rewriter.create<arith::ConstantOp>(
           loc, rewriter.getF32FloatAttr(0.5));
     }
-
+    llvm::dbgs() << "1\n";
     Value lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
     Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
     Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
@@ -2065,68 +1858,79 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
           b.create<scf::ForOp>(
               loc, lb, ub1, step, ValueRange{},
               [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
-                // Step 1: Extract bounds for region of interest (roi)
+                llvm::dbgs() << "2\n";
+                // Step 1: Extract bounds for region of interest (roi).
                 OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
                 OpFoldResult oneAttr = b.getI64IntegerAttr(1);
-
+                llvm::dbgs() << "2.1\n";
                 Value cstZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
                 Value cstOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
                 Value cstTwo = rewriter.create<arith::ConstantIndexOp>(loc, 2);
                 Value cstThree =
                     rewriter.create<arith::ConstantIndexOp>(loc, 3);
                 Value cstFour = rewriter.create<arith::ConstantIndexOp>(loc, 4);
+                llvm::dbgs() << "2.2\n";
                 SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
                                                      oneAttr};
+                llvm::dbgs() << "2.21\n";
                 SmallVector<Value> lowYIndices = {iv0, cstOne};
+                llvm::dbgs() << "2.211\n";
+                llvm::dbgs() << rois << "\n";
                 Value lowY =
                     b.create<tensor::ExtractOp>(loc, rois, lowYIndices);
+                // Value lowY = b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.0));
+                llvm::dbgs() << "2.212\n";
                 SmallVector<Value> lowXIndices = {iv0, cstTwo};
+                llvm::dbgs() << "2.213\n";
                 Value lowX =
                     b.create<tensor::ExtractOp>(loc, rois, lowXIndices);
+                llvm::dbgs() << "2.214\n";
                 SmallVector<Value> highYIndices = {iv0, cstThree};
+                llvm::dbgs() << "2.22\n";
                 Value highY =
                     b.create<tensor::ExtractOp>(loc, rois, highYIndices);
                 SmallVector<Value> highXIndices = {iv0, cstFour};
+                llvm::dbgs() << "2.23\n";
                 Value highX =
                     b.create<tensor::ExtractOp>(loc, rois, highXIndices);
-
+                llvm::dbgs() << "2.5\n";
                 lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
                 lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
                 highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
                 highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
-
+                llvm::dbgs() << "3\n";
                 lowY = b.create<arith::SubFOp>(loc, lowY, offset);
                 lowX = b.create<arith::SubFOp>(loc, lowX, offset);
                 highY = b.create<arith::SubFOp>(loc, highY, offset);
                 highX = b.create<arith::SubFOp>(loc, highX, offset);
 
                 // Step 2: Extract region of interest using bounds
-                Value lowY_int = b.create<math::FloorOp>(loc, lowY);
-                Value lowX_int = b.create<math::FloorOp>(loc, lowX);
-                Value highY_int = b.create<math::CeilOp>(loc, highY);
-                Value highX_int = b.create<math::CeilOp>(loc, highX);
-                lowY_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowY_int);
-                lowX_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowX_int);
-                highY_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highY_int);
-                highX_int =
-                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highX_int);
+                Value lowYInt = b.create<math::FloorOp>(loc, lowY);
+                Value lowXInt = b.create<math::FloorOp>(loc, lowX);
+                Value highYInt = b.create<math::CeilOp>(loc, highY);
+                Value highXInt = b.create<math::CeilOp>(loc, highX);
+                lowYInt =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowYInt);
+                lowXInt =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowXInt);
+                highYInt =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highYInt);
+                highXInt =
+                    b.create<arith::FPToSIOp>(loc, b.getI64Type(), highXInt);
 
                 Value roiHeight =
-                    b.create<arith::SubIOp>(loc, highY_int, lowY_int);
+                    b.create<arith::SubIOp>(loc, highYInt, lowYInt);
                 Value roiWidth =
-                    b.create<arith::SubIOp>(loc, highX_int, lowX_int);
+                    b.create<arith::SubIOp>(loc, highXInt, lowXInt);
 
                 SmallVector<OpFoldResult> roiOffsetVals = {
                     getAsOpFoldResult(cstZero), getAsOpFoldResult(iv1),
-                    getAsOpFoldResult(lowY_int), getAsOpFoldResult(lowX_int)};
+                    getAsOpFoldResult(lowYInt), getAsOpFoldResult(lowXInt)};
                 SmallVector<Value> roiSizeVals = {cstOne, cstOne, roiHeight,
                                                   roiWidth};
 
                 Value extractRoi = b.create<tensor::ExtractSliceOp>(
-                    loc, input, ValueRange{cstZero, iv1, lowY_int, lowX_int},
+                    loc, input, ValueRange{cstZero, iv1, lowYInt, lowXInt},
                     ValueRange{cstOne, cstOne, roiHeight, roiWidth},
                     ValueRange{cstOne, cstOne, cstOne, cstOne});
 
@@ -2160,8 +1964,6 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
                       castIntToIndex(b, loc, outputSizeIntValues[i - 2]));
                 }
                 SmallVector<Value> inputSizes;
-                auto inputType = cast<RankedTensorType>(extractRoi.getType());
-                auto inputRank = inputType.getRank();
                 for (unsigned i = 2; i < inputRank; i++) {
                   inputSizes.push_back(b.create<arith::IndexCastOp>(
                       loc, b.getIntegerType(64), roiSizeVals[i]));
@@ -2223,7 +2025,7 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
                            b.create<linalg::YieldOp>(loc, avg);
                          })
                         .getResult(0);
-
+                llvm::dbgs() << "4\n";
                 SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
                 SmallVector<OpFoldResult> finalOffsets = {
                     getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
@@ -2237,6 +2039,7 @@ class ConvertRoiAlignOp : public OpConversionPattern<OpTy> {
                     finalStrides);
               });
         });
+    llvm::dbgs() << "5\n";
     rewriter.replaceOp(op, finalOutputTensor);
     return success();
   }

From ca5df5522f9d5ade566634534911f94f63758d30 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Mon, 10 Mar 2025 19:28:14 -0500
Subject: [PATCH 4/6] Remove debug statements and address comments

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 161 +++++++++++++----------
 1 file changed, 95 insertions(+), 66 deletions(-)

diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index ea9c2e07e5fd..7594b82b5465 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -153,26 +153,30 @@ static LogicalResult createPoolingOp(
     SmallVectorImpl<int64_t> &dilationInts, Attribute initValueAttr,
     SmallVectorImpl<Value> &outTensorShape, Value &paddedInput, Value &result) {
   Location loc = op->getLoc();
+  
+  
   Type elementType = cast<RankedTensorType>(self.getType()).getElementType();
   if (!isa<mlir::FloatType>(elementType) && !supportNonFPInput)
     return op->emitError("unimplemented: non-floating point type");
-
+  
   Value initValue =
       rewriter.create<arith::ConstantOp>(loc, cast<TypedAttr>(initValueAttr));
 
   paddedInput = padInputTensor(op, rewriter, self, ceilMode, dimensionality,
                                strideInts, paddingInts, initValue);
-
+  
   auto outTensorInitialized = computeOutputTensor(
       op, rewriter, self, dimensionality, ceilMode, strideInts, paddingInts,
       dilationInts, kernelSizeIntValues, outTensorShape, initValue);
-
+  
+      
   auto stridesAttr = rewriter.getI64VectorAttr(strideInts);
   auto dilationAttr = rewriter.getI64VectorAttr(dilationInts);
   auto shape = castIntVectorToIndexVector(rewriter, loc, kernelSizeIntValues);
+  
   Value windowTensor = rewriter.create<tensor::EmptyOp>(
       loc, getAsOpFoldResult(shape), elementType);
-
+  
   Value permutedInput = paddedInput, permutedOutput = outTensorInitialized;
   if (dimensionality == 3) {
     // Permute input and output tensor as follows:
@@ -190,7 +194,7 @@ static LogicalResult createPoolingOp(
       return rewriter.notifyMatchFailure(
           op, "failed to perform permutation of tensor");
   }
-
+  
   Value poolingResult =
       rewriter
           .create<OpTy>(loc, permutedOutput.getType(),
@@ -1618,15 +1622,17 @@ class ConvertAtenAdaptivePoolOp : public OpConversionPattern<OpTy> {
   }
 };
 
-template <typename OpTy>
-struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
-  using OpConversionPattern<OpTy>::OpConversionPattern;
+struct ConvertRoiAlignOp final
+    : OpConversionPattern<Torch::TorchvisionRoiAlignOp> {
+  using OpConversionPattern::OpConversionPattern;
 
-  static SmallVector<Value> coordinateTransform(
-      OpBuilder &b, OpTy op, Location loc, SmallVector<Value> outputSizes,
-      Value input, SmallVector<Value> inputSizes,
-      SmallVector<Value> scaleValues, std::string coordStr,
-      bool alignCornersBool, SmallVector<Value> indices, bool clip) {
+  static SmallVector<Value>
+  coordinateTransform(OpBuilder &b, Torch::TorchvisionRoiAlignOp op,
+                      Location loc, SmallVector<Value> outputSizes, Value input,
+                      SmallVector<Value> inputSizes,
+                      SmallVector<Value> scaleValues, std::string coordStr,
+                      bool alignCornersBool, SmallVector<Value> indices,
+                      bool clip) {
 
     unsigned dimOffset = 2;
     auto inputType = cast<RankedTensorType>(input.getType());
@@ -1647,6 +1653,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
           b.create<arith::SIToFPOp>(loc, b.getF32Type(), outputSizes[i]);
       // scale = length_resized/length_original
       Value scale;
+
       if (alignCornersBool) {
         // x_original = x_resized * (length_original - 1) / (length_resized - 1)
         Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
@@ -1695,32 +1702,43 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
         Value offset = b.create<arith::MulFOp>(loc, center, oneMAdjustment);
         preClip = b.create<arith::AddFOp>(loc, offset, preClip);
       }
+
       // for pytorch half pixel , special case for length_resized == 1:
       if (coordStr == "_pytorch_half_pixel") {
+
         Value cmp = b.create<arith::CmpFOp>(loc, arith::CmpFPredicate::UEQ,
                                             outputSizeFP, cstOneFloat);
+
         preClip = b.create<arith::SelectOp>(loc, cmp, zero, preClip);
       }
       if (clip) {
         // preClip is the fp position inside the input image to extract from.
         // clip to [0,inf)
+
         Value max = b.create<arith::MaximumFOp>(loc, preClip, zero);
+
         Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
         // clip to [0,length_original - 1].
         // proj is properly within the input image.
+
         proj.push_back(b.create<arith::MinimumFOp>(loc, max, inputSubOne));
+
       } else {
+
         proj.push_back(preClip);
       }
     }
+
     return proj;
   }
 
-  static Value bilinearInterpolate(OpBuilder &b, OpTy op, Location loc,
-                                   SmallVector<Value> outputSizes, Value input,
-                                   SmallVector<Value> inputSizes,
+  static Value bilinearInterpolate(OpBuilder &b,
+                                   Torch::TorchvisionRoiAlignOp op,
+                                   Location loc, SmallVector<Value> outputSizes,
+                                   Value input, SmallVector<Value> inputSizes,
                                    SmallVector<Value> scaleValues,
                                    std::string coordStr) {
+
     unsigned dimOffset = 2;
     auto inputType = cast<RankedTensorType>(input.getType());
     auto inputRank = inputType.getRank();
@@ -1729,21 +1747,22 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
         b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(1.0));
 
     SmallVector<Value> indices;
-    for (unsigned i = 0; i < inputRank; i++) {
+    for (unsigned i = 0; i < inputRank; ++i) {
       indices.push_back(b.create<linalg::IndexOp>(loc, i));
     }
 
     SmallVector<Value> proj, high, low, highFP, lowFP;
+
     proj = coordinateTransform(b, op, loc, outputSizes, input, inputSizes,
                                scaleValues, coordStr, false, indices, true);
-    for (unsigned i = 0; i < inputRank - dimOffset; i++) {
+    for (unsigned i = 0; i < inputRank - dimOffset; ++i) {
       // length_original
       Value inputFP =
           b.create<arith::SIToFPOp>(loc, b.getF32Type(), inputSizes[i]);
       Value inputSubOne = b.create<arith::SubFOp>(loc, inputFP, cstOneFloat);
 
       // for bilinear interpolation, we look for the nearest indices below and
-      // above proj
+      // above proj.
       lowFP.push_back(b.create<math::FloorOp>(loc, proj[i]));
       Value projPlusOne = b.create<arith::AddFOp>(loc, cstOneFloat, proj[i]);
       highFP.push_back(b.create<math::FloorOp>(loc, projPlusOne));
@@ -1759,6 +1778,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
       Value highExtract =
           b.create<arith::MinimumFOp>(loc, projPlusOne, inputSubOne);
       highExtract = b.create<arith::FPToSIOp>(loc, b.getI64Type(), highExtract);
+
       high.push_back(
           b.create<arith::IndexCastOp>(loc, b.getIndexType(), highExtract));
     }
@@ -1783,7 +1803,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
     // where i* = i+1 mod 2 and x_0 = xLow, x_1 = xHigh etc.
     // We interpolate via the weighted average of pij by weights Aij
     // the formula is retval = Sum(pij*Aij for i and j in range(2)).
-    // Note: we do not need to divide by total rect area == 1
+    // Note: we do not need to divide by total rect area == 1.
 
     // lengths : Aij == dyi*dxj
     Value dy0 = b.create<arith::SubFOp>(loc, highFP[0], proj[0]);
@@ -1797,6 +1817,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
     Value sum = b.create<arith::AddFOp>(loc, dx0p00, dx1p01);
     Value left = b.create<arith::MulFOp>(loc, dy0, sum);
     // right = A10*p10 + A11*p11 = dy1(dx0p10 + dx1p11)
+
     Value dx0p10 = b.create<arith::MulFOp>(loc, dx0, p10);
     Value dx1p11 = b.create<arith::MulFOp>(loc, dx1, p11);
     sum = b.create<arith::AddFOp>(loc, dx0p10, dx1p11);
@@ -1805,7 +1826,8 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
     return b.create<arith::AddFOp>(loc, left, right);
   }
   LogicalResult
-  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+  matchAndRewrite(Torch::TorchvisionRoiAlignOp op,
+                  typename Torch::TorchvisionRoiAlignOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
       return failure();
@@ -1818,33 +1840,33 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
     int64_t samplingRatioInt = static_cast<int64_t>(samplingRatio);
     Value pooledH = op.getPooledHeight();
     Value pooledW = op.getPooledWidth();
-    Value spatialScaleVal = op.getSpatialScale();
+    Value spatialScaleVal = adaptor.getSpatialScale();
     llvm::APFloat spatialScale =
         cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
-    Value rois = op.getRois();
-    Value input = op.getInput();
+    Value rois = adaptor.getRois();
+    Value input = adaptor.getInput();
     RankedTensorType inputType = dyn_cast_or_null<RankedTensorType>(
-      this->getTypeConverter()->convertType(input.getType()));
-    llvm::dbgs() << "input";
+        this->getTypeConverter()->convertType(op.getInput().getType()));
+
     if (inputType == nullptr) {
       op.emitError("Cannot determine input shape");
     }
-    
+
     unsigned inputRank = inputType.getRank();
     Value offset =
         rewriter.create<arith::ConstantOp>(loc, rewriter.getF32FloatAttr(0.0));
     RankedTensorType resultType = dyn_cast_or_null<RankedTensorType>(
-          this->getTypeConverter()->convertType(result.getType()));
+        this->getTypeConverter()->convertType(result.getType()));
     if (resultType == nullptr) {
       op.emitError("Cannot determine result shape");
     }
-    llvm::dbgs() << "that\n";
+
     Type resultElementType = resultType.getElementType();
     if (!op.getAligned()) {
       offset = rewriter.create<arith::ConstantOp>(
           loc, rewriter.getF32FloatAttr(0.5));
     }
-    llvm::dbgs() << "1\n";
+
     Value lb = rewriter.create<arith::ConstantIndexOp>(loc, 0);
     Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
     Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
@@ -1858,47 +1880,43 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
           b.create<scf::ForOp>(
               loc, lb, ub1, step, ValueRange{},
               [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
-                llvm::dbgs() << "2\n";
                 // Step 1: Extract bounds for region of interest (roi).
                 OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
                 OpFoldResult oneAttr = b.getI64IntegerAttr(1);
-                llvm::dbgs() << "2.1\n";
+
                 Value cstZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
                 Value cstOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
                 Value cstTwo = rewriter.create<arith::ConstantIndexOp>(loc, 2);
                 Value cstThree =
                     rewriter.create<arith::ConstantIndexOp>(loc, 3);
                 Value cstFour = rewriter.create<arith::ConstantIndexOp>(loc, 4);
-                llvm::dbgs() << "2.2\n";
+
                 SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
                                                      oneAttr};
-                llvm::dbgs() << "2.21\n";
+
                 SmallVector<Value> lowYIndices = {iv0, cstOne};
-                llvm::dbgs() << "2.211\n";
-                llvm::dbgs() << rois << "\n";
-                Value lowY =
-                    b.create<tensor::ExtractOp>(loc, rois, lowYIndices);
-                // Value lowY = b.create<arith::ConstantOp>(loc, b.getF32FloatAttr(0.0));
-                llvm::dbgs() << "2.212\n";
+                Value lowY = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
+                                                         rois, lowYIndices);
+
                 SmallVector<Value> lowXIndices = {iv0, cstTwo};
-                llvm::dbgs() << "2.213\n";
-                Value lowX =
-                    b.create<tensor::ExtractOp>(loc, rois, lowXIndices);
-                llvm::dbgs() << "2.214\n";
+
+                Value lowX = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
+                                                         rois, lowXIndices);
+
                 SmallVector<Value> highYIndices = {iv0, cstThree};
-                llvm::dbgs() << "2.22\n";
-                Value highY =
-                    b.create<tensor::ExtractOp>(loc, rois, highYIndices);
+
+                Value highY = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
+                                                          rois, highYIndices);
                 SmallVector<Value> highXIndices = {iv0, cstFour};
-                llvm::dbgs() << "2.23\n";
-                Value highX =
-                    b.create<tensor::ExtractOp>(loc, rois, highXIndices);
-                llvm::dbgs() << "2.5\n";
+
+                Value highX = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
+                                                          rois, highXIndices);
+
                 lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
                 lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
                 highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
                 highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
-                llvm::dbgs() << "3\n";
+
                 lowY = b.create<arith::SubFOp>(loc, lowY, offset);
                 lowX = b.create<arith::SubFOp>(loc, lowX, offset);
                 highY = b.create<arith::SubFOp>(loc, highY, offset);
@@ -1934,7 +1952,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                     ValueRange{cstOne, cstOne, roiHeight, roiWidth},
                     ValueRange{cstOne, cstOne, cstOne, cstOne});
 
-                // Step 3: Perform bilinear interpolation over roi
+                // Step 3: Perform bilinear interpolation over roi.
                 Value roiBinH = b.create<arith::SubFOp>(loc, highY, lowY);
                 Value roiBinW = b.create<arith::SubFOp>(loc, highX, lowX);
                 Value scaleH = b.create<arith::DivFOp>(loc, roiBinH, pooledH);
@@ -1943,6 +1961,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                 scaleW = b.create<math::CeilOp>(loc, scaleW);
                 scaleH = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleH);
                 scaleW = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleW);
+
                 if (samplingRatio > 0) {
                   scaleH = b.create<arith::ConstantOp>(
                       loc, rewriter.getI64IntegerAttr(samplingRatio));
@@ -1959,20 +1978,25 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                                                           roiSampleWidth};
                 SmallVector<Value> dims =
                     getTensorSizesUntilDim(b, loc, extractRoi, 1);
-                for (unsigned i = 2; i < inputRank; i++) {
-                  dims.push_back(
-                      castIntToIndex(b, loc, outputSizeIntValues[i - 2]));
+
+                for (unsigned i = 2; i < inputRank; ++i) {
+                  auto dim = b.create<arith::IndexCastOp>(
+                      loc, b.getIndexType(), outputSizeIntValues[i - 2]);
+                  dims.push_back(dim);
                 }
+
                 SmallVector<Value> inputSizes;
-                for (unsigned i = 2; i < inputRank; i++) {
+                for (unsigned i = 2; i < inputRank; ++i) {
                   inputSizes.push_back(b.create<arith::IndexCastOp>(
                       loc, b.getIntegerType(64), roiSizeVals[i]));
                 }
+
                 Value outTensor = b.create<tensor::EmptyOp>(
                     loc, getAsOpFoldResult(dims), inputType.getElementType());
                 AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
                 SmallVector<utils::IteratorType> iteratorTypes(
                     inputRank, utils::IteratorType::parallel);
+
                 Value bilinearInterpolatedRoi =
                     b.create<linalg::GenericOp>(
                          loc, outTensor.getType(), ValueRange{}, outTensor,
@@ -1981,20 +2005,25 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                          [&](OpBuilder &b, Location loc, ValueRange args) {
                            Value retVal = bilinearInterpolate(
                                b, op, loc, outputSizeIntValues, extractRoi,
-                               inputSizes, ValueRange{}, "bilinear");
+                               inputSizes, ValueRange{}, "");
+
                            b.create<linalg::YieldOp>(loc, retVal);
                          })
                         .getResult(0);
 
-                // Step 4: Sum pool over interpolated values
+                // Step 4: Sum pool over interpolated values.
+
                 Value sumPool, paddedInput;
-                SmallVector<Value> kernelSizeIntValues = {cstOne, cstOne,
+                Value oneInt =
+                    b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(1));
+                SmallVector<Value> kernelSizeIntValues = {oneInt, oneInt,
                                                           scaleH, scaleW};
                 SmallVector<int64_t, 2> strideInts = {samplingRatioInt,
                                                       samplingRatioInt};
                 SmallVector<int64_t, 2> paddingInts = {0, 0};
-                SmallVector<int64_t, 2> dilationInts(2, 1);
+                SmallVector<int64_t, 2> dilationInts = {1, 1};
                 SmallVector<Value, 4> outTensorShape;
+
                 if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
                         op, rewriter, bilinearInterpolatedRoi,
                         /*supportNonFPInput=*/true, false,
@@ -2005,10 +2034,11 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                   op.emitError("unable to compute sumpool");
 
                 // Step 5: elementwise division by number of sampling points
-                // to compute avg pool
+                // to compute avg pool.
                 Value outputTensor = b.create<tensor::EmptyOp>(
                     loc, getAsOpFoldResult(outTensorShape), resultElementType);
                 Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
+
                 Value avgPool =
                     b.create<linalg::GenericOp>(
                          loc, outputTensor.getType(), sumPool, outputTensor,
@@ -2025,7 +2055,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                            b.create<linalg::YieldOp>(loc, avg);
                          })
                         .getResult(0);
-                llvm::dbgs() << "4\n";
+
                 SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
                 SmallVector<OpFoldResult> finalOffsets = {
                     getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
@@ -2039,7 +2069,7 @@ struct ConvertRoiAlignOp : final OpConversionPattern<OpTy> {
                     finalStrides);
               });
         });
-    llvm::dbgs() << "5\n";
+
     rewriter.replaceOp(op, finalOutputTensor);
     return success();
   }
@@ -2095,6 +2125,5 @@ void mlir::torch::torch_to_linalg::populatePoolingPatternsAndLegality(
       typeConverter, context);
   patterns.add<ConvertAtenAdaptivePoolOp<AtenAdaptiveMaxPool3dOp>>(
       typeConverter, context);
-  patterns.add<ConvertRoiAlignOp<Torch::TorchvisionRoiAlignOp>>(typeConverter,
-                                                                context);
+  patterns.add<ConvertRoiAlignOp>(typeConverter, context);
 }

From e4a50e5bab6394c923744ce38d00bc06e68e5d37 Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Wed, 12 Mar 2025 01:09:17 -0500
Subject: [PATCH 5/6] WIP (not ready for review, will drop commit later): fix index/float casts in RoiAlign lowering, add debug prints

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 121 ++++++++++++++---------
 1 file changed, 74 insertions(+), 47 deletions(-)

diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index 7594b82b5465..421ee91f9e3f 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -168,7 +168,7 @@ static LogicalResult createPoolingOp(
   auto outTensorInitialized = computeOutputTensor(
       op, rewriter, self, dimensionality, ceilMode, strideInts, paddingInts,
       dilationInts, kernelSizeIntValues, outTensorShape, initValue);
-  
+  llvm::dbgs() << outTensorInitialized << " [][][][][][][][][]\n";
       
   auto stridesAttr = rewriter.getI64VectorAttr(strideInts);
   auto dilationAttr = rewriter.getI64VectorAttr(dilationInts);
@@ -201,6 +201,7 @@ static LogicalResult createPoolingOp(
                         ValueRange{permutedInput, windowTensor}, permutedOutput,
                         stridesAttr, dilationAttr)
           .getResult(0);
+  llvm::dbgs() << poolingResult << "{}{}{}{}{}{}\n";
 
   result = poolingResult;
   if (dimensionality == 3) {
@@ -1671,6 +1672,7 @@ struct ConvertRoiAlignOp final
       // y_resized
       Value outInt = b.create<arith::IndexCastOp>(loc, b.getI64Type(),
                                                   indices[i + dimOffset]);
+llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
       Value outFP = b.create<arith::SIToFPOp>(loc, b.getF32Type(), outInt);
       Value preClip;
       if (coordStr == "_align_corners") {
@@ -1738,7 +1740,7 @@ struct ConvertRoiAlignOp final
                                    Value input, SmallVector<Value> inputSizes,
                                    SmallVector<Value> scaleValues,
                                    std::string coordStr) {
-
+    llvm::dbgs() << "12A\n";
     unsigned dimOffset = 2;
     auto inputType = cast<RankedTensorType>(input.getType());
     auto inputRank = inputType.getRank();
@@ -1750,11 +1752,12 @@ struct ConvertRoiAlignOp final
     for (unsigned i = 0; i < inputRank; ++i) {
       indices.push_back(b.create<linalg::IndexOp>(loc, i));
     }
-
+    llvm::dbgs() << "12A1\n";
     SmallVector<Value> proj, high, low, highFP, lowFP;
 
     proj = coordinateTransform(b, op, loc, outputSizes, input, inputSizes,
                                scaleValues, coordStr, false, indices, true);
+    llvm::dbgs() << "12B\n";
     for (unsigned i = 0; i < inputRank - dimOffset; ++i) {
       // length_original
       Value inputFP =
@@ -1782,7 +1785,7 @@ struct ConvertRoiAlignOp final
       high.push_back(
           b.create<arith::IndexCastOp>(loc, b.getIndexType(), highExtract));
     }
-
+    llvm::dbgs() << "12B1\n";
     indices[dimOffset] = low[0];
     indices[dimOffset + 1] = low[1];
     Value p00 = b.create<tensor::ExtractOp>(loc, input, indices);
@@ -1823,12 +1826,14 @@ struct ConvertRoiAlignOp final
     sum = b.create<arith::AddFOp>(loc, dx0p10, dx1p11);
     Value right = b.create<arith::MulFOp>(loc, dy1, sum);
 
+    llvm::dbgs() << "12C\n";
     return b.create<arith::AddFOp>(loc, left, right);
   }
   LogicalResult
   matchAndRewrite(Torch::TorchvisionRoiAlignOp op,
                   typename Torch::TorchvisionRoiAlignOp::Adaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    
     if (failed(verifyLinalgCompatibleTypes(op, rewriter)))
       return failure();
 
@@ -1838,11 +1843,16 @@ struct ConvertRoiAlignOp final
     uint64_t samplingRatio =
         cast<ConstantIntOp>(op.getSamplingRatio().getDefiningOp()).getValue();
     int64_t samplingRatioInt = static_cast<int64_t>(samplingRatio);
-    Value pooledH = op.getPooledHeight();
-    Value pooledW = op.getPooledWidth();
-    Value spatialScaleVal = adaptor.getSpatialScale();
+    Value pooledH = adaptor.getPooledHeight();
+    Value pooledW = adaptor.getPooledWidth();
+    Value pooledHFp = rewriter.create<arith::SIToFPOp>(loc, rewriter.getF32Type(), pooledH);
+    Value pooledWFp = rewriter.create<arith::SIToFPOp>(loc, rewriter.getF32Type(), pooledW);
+    
+    // Value spatialScaleVal = adaptor.getSpatialScale();
     llvm::APFloat spatialScale =
         cast<ConstantFloatOp>(op.getSpatialScale().getDefiningOp()).getValue();
+    Value spatialScaleVal = rewriter.create<arith::ConstantOp>(
+          loc, rewriter.getF32FloatAttr(spatialScale.convertToDouble()));
     Value rois = adaptor.getRois();
     Value input = adaptor.getInput();
     RankedTensorType inputType = dyn_cast_or_null<RankedTensorType>(
@@ -1871,7 +1881,11 @@ struct ConvertRoiAlignOp final
     Value ub0 = rewriter.create<tensor::DimOp>(loc, rois, 0);
     Value ub1 = rewriter.create<tensor::DimOp>(loc, input, 1);
     Value step = rewriter.create<arith::ConstantIndexOp>(loc, 1);
-    SmallVector<Value> finalOutputShape = {ub0, ub1, pooledH, pooledW};
+    auto pooledHIdx = rewriter.create<arith::IndexCastOp>(
+        loc, rewriter.getIndexType(), pooledH);
+    auto pooledWIdx = rewriter.create<arith::IndexCastOp>(
+        loc, rewriter.getIndexType(), pooledW);
+    SmallVector<Value> finalOutputShape = {ub0, ub1, pooledHIdx, pooledWIdx};
     Value finalOutputTensor = rewriter.create<tensor::EmptyOp>(
         loc, getAsOpFoldResult(finalOutputShape), resultElementType);
     rewriter.create<scf::ForOp>(
@@ -1883,9 +1897,12 @@ struct ConvertRoiAlignOp final
                 // Step 1: Extract bounds for region of interest (roi).
                 OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
                 OpFoldResult oneAttr = b.getI64IntegerAttr(1);
-
-                Value cstZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-                Value cstOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+                Value intOne =
+                    b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(1));
+                // Value intZero =
+                //     b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(0));
+                Value idxZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
+                Value idxOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
                 Value cstTwo = rewriter.create<arith::ConstantIndexOp>(loc, 2);
                 Value cstThree =
                     rewriter.create<arith::ConstantIndexOp>(loc, 3);
@@ -1894,7 +1911,7 @@ struct ConvertRoiAlignOp final
                 SmallVector<OpFoldResult> strideVals{oneAttr, oneAttr, oneAttr,
                                                      oneAttr};
 
-                SmallVector<Value> lowYIndices = {iv0, cstOne};
+                SmallVector<Value> lowYIndices = {iv0, idxOne};
                 Value lowY = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
                                                          rois, lowYIndices);
 
@@ -1911,22 +1928,27 @@ struct ConvertRoiAlignOp final
 
                 Value highX = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
                                                           rois, highXIndices);
-
+                llvm::dbgs() << "7A" << "\n";
+                llvm::dbgs() << " LOL..\n";
                 lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
+                llvm::dbgs() << " LOLA\n";
                 lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
+                llvm::dbgs() << " LOLB\n";
                 highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
+                llvm::dbgs() << " LOLC\n";
                 highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
-
+                llvm::dbgs() << " LOL1\n";
                 lowY = b.create<arith::SubFOp>(loc, lowY, offset);
                 lowX = b.create<arith::SubFOp>(loc, lowX, offset);
                 highY = b.create<arith::SubFOp>(loc, highY, offset);
                 highX = b.create<arith::SubFOp>(loc, highX, offset);
-
+                llvm::dbgs() << " LOL2\n";
                 // Step 2: Extract region of interest using bounds
                 Value lowYInt = b.create<math::FloorOp>(loc, lowY);
                 Value lowXInt = b.create<math::FloorOp>(loc, lowX);
                 Value highYInt = b.create<math::CeilOp>(loc, highY);
                 Value highXInt = b.create<math::CeilOp>(loc, highX);
+                llvm::dbgs() << " LOL3\n";
                 lowYInt =
                     b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowYInt);
                 lowXInt =
@@ -1935,28 +1957,33 @@ struct ConvertRoiAlignOp final
                     b.create<arith::FPToSIOp>(loc, b.getI64Type(), highYInt);
                 highXInt =
                     b.create<arith::FPToSIOp>(loc, b.getI64Type(), highXInt);
-
+                Value lowYIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), lowYInt);
+                Value lowXIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), lowXInt);
+                llvm::dbgs() << " LOL4\n";
+                llvm::dbgs() << lowYIdx << "\n^^^\n\n";
                 Value roiHeight =
                     b.create<arith::SubIOp>(loc, highYInt, lowYInt);
                 Value roiWidth =
                     b.create<arith::SubIOp>(loc, highXInt, lowXInt);
+                Value roiHIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), roiHeight);
+                Value roiWIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), roiWidth);
 
                 SmallVector<OpFoldResult> roiOffsetVals = {
-                    getAsOpFoldResult(cstZero), getAsOpFoldResult(iv1),
+                    getAsOpFoldResult(idxZero), getAsOpFoldResult(iv1),
                     getAsOpFoldResult(lowYInt), getAsOpFoldResult(lowXInt)};
-                SmallVector<Value> roiSizeVals = {cstOne, cstOne, roiHeight,
+                SmallVector<Value> roiSizeVals = {intOne, intOne, roiHeight,
                                                   roiWidth};
 
                 Value extractRoi = b.create<tensor::ExtractSliceOp>(
-                    loc, input, ValueRange{cstZero, iv1, lowYInt, lowXInt},
-                    ValueRange{cstOne, cstOne, roiHeight, roiWidth},
-                    ValueRange{cstOne, cstOne, cstOne, cstOne});
+                    loc, input, ValueRange{idxZero, iv1, lowYIdx, lowXIdx},
+                    ValueRange{idxOne, idxOne, roiHIdx, roiWIdx},
+                    ValueRange{idxOne, idxOne, idxOne, idxOne});
 
                 // Step 3: Perform bilinear interpolation over roi.
                 Value roiBinH = b.create<arith::SubFOp>(loc, highY, lowY);
                 Value roiBinW = b.create<arith::SubFOp>(loc, highX, lowX);
-                Value scaleH = b.create<arith::DivFOp>(loc, roiBinH, pooledH);
-                Value scaleW = b.create<arith::DivFOp>(loc, roiBinW, pooledW);
+                Value scaleH = b.create<arith::DivFOp>(loc, roiBinH, pooledHFp);
+                Value scaleW = b.create<arith::DivFOp>(loc, roiBinW, pooledWFp);
                 scaleH = b.create<math::CeilOp>(loc, scaleH);
                 scaleW = b.create<math::CeilOp>(loc, scaleW);
                 scaleH = b.create<arith::FPToSIOp>(loc, b.getI64Type(), scaleH);
@@ -1987,22 +2014,28 @@ struct ConvertRoiAlignOp final
 
                 SmallVector<Value> inputSizes;
                 for (unsigned i = 2; i < inputRank; ++i) {
-                  inputSizes.push_back(b.create<arith::IndexCastOp>(
-                      loc, b.getIntegerType(64), roiSizeVals[i]));
+                  inputSizes.push_back(roiSizeVals[i]);
                 }
 
                 Value outTensor = b.create<tensor::EmptyOp>(
                     loc, getAsOpFoldResult(dims), inputType.getElementType());
-                AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
-                SmallVector<utils::IteratorType> iteratorTypes(
-                    inputRank, utils::IteratorType::parallel);
-
+                    auto iteratorTypes =
+                SmallVector<utils::IteratorType>(inputRank, utils::IteratorType::parallel);
+                iteratorTypes.append(inputRank, utils::IteratorType::parallel);
+                SmallVector<AffineMap> idMap(2, b.getMultiDimIdentityMap(inputRank));
+                //AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
+                // SmallVector<utils::IteratorType> iteratorTypes(
+                //     inputRank, utils::IteratorType::parallel);
+
+                llvm::dbgs() << "5A" << "\n";
+                llvm::dbgs() << "4.99A" << "\n";
                 Value bilinearInterpolatedRoi =
                     b.create<linalg::GenericOp>(
-                         loc, outTensor.getType(), ValueRange{}, outTensor,
+                         loc, outTensor.getType(), extractRoi, outTensor,
                          /*indexingMaps=*/idMap,
                          /*iteratorTypes=*/iteratorTypes,
                          [&](OpBuilder &b, Location loc, ValueRange args) {
+                           llvm::dbgs() << "4.9A" << "\n";
                            Value retVal = bilinearInterpolate(
                                b, op, loc, outputSizeIntValues, extractRoi,
                                inputSizes, ValueRange{}, "");
@@ -2010,20 +2043,18 @@ struct ConvertRoiAlignOp final
                            b.create<linalg::YieldOp>(loc, retVal);
                          })
                         .getResult(0);
-
+                llvm::dbgs() << "4A" << "\n";
                 // Step 4: Sum pool over interpolated values.
-
                 Value sumPool, paddedInput;
-                Value oneInt =
-                    b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(1));
-                SmallVector<Value> kernelSizeIntValues = {oneInt, oneInt,
+                
+                SmallVector<Value> kernelSizeIntValues = {/*intOne, intOne,*/
                                                           scaleH, scaleW};
                 SmallVector<int64_t, 2> strideInts = {samplingRatioInt,
                                                       samplingRatioInt};
                 SmallVector<int64_t, 2> paddingInts = {0, 0};
                 SmallVector<int64_t, 2> dilationInts = {1, 1};
                 SmallVector<Value, 4> outTensorShape;
-
+                llvm::dbgs() << "3A" << "\n";
                 if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
                         op, rewriter, bilinearInterpolatedRoi,
                         /*supportNonFPInput=*/true, false,
@@ -2038,24 +2069,20 @@ struct ConvertRoiAlignOp final
                 Value outputTensor = b.create<tensor::EmptyOp>(
                     loc, getAsOpFoldResult(outTensorShape), resultElementType);
                 Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
-
+                divisor = rewriter.create<arith::SIToFPOp>(loc, rewriter.getF32Type(), divisor);
+                llvm::dbgs() << "2A" << "\n";
                 Value avgPool =
                     b.create<linalg::GenericOp>(
                          loc, outputTensor.getType(), sumPool, outputTensor,
                          /*indexingMaps=*/idMap,
                          /*iteratorTypes=*/iteratorTypes,
                          [&](OpBuilder &b, Location loc, ValueRange args) {
-                           Value avg;
-                           if (isa<mlir::IntegerType>(resultElementType))
-                             avg = b.create<arith::DivSIOp>(loc, args[0],
-                                                            divisor);
-                           else if (isa<mlir::FloatType>(resultElementType))
-                             avg =
-                                 b.create<arith::DivFOp>(loc, args[0], divisor);
-                           b.create<linalg::YieldOp>(loc, avg);
+                          Value res = b.create<arith::DivFOp>(loc, args[0], divisor);
+                           b.create<linalg::YieldOp>(loc, res);
                          })
                         .getResult(0);
-
+                llvm::dbgs() << avgPool << " <------------------ \n";
+                llvm::dbgs() << "1" << "\n";
                 SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
                 SmallVector<OpFoldResult> finalOffsets = {
                     getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
@@ -2069,7 +2096,7 @@ struct ConvertRoiAlignOp final
                     finalStrides);
               });
         });
-
+    llvm::dbgs() << "0" << "\n";
     rewriter.replaceOp(op, finalOutputTensor);
     return success();
   }

From d035744812b33aa082b0973448d169d14902b39f Mon Sep 17 00:00:00 2001
From: Muzammiluddin Syed <muzasyed@amd.com>
Date: Wed, 12 Mar 2025 13:59:21 -0500
Subject: [PATCH 6/6] WIP (last commit before hiatus): remove debug prints, pass output tensor through scf.for iter_args

---
 lib/Conversion/TorchToLinalg/Pooling.cpp | 58 ++++++------------------
 1 file changed, 13 insertions(+), 45 deletions(-)

diff --git a/lib/Conversion/TorchToLinalg/Pooling.cpp b/lib/Conversion/TorchToLinalg/Pooling.cpp
index 421ee91f9e3f..60fbbf675a28 100644
--- a/lib/Conversion/TorchToLinalg/Pooling.cpp
+++ b/lib/Conversion/TorchToLinalg/Pooling.cpp
@@ -168,7 +168,6 @@ static LogicalResult createPoolingOp(
   auto outTensorInitialized = computeOutputTensor(
       op, rewriter, self, dimensionality, ceilMode, strideInts, paddingInts,
       dilationInts, kernelSizeIntValues, outTensorShape, initValue);
-  llvm::dbgs() << outTensorInitialized << " [][][][][][][][][]\n";
       
   auto stridesAttr = rewriter.getI64VectorAttr(strideInts);
   auto dilationAttr = rewriter.getI64VectorAttr(dilationInts);
@@ -201,7 +200,6 @@ static LogicalResult createPoolingOp(
                         ValueRange{permutedInput, windowTensor}, permutedOutput,
                         stridesAttr, dilationAttr)
           .getResult(0);
-  llvm::dbgs() << poolingResult << "{}{}{}{}{}{}\n";
 
   result = poolingResult;
   if (dimensionality == 3) {
@@ -1672,7 +1670,6 @@ struct ConvertRoiAlignOp final
       // y_resized
       Value outInt = b.create<arith::IndexCastOp>(loc, b.getI64Type(),
                                                   indices[i + dimOffset]);
-llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
       Value outFP = b.create<arith::SIToFPOp>(loc, b.getF32Type(), outInt);
       Value preClip;
       if (coordStr == "_align_corners") {
@@ -1740,7 +1737,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                                    Value input, SmallVector<Value> inputSizes,
                                    SmallVector<Value> scaleValues,
                                    std::string coordStr) {
-    llvm::dbgs() << "12A\n";
     unsigned dimOffset = 2;
     auto inputType = cast<RankedTensorType>(input.getType());
     auto inputRank = inputType.getRank();
@@ -1752,12 +1748,10 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
     for (unsigned i = 0; i < inputRank; ++i) {
       indices.push_back(b.create<linalg::IndexOp>(loc, i));
     }
-    llvm::dbgs() << "12A1\n";
     SmallVector<Value> proj, high, low, highFP, lowFP;
 
     proj = coordinateTransform(b, op, loc, outputSizes, input, inputSizes,
                                scaleValues, coordStr, false, indices, true);
-    llvm::dbgs() << "12B\n";
     for (unsigned i = 0; i < inputRank - dimOffset; ++i) {
       // length_original
       Value inputFP =
@@ -1785,7 +1779,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
       high.push_back(
           b.create<arith::IndexCastOp>(loc, b.getIndexType(), highExtract));
     }
-    llvm::dbgs() << "12B1\n";
     indices[dimOffset] = low[0];
     indices[dimOffset + 1] = low[1];
     Value p00 = b.create<tensor::ExtractOp>(loc, input, indices);
@@ -1826,7 +1819,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
     sum = b.create<arith::AddFOp>(loc, dx0p10, dx1p11);
     Value right = b.create<arith::MulFOp>(loc, dy1, sum);
 
-    llvm::dbgs() << "12C\n";
     return b.create<arith::AddFOp>(loc, left, right);
   }
   LogicalResult
@@ -1888,19 +1880,17 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
     SmallVector<Value> finalOutputShape = {ub0, ub1, pooledHIdx, pooledWIdx};
     Value finalOutputTensor = rewriter.create<tensor::EmptyOp>(
         loc, getAsOpFoldResult(finalOutputShape), resultElementType);
-    rewriter.create<scf::ForOp>(
-        loc, lb, ub0, step, ValueRange{},
-        [&](OpBuilder &b, Location loc, Value iv0, ValueRange args) {
-          b.create<scf::ForOp>(
-              loc, lb, ub1, step, ValueRange{},
+    auto resOut = rewriter.create<scf::ForOp>(
+        loc, lb, ub0, step, ValueRange{finalOutputTensor},
+        [&](OpBuilder &b, Location loc, Value iv0, ValueRange args0) {
+          auto res = b.create<scf::ForOp>(
+              loc, lb, ub1, step, ValueRange{args0[0]},
               [&](OpBuilder &b, Location loc, Value iv1, ValueRange args) {
                 // Step 1: Extract bounds for region of interest (roi).
                 OpFoldResult zeroAttr = b.getI64IntegerAttr(0);
                 OpFoldResult oneAttr = b.getI64IntegerAttr(1);
                 Value intOne =
                     b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(1));
-                // Value intZero =
-                //     b.create<arith::ConstantOp>(loc, b.getI64IntegerAttr(0));
                 Value idxZero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
                 Value idxOne = rewriter.create<arith::ConstantIndexOp>(loc, 1);
                 Value cstTwo = rewriter.create<arith::ConstantIndexOp>(loc, 2);
@@ -1928,27 +1918,20 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 
                 Value highX = b.create<tensor::ExtractOp>(loc, b.getF32Type(),
                                                           rois, highXIndices);
-                llvm::dbgs() << "7A" << "\n";
-                llvm::dbgs() << " LOL..\n";
                 lowY = b.create<arith::MulFOp>(loc, lowY, spatialScaleVal);
-                llvm::dbgs() << " LOLA\n";
                 lowX = b.create<arith::MulFOp>(loc, lowX, spatialScaleVal);
-                llvm::dbgs() << " LOLB\n";
                 highY = b.create<arith::MulFOp>(loc, highY, spatialScaleVal);
-                llvm::dbgs() << " LOLC\n";
                 highX = b.create<arith::MulFOp>(loc, highX, spatialScaleVal);
-                llvm::dbgs() << " LOL1\n";
                 lowY = b.create<arith::SubFOp>(loc, lowY, offset);
                 lowX = b.create<arith::SubFOp>(loc, lowX, offset);
                 highY = b.create<arith::SubFOp>(loc, highY, offset);
                 highX = b.create<arith::SubFOp>(loc, highX, offset);
-                llvm::dbgs() << " LOL2\n";
+
                 // Step 2: Extract region of interest using bounds
                 Value lowYInt = b.create<math::FloorOp>(loc, lowY);
                 Value lowXInt = b.create<math::FloorOp>(loc, lowX);
                 Value highYInt = b.create<math::CeilOp>(loc, highY);
                 Value highXInt = b.create<math::CeilOp>(loc, highX);
-                llvm::dbgs() << " LOL3\n";
                 lowYInt =
                     b.create<arith::FPToSIOp>(loc, b.getI64Type(), lowYInt);
                 lowXInt =
@@ -1959,8 +1942,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                     b.create<arith::FPToSIOp>(loc, b.getI64Type(), highXInt);
                 Value lowYIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), lowYInt);
                 Value lowXIdx = b.create<arith::IndexCastOp>(loc, b.getIndexType(), lowXInt);
-                llvm::dbgs() << " LOL4\n";
-                llvm::dbgs() << lowYIdx << "\n^^^\n\n";
                 Value roiHeight =
                     b.create<arith::SubIOp>(loc, highYInt, lowYInt);
                 Value roiWidth =
@@ -2019,23 +2000,15 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
 
                 Value outTensor = b.create<tensor::EmptyOp>(
                     loc, getAsOpFoldResult(dims), inputType.getElementType());
-                    auto iteratorTypes =
+                auto iteratorTypes =
                 SmallVector<utils::IteratorType>(inputRank, utils::IteratorType::parallel);
-                iteratorTypes.append(inputRank, utils::IteratorType::parallel);
                 SmallVector<AffineMap> idMap(2, b.getMultiDimIdentityMap(inputRank));
-                //AffineMap idMap = b.getMultiDimIdentityMap(inputRank);
-                // SmallVector<utils::IteratorType> iteratorTypes(
-                //     inputRank, utils::IteratorType::parallel);
-
-                llvm::dbgs() << "5A" << "\n";
-                llvm::dbgs() << "4.99A" << "\n";
                 Value bilinearInterpolatedRoi =
                     b.create<linalg::GenericOp>(
                          loc, outTensor.getType(), extractRoi, outTensor,
                          /*indexingMaps=*/idMap,
                          /*iteratorTypes=*/iteratorTypes,
                          [&](OpBuilder &b, Location loc, ValueRange args) {
-                           llvm::dbgs() << "4.9A" << "\n";
                            Value retVal = bilinearInterpolate(
                                b, op, loc, outputSizeIntValues, extractRoi,
                                inputSizes, ValueRange{}, "");
@@ -2043,7 +2016,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                            b.create<linalg::YieldOp>(loc, retVal);
                          })
                         .getResult(0);
-                llvm::dbgs() << "4A" << "\n";
                 // Step 4: Sum pool over interpolated values.
                 Value sumPool, paddedInput;
                 
@@ -2054,7 +2026,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                 SmallVector<int64_t, 2> paddingInts = {0, 0};
                 SmallVector<int64_t, 2> dilationInts = {1, 1};
                 SmallVector<Value, 4> outTensorShape;
-                llvm::dbgs() << "3A" << "\n";
                 if (failed(createPoolingOp<linalg::PoolingNchwSumOp>(
                         op, rewriter, bilinearInterpolatedRoi,
                         /*supportNonFPInput=*/true, false,
@@ -2070,7 +2041,6 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                     loc, getAsOpFoldResult(outTensorShape), resultElementType);
                 Value divisor = b.create<arith::MulIOp>(loc, scaleH, scaleW);
                 divisor = rewriter.create<arith::SIToFPOp>(loc, rewriter.getF32Type(), divisor);
-                llvm::dbgs() << "2A" << "\n";
                 Value avgPool =
                     b.create<linalg::GenericOp>(
                          loc, outputTensor.getType(), sumPool, outputTensor,
@@ -2081,23 +2051,21 @@ llvm::dbgs() << outInt << " HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n";
                            b.create<linalg::YieldOp>(loc, res);
                          })
                         .getResult(0);
-                llvm::dbgs() << avgPool << " <------------------ \n";
-                llvm::dbgs() << "1" << "\n";
                 SmallVector<OpFoldResult> finalStrides(inputRank, oneAttr);
                 SmallVector<OpFoldResult> finalOffsets = {
                     getAsOpFoldResult(iv0), getAsOpFoldResult(iv1), zeroAttr,
                     zeroAttr};
                 SmallVector<OpFoldResult> finalSizes = {
-                    oneAttr, oneAttr, getAsOpFoldResult(pooledH),
-                    getAsOpFoldResult(pooledW)};
+                    idxOne, idxOne, getAsOpFoldResult(pooledHIdx), getAsOpFoldResult(pooledWIdx)};
                 SmallVector<OpFoldResult> diagStrides(inputRank, oneAttr);
-                finalOutputTensor = b.create<tensor::InsertSliceOp>(
-                    loc, finalOutputTensor, avgPool, finalOffsets, finalSizes,
+                auto insert = b.create<tensor::InsertSliceOp>(
+                    loc, avgPool, args[0], finalOffsets, finalSizes,
                     finalStrides);
+                b.create<scf::YieldOp>(loc, insert.getResult());
               });
+              b.create<scf::YieldOp>(loc, res.getResult(0));
         });
-    llvm::dbgs() << "0" << "\n";
-    rewriter.replaceOp(op, finalOutputTensor);
+    rewriter.replaceOpWithNewOp<tensor::CastOp>(op, resultType, resOut.getResult(0));
     return success();
   }
 };