
Commit 5b0479c

Upcast gradually when computing variance
Going all the way to f64 is undesirable, especially for low-precision tensors in bf16 or f8 variants. Upcast only to the next wider type, e.g., bf16->f32 or f8->bf16. This is consistent with what PyTorch appears to do internally.

Signed-off-by: Alex Zinenko <git@ozinenko.com>
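As a minimal standalone sketch of the promotion rule described above (independent of the MLIR rewriter APIs used in the patch; the helper name and the width-to-type mapping strings are illustrative assumptions, not part of the commit):

#include <cstdio>

// Sketch of the gradual-upcast rule from the commit message: promote each
// low-precision float width to the next wider computation type instead of
// jumping straight to f64. The function name and type-name strings are
// illustrative assumptions, not the patch's API.
static const char *nextWiderFloat(unsigned bitwidth) {
  switch (bitwidth) {
  case 8:
    return "bf16"; // f8 variants -> bf16
  case 16:
    return "f32"; // f16/bf16 -> f32
  case 32:
    return "f64"; // f32 -> f64
  default:
    return "f64"; // 64-bit input is left unconverted by the guard in the patch
  }
}

int main() {
  for (unsigned bw : {8u, 16u, 32u, 64u})
    std::printf("%u-bit float input -> compute in %s\n", bw, nextWiderFloat(bw));
  return 0;
}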
1 parent 46c3888 commit 5b0479c

File tree: 1 file changed (+9, -3 lines)


lib/Dialect/Torch/Transforms/DecomposeComplexOps.cpp

Lines changed: 9 additions & 3 deletions
@@ -9322,9 +9322,15 @@ static LogicalResult calculateVariance(OpTy op, PatternRewriter &rewriter,
         op, "support floating-point type input only");
   }
 
-  // Upcasting the input tensor to `F64` dtype for higher precision during the
-  // computation of the result.
-  if (inputTensorTy.getDtype().getIntOrFloatBitWidth() != 64) {
+  // Upcasting the input tensor to a double-bitwidth dtype for higher precision
+  // during the computation of the result.
+  unsigned bitwidth = inputTensorTy.getDtype().getIntOrFloatBitWidth();
+  if (bitwidth != 64) {
+    Type targetTy = rewriter.getF64Type();
+    if (bitwidth == 8)
+      targetTy = rewriter.getBF16Type();
+    else if (bitwidth == 16)
+      targetTy = rewriter.getF32Type();
     self = convertTensorToDtype(rewriter, loc, self, rewriter.getF64Type());
     inputTensorTy = cast<BaseTensorType>(self.getType());
   }
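Note that in the hunk as rendered, targetTy is computed but the conversion call is left as a context line that still passes rewriter.getF64Type(). For the gradual upcast to take effect, the conversion would presumably target targetTy instead; this is a hedged reading of the intent, not a line present in this commit:

  // Assumed intent (not shown as changed in this commit's hunk): convert to
  // the gradually chosen target type rather than unconditionally to f64.
  self = convertTensorToDtype(rewriter, loc, self, targetTy);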
